Better handling of UTF-8 subject lines & shorten file names

The UTF-8 strings, when not decoded from the MIME header in the 'Subject' field, were making very long file names. Fix both the UTF-8 strings to actually be decoded (which makes them shorter anyway when processed here), but also set a 128 character limit on the amount of the subject line of emails that go into the filenames.
2023-06-03 07:08:53 -07:00 · 2023-06-03 07:08:53 -07:00 · 8fc1f5371f
commit 8fc1f5371f
parent 111f569b9c
1 changed files with 3 additions and 1 deletions
--- a/build-label-upi-number.plx
+++ b/build-label-upi-number.plx
@ -25,6 +25,7 @@ use Email::Address::XS;
 use File::Copy;
 use Date::Manip::DM6 qw(ParseDate UnixDate);
 use Text::CSV;  # libtext-csv-perl in Debian
+use Encode qw/encode decode/;

 my %GROUP_NAMES_BY_DIR = ( confidential => 'CONFIDENTIAL', privilege => 'PRIVILEGE', privileged => 'PRIVILEGE',
                           'journalist-privilege' => 'PRIVILEGE' );
@ -121,7 +122,7 @@ sub ProcessMailDir($$$) {
            $parsed{Date} = ParseDate($item);
            die "$item is in the date field but doesn't parse" unless defined $parsed{Date} and $parsed{Date} !~ /^\s*$/;
          } else {
-            $parsed{$fieldName} = $item;
+            $parsed{$fieldName} = decode('MIME-Header', $item);
          }
        }
      }
@ -129,6 +130,7 @@ sub ProcessMailDir($$$) {
      $parsed{Subject} =~ s/[\r\n]/ /gm;
      my $subjectDashes = $parsed{Subject};
      $subjectDashes =~ s/[^a-zA-Z0-9]/_/g; $subjectDashes =~ s/_+/_/g; $subjectDashes =~ s/_+$//g; $subjectDashes =~ s/^_+//g;
+      $subjectDashes = substr($subjectDashes, 0, 127);
      my $fileName = $upiFull . '-' . $GROUP_NAMES_BY_DIR{$GROUP} . '-' .
                     UnixDate($parsed{Date}, '%Y%m%d-%H%M-') . $subjectDashes . '.eml';
      die "$fileName has no subject" if not defined $parsed{Subject};