Better handling of UTF-8 subject lines & shorten file names

The UTF-8 strings, when not decoded from the MIME header in the
'Subject' field, were making very long file names.  Fix both the
UTF-8 strings to actually be decoded (which makes them shorter anyway
when processed here), but also set a 128 character limit on the
amount of the subject line of emails that go into the filenames.
This commit is contained in:
Bradley M. Kuhn 2023-06-03 07:08:53 -07:00
parent 111f569b9c
commit 8fc1f5371f

View file

@ -25,6 +25,7 @@ use Email::Address::XS;
use File::Copy;
use Date::Manip::DM6 qw(ParseDate UnixDate);
use Text::CSV; # libtext-csv-perl in Debian
use Encode qw/encode decode/;
my %GROUP_NAMES_BY_DIR = ( confidential => 'CONFIDENTIAL', privilege => 'PRIVILEGE', privileged => 'PRIVILEGE',
'journalist-privilege' => 'PRIVILEGE' );
@ -121,7 +122,7 @@ sub ProcessMailDir($$$) {
$parsed{Date} = ParseDate($item);
die "$item is in the date field but doesn't parse" unless defined $parsed{Date} and $parsed{Date} !~ /^\s*$/;
} else {
$parsed{$fieldName} = $item;
$parsed{$fieldName} = decode('MIME-Header', $item);
}
}
}
@ -129,6 +130,7 @@ sub ProcessMailDir($$$) {
$parsed{Subject} =~ s/[\r\n]/ /gm;
my $subjectDashes = $parsed{Subject};
$subjectDashes =~ s/[^a-zA-Z0-9]/_/g; $subjectDashes =~ s/_+/_/g; $subjectDashes =~ s/_+$//g; $subjectDashes =~ s/^_+//g;
$subjectDashes = substr($subjectDashes, 0, 127);
my $fileName = $upiFull . '-' . $GROUP_NAMES_BY_DIR{$GROUP} . '-' .
UnixDate($parsed{Date}, '%Y%m%d-%H%M-') . $subjectDashes . '.eml';
die "$fileName has no subject" if not defined $parsed{Subject};