Compare commits

...

10 commits

Author SHA1 Message Date
Bradley M. Kuhn
3f4a63dd3f Non-email production files should be entered in spreadsheet output
This was simply an oversight when I wrote ProcessDocumentDirectory()
2023-06-03 11:13:03 -07:00
Bradley M. Kuhn
db9a80723f Rework address code to properly catch when multiple addresses appear
As previously written, only the first address was found.  The key is
that the Email::Address::XS library was misused previously.  Based on
what is returned by Mail::Header->header_hashref(), it's clear that
we should use Email::Address::XS->parse_email_groups() first to
extract all addresses.
2023-06-03 11:01:14 -07:00
Bradley M. Kuhn
347e0d3113 The Mail::Header library apparently returns CC, never Cc. 2023-06-03 10:14:40 -07:00
Bradley M. Kuhn
cdcf26f8fa Correct UnixDate() call.
Arguments were out of order, first of all, but also we might as well
add a check for whether the result is empty and put "N/A" there if
nothing is there.
2023-06-03 09:54:37 -07:00
Bradley M. Kuhn
ea04654b7f Convert copied text files to DOS \r\n format relying on Perl's -T
While it may not be ideal here to rely on Perl's -T, since
`man perlfunc` says that -T uses a heuristic, and maybe the better
option would be to call `/usr/bin/file` and parse its output to see
if such conversion is needed, this solution should be close enough.
2023-06-03 08:40:28 -07:00
Bradley M. Kuhn
3f0716c9f0 Improve move/copy/make_path error handling; switch move() ⇒ rename()
It turns out that File::Copy->move() does *not* follow POSIX mv's
semantics when doing `mv DIRECTORY_1 DIRECTORY_2`.  It's quite clear
that it will do something like `mv DIRECTORY_1/* DIRECTORY_2`.

As such, while I'd prefer not to use the system-dependent rename()
Perl function here, that has the semantics I want.

In the process, error handling for the calls to move(), copy() and
make_path is improved.  I thought autodie was catching these, but
it's not.
2023-06-03 08:40:18 -07:00
Bradley M. Kuhn
9da387af88 Numbering, moving, and copying for non-email files completed. 2023-06-03 07:56:39 -07:00
Bradley M. Kuhn
c2cadeaa6a Create function for generating next UPI number. 2023-06-03 07:55:53 -07:00
Bradley M. Kuhn
bf0fd2ab75 Correct designation output.
"PRIVILEGED" not "PRIVILEGE"

Make sure designation is in CSV file properly.
2023-06-03 07:54:59 -07:00
Bradley M. Kuhn
34b6ba4cb8 Delete stray note. 2023-06-03 07:54:27 -07:00

View file

@ -21,14 +21,14 @@ use File::Spec::Functions;
use File::Spec; use File::Spec;
use File::Path qw(make_path); use File::Path qw(make_path);
use Mail::Header; use Mail::Header;
use Email::Address::XS; use Email::Address::XS qw(parse_email_groups);
use File::Copy; use File::Copy;
use Date::Manip::DM6 qw(ParseDate UnixDate); use Date::Manip::DM6 qw(ParseDate UnixDate);
use Text::CSV; # libtext-csv-perl in Debian use Text::CSV; # libtext-csv-perl in Debian
use Encode qw/encode decode/; use Encode qw/encode decode/;
my %GROUP_NAMES_BY_DIR = ( confidential => 'CONFIDENTIAL', privilege => 'PRIVILEGE', privileged => 'PRIVILEGE', my %GROUP_NAMES_BY_DIR = ( confidential => 'CONFIDENTIAL', privilege => 'PRIVILEGED', privileged => 'PRIVILEGED',
'journalist-privilege' => 'PRIVILEGE' ); 'journalist-privilege' => 'PRIVILEGED' );
sub UsageAndExit($) { sub UsageAndExit($) {
print STDERR "usage: $0 --inputToplevelDir=/path/to/inputdir --outputToplevelDir=/path/to/outputdir --group=group [ --verbose=N ]\n"; print STDERR "usage: $0 --inputToplevelDir=/path/to/inputdir --outputToplevelDir=/path/to/outputdir --group=group [ --verbose=N ]\n";
@ -70,6 +70,7 @@ close $upiFH;
UsageAndExit("Error reading \"$upiNumberFile\"") unless $count == 1 and $upiStart > 0; UsageAndExit("Error reading \"$upiNumberFile\"") unless $count == 1 and $upiStart > 0;
my $upiCurrentNum = $upiStart; my $upiCurrentNum = $upiStart;
# Hand out the next production identifier.  The current value of
# $upiCurrentNum is taken (and the counter advanced by one) in a single
# post-increment, then formatted as "UPI-SFC-" followed by a zero-padded,
# seven-digit number.
sub NextUPI () {
    my $serial = $upiCurrentNum++;
    return sprintf("UPI-SFC-%07d", $serial);
}
my $csvOutFormat = Text::CSV->new({ binary => 1, always_quote => 1, quote_empty => 1, blank_is_undef => 1}); my $csvOutFormat = Text::CSV->new({ binary => 1, always_quote => 1, quote_empty => 1, blank_is_undef => 1});
my $csvLogFile = File::Spec->rel2abs(catfile($OUTPUT_TOPLEVEL_DIR, "${GROUP}-log.csv")); my $csvLogFile = File::Spec->rel2abs(catfile($OUTPUT_TOPLEVEL_DIR, "${GROUP}-log.csv"));
@ -79,7 +80,7 @@ UsageAndExit("\"$csvLogFile\" cannot already exist! Do not attempt to number th
my @headerFields = ('UNIQUE PRODUCTION IDENTIFER (UPI) #', 'FILE NAME', 'RFP # TO WHICH FILE IS RESPONSIVE', my @headerFields = ('UNIQUE PRODUCTION IDENTIFER (UPI) #', 'FILE NAME', 'RFP # TO WHICH FILE IS RESPONSIVE',
'PROTECTIVE ORDER CATEGORY'); 'PROTECTIVE ORDER CATEGORY');
if ($GROUP_NAMES_BY_DIR{$GROUP} eq 'PRIVILEGE') { if ($GROUP_NAMES_BY_DIR{$GROUP} eq 'PRIVILEGED') {
@headerFields = ('UNIQUE PRODUCTION IDENTIFER (UPI) #', 'FROM NAME', 'FROM ADDRESS', @headerFields = ('UNIQUE PRODUCTION IDENTIFER (UPI) #', 'FROM NAME', 'FROM ADDRESS',
'SUBJECT MATTER', 'SUBMIT DATE', 'TO NAME', 'TO ADDRESS', 'CC NAME', 'CC ADDRESS', 'BCC ADDRESS', 'SUBJECT MATTER', 'SUBMIT DATE', 'TO NAME', 'TO ADDRESS', 'CC NAME', 'CC ADDRESS', 'BCC ADDRESS',
'PRIVILEGE CLAIMED'); 'PRIVILEGE CLAIMED');
@ -99,7 +100,24 @@ sub ProcessDocumentDirectory($$$) {
if (-d $fullFilePath) { if (-d $fullFilePath) {
ProcessDocumentDirectory($rfp, $fullFilePath, catfile($numberedOutputDir, $file)); ProcessDocumentDirectory($rfp, $fullFilePath, catfile($numberedOutputDir, $file));
} elsif (-f $fullFilePath) { } elsif (-f $fullFilePath) {
print " mv $fullFilePath ", catfile($numberedOutputDir, $file), "\n"; my $upiFull = NextUPI();
unless (-d $numberedOutputDir) {
make_path($numberedOutputDir, { mode => 0755 }) or die "unable to make directory $numberedOutputDir: $!";
}
my($volume, $directories, $bareFileName) = File::Spec->splitpath($fullFilePath);
die("Something wrong, since file name is empty on $fullFilePath") unless defined $bareFileName and $bareFileName !~ /^\s*$/;
my $fileName = $upiFull . '-' . $GROUP_NAMES_BY_DIR{$GROUP} . '-' . $bareFileName;
my $copiedFile = catfile($numberedOutputDir, $fileName);
copy($fullFilePath, $copiedFile)
or die "unable to copy($fullFilePath, catfile($numberedOutputDir, $fileName))";
system('/usr/bin/unix2dos', '-q', $copiedFile) if (-T $copiedFile);
die "unable to copy $fullFilePath to $copiedFile" unless -f $copiedFile;
if ($GROUP_NAMES_BY_DIR{$GROUP} eq 'PRIVILEGED') {
push(@CSV_OUTPUT_ROWS, [ $upiFull, "", "", $fileName, "N/A", "", "", "", "", "", $GROUP ]);
} else {
push(@CSV_OUTPUT_ROWS, [ $upiFull, $fileName, uc($rfp), $GROUP_NAMES_BY_DIR{$GROUP} ]);
}
} else { } else {
die("\"$fullFilePath\" is a strange file type, not handled!"); die("\"$fullFilePath\" is a strange file type, not handled!");
} }
@ -121,23 +139,41 @@ sub ProcessMailDir($$$) {
next if -d $file; # skip directories next if -d $file; # skip directories
my $msgFile = catfile($dir, $file); my $msgFile = catfile($dir, $file);
open(my $msgFH, "<", $msgFile); open(my $msgFH, "<", $msgFile);
my $upiFull = sprintf("UPI-SFC-%07d", $upiCurrentNum++); my $upiFull = NextUPI();
my $header = new Mail::Header($msgFH); my $header = new Mail::Header($msgFH);
my $fields = $header->header_hashref; my $fields = $header->header_hashref;
my %parsed = (FromName => '', ToName => '', FromAddr => "", ToAddr => "", CcName => '', CcAddr => '', 'Subject' => '', my %parsed = (FromName => '', ToName => '', FromAddr => "", ToAddr => "", CCName => '', CCAddr => '', 'Subject' => '',
Date => ''); Date => '');
foreach my $fieldName (qw/From To Cc Subject Date/) { use Data::Dumper;
foreach my $fieldName (qw/From To CC Cc Subject Date/) {
foreach my $item (@{$fields->{$fieldName}}) { foreach my $item (@{$fields->{$fieldName}}) {
chomp $item; chomp $item;
if ($fieldName =~ /From|To|Cc/) { if ($fieldName =~ /From|To|CC/i) {
my $addr = Email::Address::XS->parse($item); my @groups = parse_email_groups($item);
if ($addr->name ne "") { while ( my($groupName, $addrListRef) = each @groups) {
if (defined $groupName and $groupName !~ /^[01\s*]$/) {
$parsed{"${fieldName}Name"} .= "; " if $parsed{"${fieldName}Name"} !~ /^\s*$/; $parsed{"${fieldName}Name"} .= "; " if $parsed{"${fieldName}Name"} !~ /^\s*$/;
$parsed{"${fieldName}Name"} .= $addr->name; $parsed{"${fieldName}Name"} .= $groupName;
} }
if ($addr->address ne "") { if (not ref $addrListRef) {
if (defined $addrListRef and $addrListRef !~ /^\s*$/) {
$parsed{"${fieldName}Name"} .= "; " if $parsed{"${fieldName}Name"} !~ /^\s*$/;
$parsed{"${fieldName}Name"} .= $addrListRef;
}
} else {
foreach my $addr (@$addrListRef) {
my $name = $addr->name;
my $address = $addr->address;
if (defined $name and $name !~ /^\s*$/) {
$parsed{"${fieldName}Name"} .= "; " if $parsed{"${fieldName}Name"} !~ /^\s*$/;
$parsed{"${fieldName}Name"} .= $name;
}
if (defined $address and $address !~ /^\s*$/) {
$parsed{"${fieldName}Addr"} .= "; " if $parsed{"${fieldName}Addr"} !~ /^\s*$/; $parsed{"${fieldName}Addr"} .= "; " if $parsed{"${fieldName}Addr"} !~ /^\s*$/;
$parsed{"${fieldName}Addr"} .= $addr->address; $parsed{"${fieldName}Addr"} .= $address;
}
}
}
} }
} elsif ($fieldName eq 'Date' and $parsed{Date} =~ /^\s*$/) { } elsif ($fieldName eq 'Date' and $parsed{Date} =~ /^\s*$/) {
$parsed{Date} = ParseDate($item); $parsed{Date} = ParseDate($item);
@ -155,15 +191,17 @@ sub ProcessMailDir($$$) {
my $fileName = $upiFull . '-' . $GROUP_NAMES_BY_DIR{$GROUP} . '-' . my $fileName = $upiFull . '-' . $GROUP_NAMES_BY_DIR{$GROUP} . '-' .
UnixDate($parsed{Date}, '%Y%m%d-%H%M-') . $subjectDashes . '.eml'; UnixDate($parsed{Date}, '%Y%m%d-%H%M-') . $subjectDashes . '.eml';
die "$fileName has no subject" if not defined $parsed{Subject}; die "$fileName has no subject" if not defined $parsed{Subject};
if ($GROUP_NAMES_BY_DIR{$GROUP} eq 'PRIVILEGE') { if ($GROUP_NAMES_BY_DIR{$GROUP} eq 'PRIVILEGED') {
my $dateFormatted = UnixDate($parsed{Date}, "%D");
$dateFormatted = "N/A" if not defined $dateFormatted or $dateFormatted =~ /^\s*$/;
push(@CSV_OUTPUT_ROWS, [ $upiFull, $parsed{FromName}, $parsed{FromAddr}, $parsed{Subject}, push(@CSV_OUTPUT_ROWS, [ $upiFull, $parsed{FromName}, $parsed{FromAddr}, $parsed{Subject},
UnixDate("%D", $parsed{Date}), $parsed{ToName}, $parsed{ToAddr}, $dateFormatted, $parsed{ToName}, $parsed{ToAddr},
$parsed{CcName}, $parsed{CcAddr}, "", $GROUP ]); $parsed{CCName}, $parsed{CCAddr}, "", $GROUP ]);
} else { } else {
push(@CSV_OUTPUT_ROWS, [ $upiFull, $fileName, uc($rfp), $GROUP ]); push(@CSV_OUTPUT_ROWS, [ $upiFull, $fileName, uc($rfp), $GROUP_NAMES_BY_DIR{$GROUP} ]);
} }
my $copiedFile = catfile($outputDir, $fileName); my $copiedFile = catfile($outputDir, $fileName);
copy($msgFile, $copiedFile); copy($msgFile, $copiedFile) or die "unable to copy($msgFile, $copiedFile)";
system('/usr/bin/unix2dos', '-q', $copiedFile); system('/usr/bin/unix2dos', '-q', $copiedFile);
die "unable to copy $msgFile to $copiedFile" unless -f $copiedFile; die "unable to copy $msgFile to $copiedFile" unless -f $copiedFile;
} }
@ -187,19 +225,29 @@ while (my $rfp = readdir $topDH) {
my $typeDirName = catfile($INPUT_TOPLEVEL_DIR, $rfp, $bucketName, $typeName); my $typeDirName = catfile($INPUT_TOPLEVEL_DIR, $rfp, $bucketName, $typeName);
die "regular file found where we expected a type in $typeName" unless -d $typeDirName; die "regular file found where we expected a type in $typeName" unless -d $typeDirName;
my($native, $numbered) = ('produce-native', 'produce-numbered'); my($native, $numbered) = ('produce-native', 'produce-numbered');
if ($GROUP_NAMES_BY_DIR{$GROUP} eq 'PRIVILEGE') { if ($GROUP_NAMES_BY_DIR{$GROUP} eq 'PRIVILEGED') {
($native, $numbered) = ('priv-native', 'priv-numbered'); ($native, $numbered) = ('priv-native', 'priv-numbered');
} }
my $nativeOutputDirOneUp = File::Spec->rel2abs(catfile($OUTPUT_TOPLEVEL_DIR, $native, $rfp, $bucketName)); my $nativeOutputDirOneUp = File::Spec->rel2abs(catfile($OUTPUT_TOPLEVEL_DIR, $native, $rfp, $bucketName));
my $numberedOutputDir = File::Spec->rel2abs(catfile($OUTPUT_TOPLEVEL_DIR, $numbered, $rfp, $bucketName, $typeName)); my $numberedOutputDir = File::Spec->rel2abs(catfile($OUTPUT_TOPLEVEL_DIR, $numbered, $rfp, $bucketName, $typeName));
make_path($nativeOutputDirOneUp, { mode => 0755 }); unless (-d $nativeOutputDirOneUp) {
make_path($numberedOutputDir, { mode => 0755 }); make_path($nativeOutputDirOneUp, { mode => 0755 }) or die "unable to create path $nativeOutputDirOneUp: $!";
}
unless (-d $numberedOutputDir) {
make_path($numberedOutputDir, { mode => 0755 }) or die "unable to create path $numberedOutputDir: $!";
}
my $destDir = catfile($nativeOutputDirOneUp, $typeName);
if ($typeName =~ /email/i) { if ($typeName =~ /email/i) {
ProcessMailDir($rfp, $typeDirName, $numberedOutputDir); ProcessMailDir($rfp, $typeDirName, $numberedOutputDir);
move($typeDirName, $nativeOutputDirOneUp);
} else { } else {
ProcessDocumentDirectory($rfp, $typeDirName, $numberedOutputDir); ProcessDocumentDirectory($rfp, $typeDirName, $numberedOutputDir);
} }
die "cannot move to the directory we want this in" unless -d $nativeOutputDirOneUp;
rename($typeDirName, $destDir) or die "unable to move $typeDirName to $destDir: $!";
# move($typeDirName, $nativeOutputDirOneUp) or die "unable to move($typeDirName, $nativeOutputDirOneUp)";
# Note: the above doesn't actually rename the directory from one place
# to another; it moves the file contents into the destination directory. IOW, File::Copy->move() doesn't have POSIX mv semantics.
} }
closedir $bucketDH; closedir $bucketDH;
} }
@ -216,11 +264,6 @@ print STDERR "$GROUP ($GROUP_NAMES_BY_DIR{$GROUP}) starts at $upiStart and ends
open($upiFH, ">", $upiNumberFile); open($upiFH, ">", $upiNumberFile);
print $upiFH ++$upiCurrentNum, "\n"; print $upiFH ++$upiCurrentNum, "\n";
close $upiFH; close $upiFH;
#make_path(, {
# verbose => 1,
# mode => 0755,
#});
############################################################################### ###############################################################################
# #
# Local variables: # Local variables: