Compare commits

...

10 commits

Author SHA1 Message Date
Bradley M. Kuhn
3f4a63dd3f Non-email production files should be entered in spreadsheet output
This was simply an oversight when I wrote ProcessDocumentDirectory()
2023-06-03 11:13:03 -07:00
Bradley M. Kuhn
db9a80723f Rework address code to properly catch when multiple addresses appear
As previously written, only the first address was found.  The key is
that the Email::Address::XS library was misused previously.  Based on
what is returned by Mail::Header->header_hashref(), it's clear that
we should use Email::Address::XS->parse_email_groups() first to
extract all addresses.
2023-06-03 11:01:14 -07:00
Bradley M. Kuhn
347e0d3113 The Mail::Header library apparently returns CC, never Cc. 2023-06-03 10:14:40 -07:00
Bradley M. Kuhn
cdcf26f8fa Correct UnixDate() call.
Arguments were out of order, first of all, but also we might as well
add a check for whether the result is empty, and put "N/A" there if
nothing is there.
2023-06-03 09:54:37 -07:00
Bradley M. Kuhn
ea04654b7f Convert copied text files to DOS \r\n format relying on Perl's -T
While it may not be ideal here to rely on Perl's -T, since the
`man perlfunc` says that -T uses a heuristic, and maybe the better
option would be to call `/usr/bin/file` and parse its output to see
if such conversion is needed, this solution should be close enough.
2023-06-03 08:40:28 -07:00
Bradley M. Kuhn
3f0716c9f0 Improve move/copy/make_path error handling; switch move() ⇒ rename()
It turns out that File::Copy->move() does *not* follow POSIX mv's
semantics when doing `mv DIRECTORY_1 DIRECTORY_2`.  It's quite clear
that it will do something like `mv DIRECTORY_1/* DIRECTORY_2`.

As such, while I'd prefer not to use the system-dependent rename()
Perl function here, that has the semantics I want.

In the process, error handling for the calls to move(), copy() and
make_path are improved.  I thought autodie was catching these, but
it's not.
2023-06-03 08:40:18 -07:00
Bradley M. Kuhn
9da387af88 Numbering, moving, and copying for non-email files completed. 2023-06-03 07:56:39 -07:00
Bradley M. Kuhn
c2cadeaa6a Create function for generating next UPI number. 2023-06-03 07:55:53 -07:00
Bradley M. Kuhn
bf0fd2ab75 Correct designation output.
"PRIVILEGED" not "PRIVILEGE"

Make sure designation is in CSV file properly.
2023-06-03 07:54:59 -07:00
Bradley M. Kuhn
34b6ba4cb8 Delete stray note. 2023-06-03 07:54:27 -07:00

View file

@ -21,14 +21,14 @@ use File::Spec::Functions;
use File::Spec;
use File::Path qw(make_path);
use Mail::Header;
use Email::Address::XS;
use Email::Address::XS qw(parse_email_groups);
use File::Copy;
use Date::Manip::DM6 qw(ParseDate UnixDate);
use Text::CSV; # libtext-csv-perl in Debian
use Encode qw/encode decode/;
my %GROUP_NAMES_BY_DIR = ( confidential => 'CONFIDENTIAL', privilege => 'PRIVILEGE', privileged => 'PRIVILEGE',
'journalist-privilege' => 'PRIVILEGE' );
my %GROUP_NAMES_BY_DIR = ( confidential => 'CONFIDENTIAL', privilege => 'PRIVILEGED', privileged => 'PRIVILEGED',
'journalist-privilege' => 'PRIVILEGED' );
sub UsageAndExit($) {
print STDERR "usage: $0 --inputToplevelDir=/path/to/inputdir --outputToplevelDir=/path/to/outputdir --group=group [ --verbose=N ]\n";
@ -70,6 +70,7 @@ close $upiFH;
UsageAndExit("Error reading \"$upiNumberFile\"") unless $count == 1 and $upiStart > 0;
my $upiCurrentNum = $upiStart;
# Hand out the next production identifier in the form "UPI-SFC-NNNNNNN"
# (number zero-padded to seven digits), then advance the file-level
# counter $upiCurrentNum so that the following call yields the next number.
sub NextUPI () {
  my $formattedUPI = sprintf("UPI-SFC-%07d", $upiCurrentNum);
  $upiCurrentNum++;
  return $formattedUPI;
}
my $csvOutFormat = Text::CSV->new({ binary => 1, always_quote => 1, quote_empty => 1, blank_is_undef => 1});
my $csvLogFile = File::Spec->rel2abs(catfile($OUTPUT_TOPLEVEL_DIR, "${GROUP}-log.csv"));
@ -79,7 +80,7 @@ UsageAndExit("\"$csvLogFile\" cannot already exist! Do not attempt to number th
my @headerFields = ('UNIQUE PRODUCTION IDENTIFER (UPI) #', 'FILE NAME', 'RFP # TO WHICH FILE IS RESPONSIVE',
'PROTECTIVE ORDER CATEGORY');
if ($GROUP_NAMES_BY_DIR{$GROUP} eq 'PRIVILEGE') {
if ($GROUP_NAMES_BY_DIR{$GROUP} eq 'PRIVILEGED') {
@headerFields = ('UNIQUE PRODUCTION IDENTIFER (UPI) #', 'FROM NAME', 'FROM ADDRESS',
'SUBJECT MATTER', 'SUBMIT DATE', 'TO NAME', 'TO ADDRESS', 'CC NAME', 'CC ADDRESS', 'BCC ADDRESS',
'PRIVILEGE CLAIMED');
@ -99,7 +100,24 @@ sub ProcessDocumentDirectory($$$) {
if (-d $fullFilePath) {
ProcessDocumentDirectory($rfp, $fullFilePath, catfile($numberedOutputDir, $file));
} elsif (-f $fullFilePath) {
print " mv $fullFilePath ", catfile($numberedOutputDir, $file), "\n";
my $upiFull = NextUPI();
unless (-d $numberedOutputDir) {
make_path($numberedOutputDir, { mode => 0755 }) or die "unable to make directory $numberedOutputDir: $!";
}
my($volume, $directories, $bareFileName) = File::Spec->splitpath($fullFilePath);
die("Something wrong, since file name is empty on $fullFilePath") unless defined $bareFileName and $bareFileName !~ /^\s*$/;
my $fileName = $upiFull . '-' . $GROUP_NAMES_BY_DIR{$GROUP} . '-' . $bareFileName;
my $copiedFile = catfile($numberedOutputDir, $fileName);
copy($fullFilePath, $copiedFile)
or die "unable to copy($fullFilePath, catfile($numberedOutputDir, $fileName))";
system('/usr/bin/unix2dos', '-q', $copiedFile) if (-T $copiedFile);
die "unable to copy $fullFilePath to $copiedFile" unless -f $copiedFile;
if ($GROUP_NAMES_BY_DIR{$GROUP} eq 'PRIVILEGED') {
push(@CSV_OUTPUT_ROWS, [ $upiFull, "", "", $fileName, "N/A", "", "", "", "", "", $GROUP ]);
} else {
push(@CSV_OUTPUT_ROWS, [ $upiFull, $fileName, uc($rfp), $GROUP_NAMES_BY_DIR{$GROUP} ]);
}
} else {
die("\"$fullFilePath\" is a strange file type, not handled!");
}
@ -121,23 +139,41 @@ sub ProcessMailDir($$$) {
next if -d $file; # skip directories
my $msgFile = catfile($dir, $file);
open(my $msgFH, "<", $msgFile);
my $upiFull = sprintf("UPI-SFC-%07d", $upiCurrentNum++);
my $upiFull = NextUPI();
my $header = new Mail::Header($msgFH);
my $fields = $header->header_hashref;
my %parsed = (FromName => '', ToName => '', FromAddr => "", ToAddr => "", CcName => '', CcAddr => '', 'Subject' => '',
my %parsed = (FromName => '', ToName => '', FromAddr => "", ToAddr => "", CCName => '', CCAddr => '', 'Subject' => '',
Date => '');
foreach my $fieldName (qw/From To Cc Subject Date/) {
use Data::Dumper;
foreach my $fieldName (qw/From To CC Cc Subject Date/) {
foreach my $item (@{$fields->{$fieldName}}) {
chomp $item;
if ($fieldName =~ /From|To|Cc/) {
my $addr = Email::Address::XS->parse($item);
if ($addr->name ne "") {
$parsed{"${fieldName}Name"} .= "; " if $parsed{"${fieldName}Name"} !~ /^\s*$/;
$parsed{"${fieldName}Name"} .= $addr->name;
}
if ($addr->address ne "") {
$parsed{"${fieldName}Addr"} .= "; " if $parsed{"${fieldName}Addr"} !~ /^\s*$/;
$parsed{"${fieldName}Addr"} .= $addr->address;
if ($fieldName =~ /From|To|CC/i) {
my @groups = parse_email_groups($item);
while ( my($groupName, $addrListRef) = each @groups) {
if (defined $groupName and $groupName !~ /^[01\s*]$/) {
$parsed{"${fieldName}Name"} .= "; " if $parsed{"${fieldName}Name"} !~ /^\s*$/;
$parsed{"${fieldName}Name"} .= $groupName;
}
if (not ref $addrListRef) {
if (defined $addrListRef and $addrListRef !~ /^\s*$/) {
$parsed{"${fieldName}Name"} .= "; " if $parsed{"${fieldName}Name"} !~ /^\s*$/;
$parsed{"${fieldName}Name"} .= $addrListRef;
}
} else {
foreach my $addr (@$addrListRef) {
my $name = $addr->name;
my $address = $addr->address;
if (defined $name and $name !~ /^\s*$/) {
$parsed{"${fieldName}Name"} .= "; " if $parsed{"${fieldName}Name"} !~ /^\s*$/;
$parsed{"${fieldName}Name"} .= $name;
}
if (defined $address and $address !~ /^\s*$/) {
$parsed{"${fieldName}Addr"} .= "; " if $parsed{"${fieldName}Addr"} !~ /^\s*$/;
$parsed{"${fieldName}Addr"} .= $address;
}
}
}
}
} elsif ($fieldName eq 'Date' and $parsed{Date} =~ /^\s*$/) {
$parsed{Date} = ParseDate($item);
@ -155,15 +191,17 @@ sub ProcessMailDir($$$) {
my $fileName = $upiFull . '-' . $GROUP_NAMES_BY_DIR{$GROUP} . '-' .
UnixDate($parsed{Date}, '%Y%m%d-%H%M-') . $subjectDashes . '.eml';
die "$fileName has no subject" if not defined $parsed{Subject};
if ($GROUP_NAMES_BY_DIR{$GROUP} eq 'PRIVILEGE') {
if ($GROUP_NAMES_BY_DIR{$GROUP} eq 'PRIVILEGED') {
my $dateFormatted = UnixDate($parsed{Date}, "%D");
$dateFormatted = "N/A" if not defined $dateFormatted or $dateFormatted =~ /^\s*$/;
push(@CSV_OUTPUT_ROWS, [ $upiFull, $parsed{FromName}, $parsed{FromAddr}, $parsed{Subject},
UnixDate("%D", $parsed{Date}), $parsed{ToName}, $parsed{ToAddr},
$parsed{CcName}, $parsed{CcAddr}, "", $GROUP ]);
$dateFormatted, $parsed{ToName}, $parsed{ToAddr},
$parsed{CCName}, $parsed{CCAddr}, "", $GROUP ]);
} else {
push(@CSV_OUTPUT_ROWS, [ $upiFull, $fileName, uc($rfp), $GROUP ]);
push(@CSV_OUTPUT_ROWS, [ $upiFull, $fileName, uc($rfp), $GROUP_NAMES_BY_DIR{$GROUP} ]);
}
my $copiedFile = catfile($outputDir, $fileName);
copy($msgFile, $copiedFile);
copy($msgFile, $copiedFile) or die "unable to copy($msgFile, $copiedFile)";
system('/usr/bin/unix2dos', '-q', $copiedFile);
die "unable to copy $msgFile to $copiedFile" unless -f $copiedFile;
}
@ -187,19 +225,29 @@ while (my $rfp = readdir $topDH) {
my $typeDirName = catfile($INPUT_TOPLEVEL_DIR, $rfp, $bucketName, $typeName);
die "regular file found where we expected a type in $typeName" unless -d $typeDirName;
my($native, $numbered) = ('produce-native', 'produce-numbered');
if ($GROUP_NAMES_BY_DIR{$GROUP} eq 'PRIVILEGE') {
if ($GROUP_NAMES_BY_DIR{$GROUP} eq 'PRIVILEGED') {
($native, $numbered) = ('priv-native', 'priv-numbered');
}
my $nativeOutputDirOneUp = File::Spec->rel2abs(catfile($OUTPUT_TOPLEVEL_DIR, $native, $rfp, $bucketName));
my $numberedOutputDir = File::Spec->rel2abs(catfile($OUTPUT_TOPLEVEL_DIR, $numbered, $rfp, $bucketName, $typeName));
make_path($nativeOutputDirOneUp, { mode => 0755 });
make_path($numberedOutputDir, { mode => 0755 });
unless (-d $nativeOutputDirOneUp) {
make_path($nativeOutputDirOneUp, { mode => 0755 }) or die "unable to create path $nativeOutputDirOneUp: $!";
}
unless (-d $numberedOutputDir) {
make_path($numberedOutputDir, { mode => 0755 }) or die "unable to create path $numberedOutputDir: $!";
}
my $destDir = catfile($nativeOutputDirOneUp, $typeName);
if ($typeName =~ /email/i) {
ProcessMailDir($rfp, $typeDirName, $numberedOutputDir);
move($typeDirName, $nativeOutputDirOneUp);
} else {
ProcessDocumentDirectory($rfp, $typeDirName, $numberedOutputDir);
}
die "cannot move to the directory we want this in" unless -d $nativeOutputDirOneUp;
rename($typeDirName, $destDir) or die "unable to move $typeDirName to $destDir: $!";
# move($typeDirName, $nativeOutputDirOneUp) or die "unable to move($typeDirName, $nativeOutputDirOneUp)";
# Note: the above doesn't actually rename the directory from one place
# to another; it moves the file contents into the destination directory. IOW, File::Copy->move() doesn't have POSIX mv semantics.
}
closedir $bucketDH;
}
@ -216,11 +264,6 @@ print STDERR "$GROUP ($GROUP_NAMES_BY_DIR{$GROUP}) starts at $upiStart and ends
open($upiFH, ">", $upiNumberFile);
print $upiFH ++$upiCurrentNum, "\n";
close $upiFH;
#make_path(, {
# verbose => 1,
# mode => 0755,
#});
###############################################################################
#
# Local variables: