gpl-compliance-tools/build-label-upi-number.plx
Bradley M. Kuhn 3f4a63dd3f Non-email production files should be entered in spreadsheet output
This was simply an oversight when I wrote ProcessDocumentDirectory()
2023-06-03 11:13:03 -07:00

271 lines
12 KiB
Perl
Executable file

#!/usr/bin/perl
# This expects both the input and output directory to be organized this way:
# rfp-NN/BUCKET/TYPE/
# Note that items from input will be moved to output if they are included in group
#
# output directory ends up with four subdirectories: produce-native/ and produce-numbered/, priv-native, priv-numbered
# native is where things are just moved over at the end
# numbered is where the numbered items go
use utf8;
use strict;
use warnings;
use autodie qw(:all);
use open qw/ :std :encoding(utf-8) /;
use Getopt::Long;
use File::Spec::Functions;
use File::Spec;
use File::Path qw(make_path);
use Mail::Header;
use Email::Address::XS qw(parse_email_groups);
use File::Copy;
use Date::Manip::DM6 qw(ParseDate UnixDate);
use Text::CSV; # libtext-csv-perl in Debian
use Encode qw/encode decode/;
my %GROUP_NAMES_BY_DIR = ( confidential => 'CONFIDENTIAL', privilege => 'PRIVILEGED', privileged => 'PRIVILEGED',
'journalist-privilege' => 'PRIVILEGED' );
sub UsageAndExit($) {
print STDERR "usage: $0 --inputToplevelDir=/path/to/inputdir --outputToplevelDir=/path/to/outputdir --group=group [ --verbose=N ]\n";
print STDERR "\n $_[0]\n";
exit 2;
}
my($VERBOSE, $INPUT_TOPLEVEL_DIR, $OUTPUT_TOPLEVEL_DIR, $GROUP) =
(0, undef, undef, undef, undef, undef);
GetOptions("verbose=i" => \$VERBOSE, "inputToplevelDir=s", \$INPUT_TOPLEVEL_DIR, 'group=s' => \$GROUP,
"outputToplevelDir=s", \$OUTPUT_TOPLEVEL_DIR) or UsageAndExit("invalid options");
{ no warnings 'uninitialized';
UsageAndExit("\"$INPUT_TOPLEVEL_DIR\" is not a readable directory")
unless defined $INPUT_TOPLEVEL_DIR and -r $INPUT_TOPLEVEL_DIR and -d $INPUT_TOPLEVEL_DIR;
UsageAndExit("\"$OUTPUT_TOPLEVEL_DIR\" is not a readable directory")
unless defined $OUTPUT_TOPLEVEL_DIR and -r $OUTPUT_TOPLEVEL_DIR and -d $OUTPUT_TOPLEVEL_DIR;
UsageAndExit("\"$GROUP\" is not a valid group")
unless defined $GROUP and defined $GROUP_NAMES_BY_DIR{$GROUP};
}
my $upiNumberFile = File::Spec->rel2abs(catfile($OUTPUT_TOPLEVEL_DIR, "upi-number-start.txt"));
UsageAndExit("\"$upiNumberFile\" is not a readable file")
unless -r $upiNumberFile and -f $upiNumberFile;
open(my $upiFH, '<', $upiNumberFile);
my $count = 0;
my $upiStart = -1;
while (my $line = <$upiFH>) {
$count++;
chomp $line;
UsageAndExit("\"$upiNumberFile\" must contain a number only!") unless $line =~ /^\s*(\d+)\s*$/;
$upiStart = $1;
}
close $upiFH;
UsageAndExit("Error reading \"$upiNumberFile\"") unless $count == 1 and $upiStart > 0;
my $upiCurrentNum = $upiStart;
sub NextUPI () { return sprintf("UPI-SFC-%07d", $upiCurrentNum++); }
my $csvOutFormat = Text::CSV->new({ binary => 1, always_quote => 1, quote_empty => 1, blank_is_undef => 1});
my $csvLogFile = File::Spec->rel2abs(catfile($OUTPUT_TOPLEVEL_DIR, "${GROUP}-log.csv"));
UsageAndExit("\"$csvLogFile\" cannot already exist! Do not attempt to number the same group twice!")
if (-e $csvLogFile);
my @headerFields = ('UNIQUE PRODUCTION IDENTIFER (UPI) #', 'FILE NAME', 'RFP # TO WHICH FILE IS RESPONSIVE',
'PROTECTIVE ORDER CATEGORY');
if ($GROUP_NAMES_BY_DIR{$GROUP} eq 'PRIVILEGED') {
@headerFields = ('UNIQUE PRODUCTION IDENTIFER (UPI) #', 'FROM NAME', 'FROM ADDRESS',
'SUBJECT MATTER', 'SUBMIT DATE', 'TO NAME', 'TO ADDRESS', 'CC NAME', 'CC ADDRESS', 'BCC ADDRESS',
'PRIVILEGE CLAIMED');
}
my @CSV_OUTPUT_ROWS;
sub ProcessDocumentDirectory($$$);
sub ProcessDocumentDirectory($$$) {
my($rfp, $inputDocDir, $numberedOutputDir) = @_;
opendir(my $inputDH, $inputDocDir);
while (my $file = readdir $inputDH) {
next if $file =~ /^\s*\.\.?\s*$/;
my $fullFilePath = catfile($inputDocDir, $file);
if (-d $fullFilePath) {
ProcessDocumentDirectory($rfp, $fullFilePath, catfile($numberedOutputDir, $file));
} elsif (-f $fullFilePath) {
my $upiFull = NextUPI();
unless (-d $numberedOutputDir) {
make_path($numberedOutputDir, { mode => 0755 }) or die "unable to make directory $numberedOutputDir: $!";
}
my($volume, $directories, $bareFileName) = File::Spec->splitpath($fullFilePath);
die("Something wrong, since file name is empty on $fullFilePath") unless defined $bareFileName and $bareFileName !~ /^\s*$/;
my $fileName = $upiFull . '-' . $GROUP_NAMES_BY_DIR{$GROUP} . '-' . $bareFileName;
my $copiedFile = catfile($numberedOutputDir, $fileName);
copy($fullFilePath, $copiedFile)
or die "unable to copy($fullFilePath, catfile($numberedOutputDir, $fileName))";
system('/usr/bin/unix2dos', '-q', $copiedFile) if (-T $copiedFile);
die "unable to copy $fullFilePath to $copiedFile" unless -f $copiedFile;
if ($GROUP_NAMES_BY_DIR{$GROUP} eq 'PRIVILEGED') {
push(@CSV_OUTPUT_ROWS, [ $upiFull, "", "", $fileName, "N/A", "", "", "", "", "", $GROUP ]);
} else {
push(@CSV_OUTPUT_ROWS, [ $upiFull, $fileName, uc($rfp), $GROUP_NAMES_BY_DIR{$GROUP} ]);
}
} else {
die("\"$fullFilePath\" is a strange file type, not handled!");
}
}
closedir $inputDH;
}
######################################################################
sub ProcessMailDir($$$) {
my($rfp, $inputMailDir, $outputDir) = @_;
my @msgDirs = (catfile($inputMailDir, 'new'), catfile($inputMailDir, 'cur'));
foreach my $dir (@msgDirs) {
die "$inputMailDir must be a readble maildir folder but $dir isn't a readable directory: $!"
unless (-d $inputMailDir);
}
foreach my $dir (@msgDirs) {
opendir(my $mailDirFH, $dir);
while (my $file = readdir $mailDirFH) {
next if -d $file; # skip directories
my $msgFile = catfile($dir, $file);
open(my $msgFH, "<", $msgFile);
my $upiFull = NextUPI();
my $header = new Mail::Header($msgFH);
my $fields = $header->header_hashref;
my %parsed = (FromName => '', ToName => '', FromAddr => "", ToAddr => "", CCName => '', CCAddr => '', 'Subject' => '',
Date => '');
use Data::Dumper;
foreach my $fieldName (qw/From To CC Cc Subject Date/) {
foreach my $item (@{$fields->{$fieldName}}) {
chomp $item;
if ($fieldName =~ /From|To|CC/i) {
my @groups = parse_email_groups($item);
while ( my($groupName, $addrListRef) = each @groups) {
if (defined $groupName and $groupName !~ /^[01\s*]$/) {
$parsed{"${fieldName}Name"} .= "; " if $parsed{"${fieldName}Name"} !~ /^\s*$/;
$parsed{"${fieldName}Name"} .= $groupName;
}
if (not ref $addrListRef) {
if (defined $addrListRef and $addrListRef !~ /^\s*$/) {
$parsed{"${fieldName}Name"} .= "; " if $parsed{"${fieldName}Name"} !~ /^\s*$/;
$parsed{"${fieldName}Name"} .= $addrListRef;
}
} else {
foreach my $addr (@$addrListRef) {
my $name = $addr->name;
my $address = $addr->address;
if (defined $name and $name !~ /^\s*$/) {
$parsed{"${fieldName}Name"} .= "; " if $parsed{"${fieldName}Name"} !~ /^\s*$/;
$parsed{"${fieldName}Name"} .= $name;
}
if (defined $address and $address !~ /^\s*$/) {
$parsed{"${fieldName}Addr"} .= "; " if $parsed{"${fieldName}Addr"} !~ /^\s*$/;
$parsed{"${fieldName}Addr"} .= $address;
}
}
}
}
} elsif ($fieldName eq 'Date' and $parsed{Date} =~ /^\s*$/) {
$parsed{Date} = ParseDate($item);
die "$item is in the date field but doesn't parse" unless defined $parsed{Date} and $parsed{Date} !~ /^\s*$/;
} else {
$parsed{$fieldName} = decode('MIME-Header', $item);
}
}
}
close $msgFH;
$parsed{Subject} =~ s/[\r\n]/ /gm;
my $subjectDashes = $parsed{Subject};
$subjectDashes =~ s/[^a-zA-Z0-9]/_/g; $subjectDashes =~ s/_+/_/g; $subjectDashes =~ s/_+$//g; $subjectDashes =~ s/^_+//g;
$subjectDashes = substr($subjectDashes, 0, 127);
my $fileName = $upiFull . '-' . $GROUP_NAMES_BY_DIR{$GROUP} . '-' .
UnixDate($parsed{Date}, '%Y%m%d-%H%M-') . $subjectDashes . '.eml';
die "$fileName has no subject" if not defined $parsed{Subject};
if ($GROUP_NAMES_BY_DIR{$GROUP} eq 'PRIVILEGED') {
my $dateFormatted = UnixDate($parsed{Date}, "%D");
$dateFormatted = "N/A" if not defined $dateFormatted or $dateFormatted =~ /^\s*$/;
push(@CSV_OUTPUT_ROWS, [ $upiFull, $parsed{FromName}, $parsed{FromAddr}, $parsed{Subject},
$dateFormatted, $parsed{ToName}, $parsed{ToAddr},
$parsed{CCName}, $parsed{CCAddr}, "", $GROUP ]);
} else {
push(@CSV_OUTPUT_ROWS, [ $upiFull, $fileName, uc($rfp), $GROUP_NAMES_BY_DIR{$GROUP} ]);
}
my $copiedFile = catfile($outputDir, $fileName);
copy($msgFile, $copiedFile) or die "unable to copy($msgFile, $copiedFile)";
system('/usr/bin/unix2dos', '-q', $copiedFile);
die "unable to copy $msgFile to $copiedFile" unless -f $copiedFile;
}
closedir $mailDirFH;
}
}
######################################################################
# Main Loop:
opendir(my $topDH, $INPUT_TOPLEVEL_DIR);
while (my $rfp = readdir $topDH) {
next unless $rfp =~ /^\s*rfp-(\d+)\s*$/;
my $inRfpDirName = catfile($INPUT_TOPLEVEL_DIR, $rfp);
opendir(my $rfpDH, $inRfpDirName);
while (my $bucketName = readdir $rfpDH) {
next unless $bucketName eq $GROUP;
my $inBucketDirName = catfile($INPUT_TOPLEVEL_DIR, $rfp, $bucketName);
opendir(my $bucketDH, $inBucketDirName);
while (my $typeName = readdir $bucketDH) {
next if $typeName =~ /^\s*\.\.?\s*$/;
my $typeDirName = catfile($INPUT_TOPLEVEL_DIR, $rfp, $bucketName, $typeName);
die "regular file found where we expected a type in $typeName" unless -d $typeDirName;
my($native, $numbered) = ('produce-native', 'produce-numbered');
if ($GROUP_NAMES_BY_DIR{$GROUP} eq 'PRIVILEGED') {
($native, $numbered) = ('priv-native', 'priv-numbered');
}
my $nativeOutputDirOneUp = File::Spec->rel2abs(catfile($OUTPUT_TOPLEVEL_DIR, $native, $rfp, $bucketName));
my $numberedOutputDir = File::Spec->rel2abs(catfile($OUTPUT_TOPLEVEL_DIR, $numbered, $rfp, $bucketName, $typeName));
unless (-d $nativeOutputDirOneUp) {
make_path($nativeOutputDirOneUp, { mode => 0755 }) or die "unable to create path $nativeOutputDirOneUp: $!";
}
unless (-d $numberedOutputDir) {
make_path($numberedOutputDir, { mode => 0755 }) or die "unable to create path $numberedOutputDir: $!";
}
my $destDir = catfile($nativeOutputDirOneUp, $typeName);
if ($typeName =~ /email/i) {
ProcessMailDir($rfp, $typeDirName, $numberedOutputDir);
} else {
ProcessDocumentDirectory($rfp, $typeDirName, $numberedOutputDir);
}
die "cannot move to the directory we want this in" unless -d $nativeOutputDirOneUp;
rename($typeDirName, $destDir) or die "unable to move $typeDirName to $destDir: $!";
# move($typeDirName, $nativeOutputDirOneUp) or die "unable to move($typeDirName, $nativeOutputDirOneUp)";
# Note: the above doesn't atually rename the directory from one place
# to another; it moves the file contents into the destination directory. IOW, File::Copy->move() doesn't have POSIX mv
}
closedir $bucketDH;
}
closedir $rfpDH;
}
closedir $topDH;
open my $csvFH, ">:encoding(utf8)", $csvLogFile;
$csvOutFormat->say($csvFH, \@headerFields);
$csvOutFormat->say($csvFH, $_) for @CSV_OUTPUT_ROWS; close $csvFH;
print STDERR "$GROUP ($GROUP_NAMES_BY_DIR{$GROUP}) starts at $upiStart and ends at $upiCurrentNum\n";
open($upiFH, ">", $upiNumberFile);
print $upiFH ++$upiCurrentNum, "\n";
close $upiFH;
###############################################################################
#
# Local variables:
# compile-command: "perl -c build-label-upi-number.plx"
# End: