271 lines
12 KiB
Perl
Executable file
271 lines
12 KiB
Perl
Executable file
#!/usr/bin/perl
|
|
|
|
# This expects both the input and output directory to be organized this way:
|
|
# rfp-NN/BUCKET/TYPE/
|
|
# Note that items from input will be moved to output if they are included in group
|
|
#
|
|
# output directory ends up with four subdirectories: produce-native/ and produce-numbered/, priv-native, priv-numbered
|
|
# native is where things are just moved over at the end
|
|
# numbered is where the numbered items go
|
|
|
|
use utf8;
|
|
|
|
use strict;
|
|
use warnings;
|
|
use autodie qw(:all);
|
|
|
|
use open qw/ :std :encoding(utf-8) /;
|
|
|
|
use Getopt::Long;
|
|
use File::Spec::Functions;
|
|
use File::Spec;
|
|
use File::Path qw(make_path);
|
|
use Mail::Header;
|
|
use Email::Address::XS qw(parse_email_groups);
|
|
use File::Copy;
|
|
use Date::Manip::DM6 qw(ParseDate UnixDate);
|
|
use Text::CSV; # libtext-csv-perl in Debian
|
|
use Encode qw/encode decode/;
|
|
|
|
my %GROUP_NAMES_BY_DIR = ( confidential => 'CONFIDENTIAL', privilege => 'PRIVILEGED', privileged => 'PRIVILEGED',
|
|
'journalist-privilege' => 'PRIVILEGED' );
|
|
|
|
sub UsageAndExit($) {
|
|
print STDERR "usage: $0 --inputToplevelDir=/path/to/inputdir --outputToplevelDir=/path/to/outputdir --group=group [ --verbose=N ]\n";
|
|
print STDERR "\n $_[0]\n";
|
|
exit 2;
|
|
}
|
|
|
|
my($VERBOSE, $INPUT_TOPLEVEL_DIR, $OUTPUT_TOPLEVEL_DIR, $GROUP) =
|
|
(0, undef, undef, undef, undef, undef);
|
|
|
|
GetOptions("verbose=i" => \$VERBOSE, "inputToplevelDir=s", \$INPUT_TOPLEVEL_DIR, 'group=s' => \$GROUP,
|
|
"outputToplevelDir=s", \$OUTPUT_TOPLEVEL_DIR) or UsageAndExit("invalid options");
|
|
|
|
{ no warnings 'uninitialized';
|
|
UsageAndExit("\"$INPUT_TOPLEVEL_DIR\" is not a readable directory")
|
|
unless defined $INPUT_TOPLEVEL_DIR and -r $INPUT_TOPLEVEL_DIR and -d $INPUT_TOPLEVEL_DIR;
|
|
|
|
UsageAndExit("\"$OUTPUT_TOPLEVEL_DIR\" is not a readable directory")
|
|
unless defined $OUTPUT_TOPLEVEL_DIR and -r $OUTPUT_TOPLEVEL_DIR and -d $OUTPUT_TOPLEVEL_DIR;
|
|
|
|
UsageAndExit("\"$GROUP\" is not a valid group")
|
|
unless defined $GROUP and defined $GROUP_NAMES_BY_DIR{$GROUP};
|
|
}
|
|
|
|
my $upiNumberFile = File::Spec->rel2abs(catfile($OUTPUT_TOPLEVEL_DIR, "upi-number-start.txt"));
|
|
|
|
UsageAndExit("\"$upiNumberFile\" is not a readable file")
|
|
unless -r $upiNumberFile and -f $upiNumberFile;
|
|
open(my $upiFH, '<', $upiNumberFile);
|
|
my $count = 0;
|
|
my $upiStart = -1;
|
|
while (my $line = <$upiFH>) {
|
|
$count++;
|
|
chomp $line;
|
|
UsageAndExit("\"$upiNumberFile\" must contain a number only!") unless $line =~ /^\s*(\d+)\s*$/;
|
|
$upiStart = $1;
|
|
}
|
|
close $upiFH;
|
|
UsageAndExit("Error reading \"$upiNumberFile\"") unless $count == 1 and $upiStart > 0;
|
|
|
|
my $upiCurrentNum = $upiStart;
|
|
sub NextUPI () { return sprintf("UPI-SFC-%07d", $upiCurrentNum++); }
|
|
|
|
my $csvOutFormat = Text::CSV->new({ binary => 1, always_quote => 1, quote_empty => 1, blank_is_undef => 1});
|
|
my $csvLogFile = File::Spec->rel2abs(catfile($OUTPUT_TOPLEVEL_DIR, "${GROUP}-log.csv"));
|
|
UsageAndExit("\"$csvLogFile\" cannot already exist! Do not attempt to number the same group twice!")
|
|
if (-e $csvLogFile);
|
|
|
|
my @headerFields = ('UNIQUE PRODUCTION IDENTIFER (UPI) #', 'FILE NAME', 'RFP # TO WHICH FILE IS RESPONSIVE',
|
|
'PROTECTIVE ORDER CATEGORY');
|
|
|
|
if ($GROUP_NAMES_BY_DIR{$GROUP} eq 'PRIVILEGED') {
|
|
@headerFields = ('UNIQUE PRODUCTION IDENTIFER (UPI) #', 'FROM NAME', 'FROM ADDRESS',
|
|
'SUBJECT MATTER', 'SUBMIT DATE', 'TO NAME', 'TO ADDRESS', 'CC NAME', 'CC ADDRESS', 'BCC ADDRESS',
|
|
'PRIVILEGE CLAIMED');
|
|
}
|
|
|
|
my @CSV_OUTPUT_ROWS;
|
|
|
|
sub ProcessDocumentDirectory($$$);
|
|
|
|
sub ProcessDocumentDirectory($$$) {
|
|
my($rfp, $inputDocDir, $numberedOutputDir) = @_;
|
|
|
|
opendir(my $inputDH, $inputDocDir);
|
|
while (my $file = readdir $inputDH) {
|
|
next if $file =~ /^\s*\.\.?\s*$/;
|
|
my $fullFilePath = catfile($inputDocDir, $file);
|
|
if (-d $fullFilePath) {
|
|
ProcessDocumentDirectory($rfp, $fullFilePath, catfile($numberedOutputDir, $file));
|
|
} elsif (-f $fullFilePath) {
|
|
my $upiFull = NextUPI();
|
|
unless (-d $numberedOutputDir) {
|
|
make_path($numberedOutputDir, { mode => 0755 }) or die "unable to make directory $numberedOutputDir: $!";
|
|
}
|
|
my($volume, $directories, $bareFileName) = File::Spec->splitpath($fullFilePath);
|
|
die("Something wrong, since file name is empty on $fullFilePath") unless defined $bareFileName and $bareFileName !~ /^\s*$/;
|
|
my $fileName = $upiFull . '-' . $GROUP_NAMES_BY_DIR{$GROUP} . '-' . $bareFileName;
|
|
my $copiedFile = catfile($numberedOutputDir, $fileName);
|
|
copy($fullFilePath, $copiedFile)
|
|
or die "unable to copy($fullFilePath, catfile($numberedOutputDir, $fileName))";
|
|
system('/usr/bin/unix2dos', '-q', $copiedFile) if (-T $copiedFile);
|
|
die "unable to copy $fullFilePath to $copiedFile" unless -f $copiedFile;
|
|
if ($GROUP_NAMES_BY_DIR{$GROUP} eq 'PRIVILEGED') {
|
|
push(@CSV_OUTPUT_ROWS, [ $upiFull, "", "", $fileName, "N/A", "", "", "", "", "", $GROUP ]);
|
|
} else {
|
|
push(@CSV_OUTPUT_ROWS, [ $upiFull, $fileName, uc($rfp), $GROUP_NAMES_BY_DIR{$GROUP} ]);
|
|
}
|
|
|
|
} else {
|
|
die("\"$fullFilePath\" is a strange file type, not handled!");
|
|
}
|
|
}
|
|
closedir $inputDH;
|
|
}
|
|
######################################################################
|
|
sub ProcessMailDir($$$) {
|
|
my($rfp, $inputMailDir, $outputDir) = @_;
|
|
|
|
my @msgDirs = (catfile($inputMailDir, 'new'), catfile($inputMailDir, 'cur'));
|
|
foreach my $dir (@msgDirs) {
|
|
die "$inputMailDir must be a readble maildir folder but $dir isn't a readable directory: $!"
|
|
unless (-d $inputMailDir);
|
|
}
|
|
foreach my $dir (@msgDirs) {
|
|
opendir(my $mailDirFH, $dir);
|
|
while (my $file = readdir $mailDirFH) {
|
|
next if -d $file; # skip directories
|
|
my $msgFile = catfile($dir, $file);
|
|
open(my $msgFH, "<", $msgFile);
|
|
my $upiFull = NextUPI();
|
|
my $header = new Mail::Header($msgFH);
|
|
my $fields = $header->header_hashref;
|
|
my %parsed = (FromName => '', ToName => '', FromAddr => "", ToAddr => "", CCName => '', CCAddr => '', 'Subject' => '',
|
|
Date => '');
|
|
use Data::Dumper;
|
|
foreach my $fieldName (qw/From To CC Cc Subject Date/) {
|
|
foreach my $item (@{$fields->{$fieldName}}) {
|
|
chomp $item;
|
|
if ($fieldName =~ /From|To|CC/i) {
|
|
my @groups = parse_email_groups($item);
|
|
while ( my($groupName, $addrListRef) = each @groups) {
|
|
if (defined $groupName and $groupName !~ /^[01\s*]$/) {
|
|
$parsed{"${fieldName}Name"} .= "; " if $parsed{"${fieldName}Name"} !~ /^\s*$/;
|
|
$parsed{"${fieldName}Name"} .= $groupName;
|
|
}
|
|
if (not ref $addrListRef) {
|
|
if (defined $addrListRef and $addrListRef !~ /^\s*$/) {
|
|
$parsed{"${fieldName}Name"} .= "; " if $parsed{"${fieldName}Name"} !~ /^\s*$/;
|
|
$parsed{"${fieldName}Name"} .= $addrListRef;
|
|
}
|
|
} else {
|
|
foreach my $addr (@$addrListRef) {
|
|
my $name = $addr->name;
|
|
my $address = $addr->address;
|
|
if (defined $name and $name !~ /^\s*$/) {
|
|
$parsed{"${fieldName}Name"} .= "; " if $parsed{"${fieldName}Name"} !~ /^\s*$/;
|
|
$parsed{"${fieldName}Name"} .= $name;
|
|
}
|
|
if (defined $address and $address !~ /^\s*$/) {
|
|
$parsed{"${fieldName}Addr"} .= "; " if $parsed{"${fieldName}Addr"} !~ /^\s*$/;
|
|
$parsed{"${fieldName}Addr"} .= $address;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
} elsif ($fieldName eq 'Date' and $parsed{Date} =~ /^\s*$/) {
|
|
$parsed{Date} = ParseDate($item);
|
|
die "$item is in the date field but doesn't parse" unless defined $parsed{Date} and $parsed{Date} !~ /^\s*$/;
|
|
} else {
|
|
$parsed{$fieldName} = decode('MIME-Header', $item);
|
|
}
|
|
}
|
|
}
|
|
close $msgFH;
|
|
$parsed{Subject} =~ s/[\r\n]/ /gm;
|
|
my $subjectDashes = $parsed{Subject};
|
|
$subjectDashes =~ s/[^a-zA-Z0-9]/_/g; $subjectDashes =~ s/_+/_/g; $subjectDashes =~ s/_+$//g; $subjectDashes =~ s/^_+//g;
|
|
$subjectDashes = substr($subjectDashes, 0, 127);
|
|
my $fileName = $upiFull . '-' . $GROUP_NAMES_BY_DIR{$GROUP} . '-' .
|
|
UnixDate($parsed{Date}, '%Y%m%d-%H%M-') . $subjectDashes . '.eml';
|
|
die "$fileName has no subject" if not defined $parsed{Subject};
|
|
if ($GROUP_NAMES_BY_DIR{$GROUP} eq 'PRIVILEGED') {
|
|
my $dateFormatted = UnixDate($parsed{Date}, "%D");
|
|
$dateFormatted = "N/A" if not defined $dateFormatted or $dateFormatted =~ /^\s*$/;
|
|
push(@CSV_OUTPUT_ROWS, [ $upiFull, $parsed{FromName}, $parsed{FromAddr}, $parsed{Subject},
|
|
$dateFormatted, $parsed{ToName}, $parsed{ToAddr},
|
|
$parsed{CCName}, $parsed{CCAddr}, "", $GROUP ]);
|
|
} else {
|
|
push(@CSV_OUTPUT_ROWS, [ $upiFull, $fileName, uc($rfp), $GROUP_NAMES_BY_DIR{$GROUP} ]);
|
|
}
|
|
my $copiedFile = catfile($outputDir, $fileName);
|
|
copy($msgFile, $copiedFile) or die "unable to copy($msgFile, $copiedFile)";
|
|
system('/usr/bin/unix2dos', '-q', $copiedFile);
|
|
die "unable to copy $msgFile to $copiedFile" unless -f $copiedFile;
|
|
}
|
|
closedir $mailDirFH;
|
|
}
|
|
}
|
|
######################################################################
|
|
# Main Loop:
|
|
|
|
opendir(my $topDH, $INPUT_TOPLEVEL_DIR);
|
|
while (my $rfp = readdir $topDH) {
|
|
next unless $rfp =~ /^\s*rfp-(\d+)\s*$/;
|
|
my $inRfpDirName = catfile($INPUT_TOPLEVEL_DIR, $rfp);
|
|
opendir(my $rfpDH, $inRfpDirName);
|
|
while (my $bucketName = readdir $rfpDH) {
|
|
next unless $bucketName eq $GROUP;
|
|
my $inBucketDirName = catfile($INPUT_TOPLEVEL_DIR, $rfp, $bucketName);
|
|
opendir(my $bucketDH, $inBucketDirName);
|
|
while (my $typeName = readdir $bucketDH) {
|
|
next if $typeName =~ /^\s*\.\.?\s*$/;
|
|
my $typeDirName = catfile($INPUT_TOPLEVEL_DIR, $rfp, $bucketName, $typeName);
|
|
die "regular file found where we expected a type in $typeName" unless -d $typeDirName;
|
|
my($native, $numbered) = ('produce-native', 'produce-numbered');
|
|
if ($GROUP_NAMES_BY_DIR{$GROUP} eq 'PRIVILEGED') {
|
|
($native, $numbered) = ('priv-native', 'priv-numbered');
|
|
}
|
|
my $nativeOutputDirOneUp = File::Spec->rel2abs(catfile($OUTPUT_TOPLEVEL_DIR, $native, $rfp, $bucketName));
|
|
my $numberedOutputDir = File::Spec->rel2abs(catfile($OUTPUT_TOPLEVEL_DIR, $numbered, $rfp, $bucketName, $typeName));
|
|
unless (-d $nativeOutputDirOneUp) {
|
|
make_path($nativeOutputDirOneUp, { mode => 0755 }) or die "unable to create path $nativeOutputDirOneUp: $!";
|
|
}
|
|
unless (-d $numberedOutputDir) {
|
|
make_path($numberedOutputDir, { mode => 0755 }) or die "unable to create path $numberedOutputDir: $!";
|
|
}
|
|
my $destDir = catfile($nativeOutputDirOneUp, $typeName);
|
|
if ($typeName =~ /email/i) {
|
|
ProcessMailDir($rfp, $typeDirName, $numberedOutputDir);
|
|
} else {
|
|
ProcessDocumentDirectory($rfp, $typeDirName, $numberedOutputDir);
|
|
}
|
|
die "cannot move to the directory we want this in" unless -d $nativeOutputDirOneUp;
|
|
rename($typeDirName, $destDir) or die "unable to move $typeDirName to $destDir: $!";
|
|
# move($typeDirName, $nativeOutputDirOneUp) or die "unable to move($typeDirName, $nativeOutputDirOneUp)";
|
|
|
|
# Note: the above doesn't atually rename the directory from one place
|
|
# to another; it moves the file contents into the destination directory. IOW, File::Copy->move() doesn't have POSIX mv
|
|
}
|
|
closedir $bucketDH;
|
|
}
|
|
closedir $rfpDH;
|
|
}
|
|
closedir $topDH;
|
|
|
|
open my $csvFH, ">:encoding(utf8)", $csvLogFile;
|
|
|
|
$csvOutFormat->say($csvFH, \@headerFields);
|
|
$csvOutFormat->say($csvFH, $_) for @CSV_OUTPUT_ROWS; close $csvFH;
|
|
|
|
print STDERR "$GROUP ($GROUP_NAMES_BY_DIR{$GROUP}) starts at $upiStart and ends at $upiCurrentNum\n";
|
|
open($upiFH, ">", $upiNumberFile);
|
|
print $upiFH ++$upiCurrentNum, "\n";
|
|
close $upiFH;
|
|
###############################################################################
|
|
#
|
|
# Local variables:
|
|
# compile-command: "perl -c build-label-upi-number.plx"
|
|
# End:
|