#!/usr/bin/perl # This expects both the input and output directory to be organized this way: # rfp-NN/BUCKET/TYPE/ # Note that items from input will be moved to output if they are included in group # # output directory ends up with two subdirectories: native/ and numbered/ # native is where things are just moved over at the end # numbered is where the numbered items go use utf8; use strict; use warnings; use autodie qw(:all); use open qw/ :std :encoding(utf-8) /; use Getopt::Long; use File::Spec::Functions; use File::Spec; use File::Path qw(make_path); use Mail::Header; use Email::Address::XS; use File::Copy; use Date::Manip::DM6 qw(ParseDate UnixDate); use Text::CSV; # libtext-csv-perl in Debian my %GROUP_NAMES_BY_DIR = ( confidential => 'CONFIDENTIAL', privilege => 'PRIVILEGE', privileged => 'PRIVILEGE', 'journalist-privilege' => 'PRIVILEGE' ); sub UsageAndExit($) { print STDERR "usage: $0 --inputToplevelDir=/path/to/inputdir --outputToplevelDir=/path/to/outputdir --group=group [ --verbose=N ]\n"; print STDERR "\n $_[0]\n"; exit 2; } my($VERBOSE, $INPUT_TOPLEVEL_DIR, $OUTPUT_TOPLEVEL_DIR, $GROUP) = (0, undef, undef, undef, undef, undef); GetOptions("verbose=i" => \$VERBOSE, "inputToplevelDir=s", \$INPUT_TOPLEVEL_DIR, 'group=s' => \$GROUP, "outputToplevelDir=s", \$OUTPUT_TOPLEVEL_DIR) or UsageAndExit("invalid options"); { no warnings 'uninitialized'; UsageAndExit("\"$INPUT_TOPLEVEL_DIR\" is not a readable directory") unless defined $INPUT_TOPLEVEL_DIR and -r $INPUT_TOPLEVEL_DIR and -d $INPUT_TOPLEVEL_DIR; UsageAndExit("\"$OUTPUT_TOPLEVEL_DIR\" is not a readable directory") unless defined $OUTPUT_TOPLEVEL_DIR and -r $OUTPUT_TOPLEVEL_DIR and -d $OUTPUT_TOPLEVEL_DIR; UsageAndExit("\"$GROUP\" is not a valid group") unless defined $GROUP and defined $GROUP_NAMES_BY_DIR{$GROUP}; } my $upiNumberFile = File::Spec->rel2abs(catfile($OUTPUT_TOPLEVEL_DIR, "upi-number-start.txt")); UsageAndExit("\"$upiNumberFile\" is not a readable file") unless -r $upiNumberFile and -f $upiNumberFile; open(my $upiFH, '<', $upiNumberFile); my $count = 0; my $upiStart = -1; while (my $line = <$upiFH>) { $count++; chomp $line; UsageAndExit("\"$upiNumberFile\" must contain a number only!") unless $line =~ /^\s*(\d+)\s*$/; $upiStart = $1; } close $upiFH; UsageAndExit("Error reading \"$upiNumberFile\"") unless $count == 1 and $upiStart > 0; my $upiCurrentNum = $upiStart; my $csvOutFormat = Text::CSV->new({ binary => 1, always_quote => 1, quote_empty => 1, blank_is_undef => 1}); my $csvLogFile = File::Spec->rel2abs(catfile($OUTPUT_TOPLEVEL_DIR, "${GROUP}-log.csv")); UsageAndExit("\"$csvLogFile\" cannot already exist! Do not attempt to number the same group twice!") if (-e $csvLogFile); my @headerFields = ('UNIQUE PRODUCTION IDENTIFER (UPI) #', 'FILE NAME', 'RFP # TO WHICH FILE IS RESPONSIVE'); if ($GROUP_NAMES_BY_DIR{$GROUP} eq 'PRIVILEGE') { @headerFields = ('UNIQUE PRODUCTION IDENTIFER (UPI) #', 'FROM NAME', 'FROM ADDRESS', 'SUBJECT MATTER', 'SUBMIT DATE', 'TO NAME', 'TO ADDRESS', 'CC NAME', 'CC ADDRESS', 'BCC ADDRESS', 'PRIVILEGE CLAIMED'); } my @CSV_OUTPUT_ROWS; sub ProcessMailDir($$$) { my($rfp, $inputMailDir, $outputDir) = @_; my @msgDirs = (catfile($inputMailDir, 'new'), catfile($inputMailDir, 'cur')); foreach my $dir (@msgDirs) { die "$inputMailDir must be a readble maildir folder but $dir isn't a readable directory: $!" unless (-d $inputMailDir); } foreach my $dir (@msgDirs) { opendir(my $mailDirFH, $dir); while (my $file = readdir $mailDirFH) { next if -d $file; # skip directories my $msgFile = catfile($dir, $file); open(my $msgFH, "<", $msgFile); my $upiFull = sprintf("UPI-SFC-%07d", $upiCurrentNum++); my $header = new Mail::Header($msgFH); my $fields = $header->header_hashref; my %parsed = (FromName => '', ToName => '', FromAddr => "", ToAddr => "", CcName => '', CcAddr => '', 'Subject' => '', Date => ''); foreach my $fieldName (qw/From To Cc Subject Date/) { foreach my $item (@{$fields->{$fieldName}}) { chomp $item; if ($fieldName =~ /From|To|Cc/) { my $addr = Email::Address::XS->parse($item); if ($addr->name ne "") { $parsed{"${fieldName}Name"} .= "; " if $parsed{"${fieldName}Name"} !~ /^\s*$/; $parsed{"${fieldName}Name"} .= $addr->name; } if ($addr->address ne "") { $parsed{"${fieldName}Addr"} .= "; " if $parsed{"${fieldName}Addr"} !~ /^\s*$/; $parsed{"${fieldName}Addr"} .= $addr->address; } } elsif ($fieldName eq 'Date' and $parsed{Date} =~ /^\s*$/) { $parsed{Date} = ParseDate($item); die "$item is in the date field but doesn't parse" unless defined $parsed{Date} and $parsed{Date} !~ /^\s*$/; } else { $parsed{$fieldName} = $item; } } } close $msgFH; $parsed{Subject} =~ s/[\r\n]/ /gm; my $subjectDashes = $parsed{Subject}; $subjectDashes =~ s/[^a-zA-Z0-9]/_/g; $subjectDashes =~ s/_+/_/g; $subjectDashes =~ s/_+$//g; $subjectDashes =~ s/^_+//g; my $fileName = $upiFull . '-' . $GROUP_NAMES_BY_DIR{$GROUP} . '-' . UnixDate($parsed{Date}, '%Y%m%d-%H%M-') . $subjectDashes . '.eml'; die "$fileName has no subject" if not defined $parsed{Subject}; if ($GROUP_NAMES_BY_DIR{$GROUP} eq 'PRIVILEGE') { push(@CSV_OUTPUT_ROWS, [ $upiFull, $parsed{FromName}, $parsed{FromAddr}, $parsed{Subject}, UnixDate("%D", $parsed{Date}), $parsed{ToName}, $parsed{ToAddr}, $parsed{CcName}, $parsed{CcAddr}, "", $GROUP ]); } else { push(@CSV_OUTPUT_ROWS, [ $upiFull, $fileName, uc($rfp) ]); } my $copiedFile = catfile($outputDir, $fileName); copy($msgFile, $copiedFile); system('/usr/bin/unix2dos', '-q', $copiedFile); die "unable to copy $msgFile to $copiedFile" unless -f $copiedFile; } closedir $mailDirFH; } } # Main Loop: opendir(my $topDH, $INPUT_TOPLEVEL_DIR); while (my $rfp = readdir $topDH) { next unless $rfp =~ /^\s*rfp-(\d+)\s*$/; my $inRfpDirName = catfile($INPUT_TOPLEVEL_DIR, $rfp); opendir(my $rfpDH, $inRfpDirName); while (my $bucketName = readdir $rfpDH) { next unless $bucketName eq $GROUP; my $inBucketDirName = catfile($INPUT_TOPLEVEL_DIR, $rfp, $bucketName); opendir(my $bucketDH, $inBucketDirName); while (my $typeName = readdir $bucketDH) { next if $typeName =~ /^\s*\.\.?\s*$/; my $typeDirName = catfile($INPUT_TOPLEVEL_DIR, $rfp, $bucketName, $typeName); die "regular file found where we expected a type in $typeName" unless -d $typeDirName; my $nativeOutputDirOneUp = File::Spec->rel2abs(catfile($OUTPUT_TOPLEVEL_DIR, 'native', $rfp, $bucketName)); my $numberedOutputDir = File::Spec->rel2abs(catfile($OUTPUT_TOPLEVEL_DIR, 'numbered', $rfp, $bucketName, $typeName)); make_path($nativeOutputDirOneUp, { verbose => 1, mode => 0755 }); make_path($numberedOutputDir, { verbose => 1, mode => 0755 }); if ($typeName =~ /email/i) { ProcessMailDir($rfp, $typeDirName, $numberedOutputDir); move($typeDirName, $nativeOutputDirOneUp); } } closedir $bucketDH; } closedir $rfpDH; } closedir $topDH; open my $csvFH, ">:encoding(utf8)", $csvLogFile; $csvOutFormat->say($csvFH, \@headerFields); $csvOutFormat->say($csvFH, $_) for @CSV_OUTPUT_ROWS; close $csvFH; print STDERR "$GROUP ($GROUP_NAMES_BY_DIR{$GROUP}) starts at $upiStart and ends at $upiCurrentNum\n"; open($upiFH, ">", $upiNumberFile); print $upiFH ++$upiCurrentNum, "\n"; close $upiFH; #make_path(, { # verbose => 1, # mode => 0755, #}); ############################################################################### # # Local variables: # compile-command: "perl -c build-label-upi-number.plx" # End: