gpl-compliance-tools/build-label-upi-number.plx

#!/usr/bin/perl

# This expects both the input and output directory to be organized this way:
#  rfp-NN/BUCKET/TYPE/
# Note that items from input will be moved to output if they are included in group
#
# output directory ends up with four subdirectories: produce-native/ and produce-numbered/, priv-native, priv-numbered
#  native is where things are just moved over at the end
#  numbered is where the numbered items go

use utf8;

use strict;
use warnings;
use autodie qw(:all);

use open qw/ :std :encoding(utf-8) /;

use Getopt::Long;
use File::Spec::Functions;
use File::Spec;
use File::Path qw(make_path);
use Mail::Header;
use Email::Address::XS qw(parse_email_groups);
use File::Copy;
use Date::Manip::DM6 qw(ParseDate UnixDate);
use Text::CSV;  # libtext-csv-perl in Debian
use Encode qw/encode decode/;

my %GROUP_NAMES_BY_DIR = ( confidential => 'CONFIDENTIAL', privilege => 'PRIVILEGED', privileged => 'PRIVILEGED',
                           'journalist-privilege' => 'PRIVILEGED' );

sub UsageAndExit($) {
  print STDERR "usage: $0 --inputToplevelDir=/path/to/inputdir --outputToplevelDir=/path/to/outputdir --group=group [  --verbose=N ]\n";
  print STDERR "\n  $_[0]\n";
  exit 2;
}

my($VERBOSE, $INPUT_TOPLEVEL_DIR, $OUTPUT_TOPLEVEL_DIR, $GROUP) =
  (0, undef, undef, undef, undef, undef);

GetOptions("verbose=i" => \$VERBOSE, "inputToplevelDir=s", \$INPUT_TOPLEVEL_DIR, 'group=s' => \$GROUP,
           "outputToplevelDir=s", \$OUTPUT_TOPLEVEL_DIR) or UsageAndExit("invalid options");

{ no warnings 'uninitialized';
  UsageAndExit("\"$INPUT_TOPLEVEL_DIR\" is not a readable directory")
    unless defined $INPUT_TOPLEVEL_DIR and -r $INPUT_TOPLEVEL_DIR and -d $INPUT_TOPLEVEL_DIR;

  UsageAndExit("\"$OUTPUT_TOPLEVEL_DIR\" is not a readable directory")
    unless defined $OUTPUT_TOPLEVEL_DIR and -r $OUTPUT_TOPLEVEL_DIR and -d $OUTPUT_TOPLEVEL_DIR;

  UsageAndExit("\"$GROUP\" is not a valid group")
    unless defined $GROUP and defined $GROUP_NAMES_BY_DIR{$GROUP};
}

my $upiNumberFile = File::Spec->rel2abs(catfile($OUTPUT_TOPLEVEL_DIR, "upi-number-start.txt"));

UsageAndExit("\"$upiNumberFile\" is not a readable file")
  unless -r $upiNumberFile and -f $upiNumberFile;
open(my $upiFH, '<', $upiNumberFile);
my $count = 0;
my $upiStart = -1;
while (my $line = <$upiFH>) {
  $count++;
  chomp $line;
  UsageAndExit("\"$upiNumberFile\" must contain a number only!") unless $line =~ /^\s*(\d+)\s*$/;
  $upiStart = $1;
}
close $upiFH;
UsageAndExit("Error reading \"$upiNumberFile\"") unless $count == 1 and $upiStart > 0;

my $upiCurrentNum = $upiStart;
sub NextUPI () { return sprintf("UPI-SFC-%07d", $upiCurrentNum++); }

my $csvOutFormat = Text::CSV->new({ binary => 1, always_quote => 1, quote_empty => 1, blank_is_undef => 1});
my $csvLogFile = File::Spec->rel2abs(catfile($OUTPUT_TOPLEVEL_DIR, "${GROUP}-log.csv"));
UsageAndExit("\"$csvLogFile\" cannot already exist!  Do not attempt to number the same group twice!")
  if (-e $csvLogFile);

my @headerFields = ('UNIQUE PRODUCTION IDENTIFER (UPI) #', 'FILE NAME', 'RFP # TO WHICH FILE IS RESPONSIVE',
                    'PROTECTIVE ORDER CATEGORY');

if ($GROUP_NAMES_BY_DIR{$GROUP} eq 'PRIVILEGED') {
  @headerFields = ('UNIQUE PRODUCTION IDENTIFER (UPI) #', 'FROM NAME', 'FROM ADDRESS',
                   'SUBJECT MATTER', 'SUBMIT DATE', 'TO NAME', 'TO ADDRESS', 'CC NAME', 'CC ADDRESS', 'BCC ADDRESS',
                   'PRIVILEGE CLAIMED');
}

my @CSV_OUTPUT_ROWS;

sub ProcessDocumentDirectory($$$);

sub ProcessDocumentDirectory($$$) {
  my($rfp, $inputDocDir, $numberedOutputDir) = @_;

  opendir(my $inputDH, $inputDocDir);
  while (my $file = readdir $inputDH) {
    next if $file =~ /^\s*\.\.?\s*$/;
    my $fullFilePath = catfile($inputDocDir, $file);
    if (-d $fullFilePath) {
      ProcessDocumentDirectory($rfp, $fullFilePath, catfile($numberedOutputDir, $file));
    } elsif (-f $fullFilePath) {
      my $upiFull = NextUPI();
      unless (-d $numberedOutputDir) {
        make_path($numberedOutputDir, { mode => 0755 }) or die "unable to make directory $numberedOutputDir: $!";
      }
      my($volume, $directories, $bareFileName) = File::Spec->splitpath($fullFilePath);
      die("Something wrong, since file name is empty on $fullFilePath") unless defined $bareFileName and $bareFileName !~ /^\s*$/;
      my $fileName = $upiFull . '-' . $GROUP_NAMES_BY_DIR{$GROUP} . '-' . $bareFileName;
      my $copiedFile = catfile($numberedOutputDir, $fileName);
      copy($fullFilePath, $copiedFile)
        or die "unable to copy($fullFilePath, catfile($numberedOutputDir, $fileName))";
      system('/usr/bin/unix2dos', '-q', $copiedFile) if (-T $copiedFile);
      die "unable to copy $fullFilePath to $copiedFile" unless -f $copiedFile;
      if ($GROUP_NAMES_BY_DIR{$GROUP} eq 'PRIVILEGED') {
        push(@CSV_OUTPUT_ROWS, [ $upiFull, "", "", $fileName, "N/A", "", "", "", "", "", $GROUP ]);
      } else {
        push(@CSV_OUTPUT_ROWS, [ $upiFull, $fileName, uc($rfp), $GROUP_NAMES_BY_DIR{$GROUP} ]);
      }

    } else {
      die("\"$fullFilePath\" is a strange file type, not handled!");
    }
  }
  closedir $inputDH;
}
######################################################################
sub ProcessMailDir($$$) {
  my($rfp, $inputMailDir, $outputDir) = @_;

  my @msgDirs = (catfile($inputMailDir, 'new'), catfile($inputMailDir, 'cur'));
  foreach my $dir (@msgDirs) {
    die "$inputMailDir must be a readble maildir folder but $dir isn't a readable directory: $!"
      unless  (-d $inputMailDir);
  }
  foreach my $dir (@msgDirs) {
    opendir(my $mailDirFH, $dir);
    while (my $file = readdir $mailDirFH) {
      next if -d $file;    # skip directories
      my $msgFile = catfile($dir, $file);
      open(my $msgFH, "<", $msgFile);
      my $upiFull = NextUPI();
      my $header = new Mail::Header($msgFH);
      my $fields = $header->header_hashref;
      my %parsed = (FromName => '', ToName => '', FromAddr => "", ToAddr => "", CCName => '', CCAddr => '', 'Subject' => '',
                    Date => '');
      use Data::Dumper;
      foreach my $fieldName (qw/From To CC Cc Subject Date/) {
        foreach my $item (@{$fields->{$fieldName}}) {
          chomp $item;
          if ($fieldName =~ /From|To|CC/i) {
            my @groups = parse_email_groups($item);
            while ( my($groupName, $addrListRef) = each @groups) {
              if (defined $groupName and $groupName !~ /^[01\s*]$/) {
                $parsed{"${fieldName}Name"} .= "; " if $parsed{"${fieldName}Name"} !~ /^\s*$/;
                $parsed{"${fieldName}Name"} .= $groupName;
              }
              if (not ref $addrListRef) {
                if (defined $addrListRef and $addrListRef !~ /^\s*$/) {
                  $parsed{"${fieldName}Name"} .= "; " if $parsed{"${fieldName}Name"} !~ /^\s*$/;
                  $parsed{"${fieldName}Name"} .= $addrListRef;
                }
              } else {
                foreach my $addr (@$addrListRef) {
                  my $name = $addr->name;
                  my $address = $addr->address;
                  if (defined $name and $name !~ /^\s*$/) {
                    $parsed{"${fieldName}Name"} .= "; " if $parsed{"${fieldName}Name"} !~ /^\s*$/;
                    $parsed{"${fieldName}Name"} .= $name;
                  }
                  if (defined $address and $address !~ /^\s*$/) {
                    $parsed{"${fieldName}Addr"} .= "; " if $parsed{"${fieldName}Addr"} !~ /^\s*$/;
                    $parsed{"${fieldName}Addr"} .= $address;
                  }
                }
              }
            }
          } elsif ($fieldName eq 'Date' and $parsed{Date} =~ /^\s*$/) {
            $parsed{Date} = ParseDate($item);
            die "$item is in the date field but doesn't parse" unless defined $parsed{Date} and $parsed{Date} !~ /^\s*$/;
          } else {
            $parsed{$fieldName} = decode('MIME-Header', $item);
          }
        }
      }
      close $msgFH;
      $parsed{Subject} =~ s/[\r\n]/ /gm;
      my $subjectDashes = $parsed{Subject};
      $subjectDashes =~ s/[^a-zA-Z0-9]/_/g; $subjectDashes =~ s/_+/_/g; $subjectDashes =~ s/_+$//g; $subjectDashes =~ s/^_+//g;
      $subjectDashes = substr($subjectDashes, 0, 127);
      my $fileName = $upiFull . '-' . $GROUP_NAMES_BY_DIR{$GROUP} . '-' .
                     UnixDate($parsed{Date}, '%Y%m%d-%H%M-') . $subjectDashes . '.eml';
      die "$fileName has no subject" if not defined $parsed{Subject};
      if ($GROUP_NAMES_BY_DIR{$GROUP} eq 'PRIVILEGED') {
        my $dateFormatted = UnixDate($parsed{Date}, "%D");
        $dateFormatted = "N/A" if not defined $dateFormatted or $dateFormatted =~ /^\s*$/;
        push(@CSV_OUTPUT_ROWS, [ $upiFull, $parsed{FromName}, $parsed{FromAddr}, $parsed{Subject},
                                 $dateFormatted, $parsed{ToName}, $parsed{ToAddr},
                                 $parsed{CCName}, $parsed{CCAddr}, "", $GROUP ]);
      } else {
        push(@CSV_OUTPUT_ROWS, [ $upiFull, $fileName, uc($rfp), $GROUP_NAMES_BY_DIR{$GROUP} ]);
      }
      my $copiedFile = catfile($outputDir, $fileName);
      copy($msgFile, $copiedFile) or die "unable to copy($msgFile, $copiedFile)";
      system('/usr/bin/unix2dos', '-q', $copiedFile);
      die "unable to copy $msgFile to $copiedFile" unless -f $copiedFile;
    }
    closedir $mailDirFH;
  }
}
######################################################################
# Main Loop:

opendir(my $topDH, $INPUT_TOPLEVEL_DIR);
while (my $rfp = readdir $topDH) {
  next unless $rfp =~ /^\s*rfp-(\d+)\s*$/;
  my $inRfpDirName = catfile($INPUT_TOPLEVEL_DIR, $rfp);
  opendir(my $rfpDH, $inRfpDirName);
  while (my $bucketName = readdir $rfpDH) {
    next unless $bucketName eq $GROUP;
    my $inBucketDirName = catfile($INPUT_TOPLEVEL_DIR, $rfp, $bucketName);
    opendir(my $bucketDH, $inBucketDirName);
    while (my $typeName = readdir $bucketDH) {
      next if $typeName =~ /^\s*\.\.?\s*$/;
      my $typeDirName = catfile($INPUT_TOPLEVEL_DIR, $rfp, $bucketName, $typeName);
      die "regular file found where we expected a type in $typeName" unless -d $typeDirName;
      my($native, $numbered) = ('produce-native', 'produce-numbered');
      if ($GROUP_NAMES_BY_DIR{$GROUP} eq 'PRIVILEGED') {
        ($native, $numbered) = ('priv-native', 'priv-numbered');
      }
      my $nativeOutputDirOneUp = File::Spec->rel2abs(catfile($OUTPUT_TOPLEVEL_DIR, $native, $rfp, $bucketName));
      my $numberedOutputDir = File::Spec->rel2abs(catfile($OUTPUT_TOPLEVEL_DIR, $numbered, $rfp, $bucketName, $typeName));
      unless (-d $nativeOutputDirOneUp) {
        make_path($nativeOutputDirOneUp, { mode => 0755 }) or die "unable to create path  $nativeOutputDirOneUp: $!";
      }
      unless (-d $numberedOutputDir) {
        make_path($numberedOutputDir, { mode => 0755 }) or die "unable to create path  $numberedOutputDir: $!";
      }
      my $destDir = catfile($nativeOutputDirOneUp, $typeName);
      if ($typeName =~ /email/i) {
        ProcessMailDir($rfp, $typeDirName, $numberedOutputDir);
      } else {
        ProcessDocumentDirectory($rfp, $typeDirName, $numberedOutputDir);
      }
      die "cannot move to the directory we want this in" unless -d $nativeOutputDirOneUp;
      rename($typeDirName, $destDir) or die "unable to move $typeDirName to $destDir: $!";
      #      move($typeDirName, $nativeOutputDirOneUp) or die "unable to move($typeDirName, $nativeOutputDirOneUp)";

      # Note: the above doesn't atually rename the directory from one place
      # to another; it moves the file contents into the destination directory.  IOW, File::Copy->move() doesn't have POSIX mv
    }
    closedir $bucketDH;
  }
  closedir $rfpDH;
}
closedir $topDH;

open my $csvFH, ">:encoding(utf8)", $csvLogFile;

$csvOutFormat->say($csvFH, \@headerFields);
$csvOutFormat->say($csvFH, $_) for @CSV_OUTPUT_ROWS; close $csvFH;

print STDERR "$GROUP ($GROUP_NAMES_BY_DIR{$GROUP}) starts at $upiStart and ends at $upiCurrentNum\n";
open($upiFH, ">", $upiNumberFile);
print $upiFH ++$upiCurrentNum, "\n";
close $upiFH;
###############################################################################
#
# Local variables:
# compile-command: "perl -c build-label-upi-number.plx"
# End: