small-hacks/normalize-text.plx

#!/usr/bin/perl
# normalize-text.plx                                               -*- Perl -*-
#
#   Copyright (C) 2008, 2011 Bradley M. Kuhn.
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of either the GNU General Public License; either Version
# 1, or (at your option) any later version, or under the terms of the
# Artistic License.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See either the GNU General Public
# License or the Artistic License for more details.
#
# Please see the LICENSE file that was shipped with this distribution for
# more details about the licensing of this program.
use strict;
use warnings;

use Text::Autoformat  qw(autoformat break_wrap);;

# getText(TEXT)
#
# Re-flows TEXT into paragraphs and returns the result of passing it through
# Text::Autoformat's autoformat() with a right margin of 72.
#
# All heuristics are based on the average length of the non-blank lines
# (after leading/trailing whitespace has been stripped):
#   * a line containing a run of ten or more whitespace characters, or a
#     short line containing something like "12. ", is kept on its own as a
#     title/heading, separated from its neighbours by blank lines;
#   * a line ending in "-" is handed to _handleDeHyphen() together with the
#     following line so that a word split across the two can be rejoined;
#   * any other line of at most (average - 5) characters ends a paragraph,
#     while longer lines are joined to the following line with one space.
#
# Returns the empty string when TEXT is undefined or has no non-blank lines,
# since no average line length can be computed in that case.
sub getText {
  my $text = shift;

  return '' unless defined $text;

  my @lines = split(/\n/, $text);
  my $newText = '';

  # First, find out the average length of a line.  Each line is trimmed in
  # place ($line aliases the array element); blank lines are not counted.
  my $count = 0;
  my $totLen = 0;
  for my $line (@lines) {
    $line =~ s/^\s*//;  $line =~ s/\s*$//;
    next if $line =~ /^\s*$/;
    $count++;
    $totLen += length($line);
  }

  # Nothing but blank lines: bail out instead of dividing by zero below.
  return '' unless $count;
  my $avgLen = $totLen / $count;

  # Now, the loop that:
  #    (a) tries to find paragraphs
  #    (b) attempts to un-hyphenate words

  my $inPara = 0;
  my $cutOffLen = $avgLen - 5;
  for (my $ii = 0; $ii < @lines; $ii++) {
    my $curLen = length($lines[$ii]);
    if ($lines[$ii] =~ /\s{10,}/ or
        ($lines[$ii] =~ /\s*\d+\.\s+/ and $curLen <= $cutOffLen)) {
      # Assume that any line containing a run of ten or more whitespace
      # characters (leading whitespace was stripped above, so this can only
      # be interior spacing), or any short line containing an "N. " style
      # number, is a title, heading or other stand alone unit of some sort.

      $newText .= "\n\n" if ($newText !~ /\n\n$/s or $inPara);

      $newText .= $lines[$ii] . "\n";
      # Add another newline if one doesn't follow.  The last line has no
      # successor, which is treated the same as being followed by a blank.
      $newText .= "\n"
        unless ($ii >= $#lines or $lines[$ii+1] =~ /^\s*$/);
      $inPara = 0;
      next;
    }

    # Only try to de-hyphenate when there really is a following line;
    # assigning to $lines[$ii+1] on the last line would grow the array and
    # make this loop process a bogus extra (undefined) line.
    if ($ii < $#lines and $lines[$ii] =~ /\-$/) {
      ($lines[$ii], $lines[$ii+1]) =
        _handleDeHyphen($lines[$ii], $lines[$ii+1]);
    }

    $curLen = length($lines[$ii]);  # May have changed
    if ($curLen <= $cutOffLen) {
      $newText .= $lines[$ii] . "\n";
      # Add another newline if one doesn't follow so the para is separated
      $newText .= "\n"
        unless ($ii >= $#lines or $lines[$ii+1] =~ /^\s*$/);
      $inPara = 0;
    } else {
      $newText .= $lines[$ii] . " ";
      $inPara = 1;
    }
  }
  return autoformat($newText, {break=>break_wrap, all=>1, left=>0, right=>72});

}

# _handleDeHyphen([SELF,] FIRSTLINE, SECONDLINE)
#
# Tries to rejoin a word that was hyphenated across a line break: the
# fragment before the trailing "-" of FIRSTLINE is glued to the first word
# of SECONDLINE, and any punctuation that directly followed that word moves
# up to FIRSTLINE with it.
#
# May be called as a plain function with just the two lines, or method-style
# with a leading SELF reference.  When SELF carries a {speller} object the
# join is kept only if $self->{speller}->check(WORD) returns true; without a
# speller the joined word is accepted unverified.
#
# Returns the list (FIRSTLINE, SECONDLINE): rewritten when a join was made,
# unchanged otherwise (including when either line is undefined).
sub _handleDeHyphen {
  # Only treat the first argument as SELF when one was really supplied.
  # Unconditionally unpacking a $self misaligned the arguments for callers
  # that pass just the two lines, which clobbered the first line with the
  # second and returned undef for the second.
  my $self = (@_ > 2 or ref $_[0]) ? shift(@_) : undef;
  my ($origFirstLine, $origSecondLine) = @_;

  return ($origFirstLine, $origSecondLine)
    unless (defined $origFirstLine and defined $origSecondLine);

  my ($firstLine, $secondLine) = ($origFirstLine, $origSecondLine);
  # Strip the trailing "fragment-" off the first line, remembering the
  # fragment; optional opening brackets/commas before it stay on the line.
  if ($firstLine =~ s/^(.*\s+[\[\(,]*)(\S+)\-\s*$/$1/) {
    my $word = $2;
    # Take the leading word of the second line plus the whitespace or
    # closing punctuation after it; the rest remains as the second line.
    if ($secondLine =~ s/^\s*(\w+)([\s\.\,\)\]]+)(.*)$/$3/) {
      $word .= $1;
      my $buffer = $2;
      my $firstLineRebuild = "$firstLine$word";
      # Carry punctuation along with the word, but not bare whitespace.
      $firstLineRebuild .= $buffer unless ($buffer =~ /^\s*$/);
      my $speller = ref($self) ? $self->{speller} : undef;
      return ("$firstLineRebuild", $secondLine)
        if (not defined $speller or $speller->check($word));
    }
  }
  return ($origFirstLine, $origSecondLine);
}


# Read all of the input (every file named on the command line, or STDIN when
# none is given) and print the normalized text.  join() yields the empty
# string rather than undef when there is no input at all, so getText() is
# never handed an undefined value.
my $data = join('', <>);
print getText($data);
###############################################################################
# Local variables:
# compile-command: "perl -c normalize-text.plx"
# End: