small-hacks/normalize-text.plx

#!/usr/bin/perl
# Text.pm                                                          -*- Perl -*-
#
#   Copyright (C) 2008, 2011 Bradley M. Kuhn.
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of either the GNU General Public License; either Version
# 1, or (at your option) any later version, or under the terms of the
# Artistic License.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See either the GNU General Public
# License or the Artistic License for more details.
#
# Please see the LICENSE file that was shipped with this distribution for
# more details about the licensing of this program.
use strict;
use warnings;

use Text::Autoformat  qw(autoformat break_wrap);;

sub getText {
  my $text = shift;

  my @lines = split(/\n/, $text);
  my $lines = \@lines;
  my $newText;

  # First, fine out the average length of a line.
  my $count = 0;
  my $totLen = 0;
  for (my $ii = 0; $ii < @lines; $ii++) {
    $lines->[$ii] =~ s/^\s*//;  $lines->[$ii] =~ s/\s*$//;
    if ($lines->[$ii] !~ /^\s*$/) {
      $count++;
      $totLen += length($lines->[$ii]);
    }
  }
  my $avgLen = $totLen / $count;

  # Now, the loop that:
  #    (a) tries to find paragraphs
  #    (b) attempts to un-hyphenate words

  my $inPara = 0;
  my $cutOffLen = $avgLen - 5;
  for (my $ii = 0; $ii < @lines; $ii++) {
    my $curLen = length($lines->[$ii]);
    if ($lines->[$ii] =~ /\s{10,}/ or
        ($lines->[$ii] =~ /\s*\d+\.\s+/ and $curLen <= $cutOffLen)) {
      # Assume that any line that starts with ten spaces or more is a
      # title, heading or other stand alone unit of some sort.

      $newText .= "\n\n" if ($newText !~ /\n\n$/s or $inPara);

      $newText .= $lines->[$ii] . "\n";
      # Add another newline if one doesn't follow
      $newText .= "\n" unless $lines->[$ii+1] =~ /^\s*$/;
      $inPara = 0;
      next;
    }
    ($lines->[$ii],$lines->[$ii+1]) =
      _handleDeHyphen($lines->[$ii],$lines->[$ii+1])
        if ($lines->[$ii] =~ /\-$/);

    $curLen = length($lines->[$ii]);  # May have changed
    if ($curLen <= $cutOffLen) {
      $newText .= $lines->[$ii] . "\n";
      # Add another newline if one doesn't follow so the para is separated
      $newText .= "\n" unless $lines->[$ii+1] =~ /^\s*$/;
      $inPara = 0;
    } else {
      $newText .= $lines->[$ii] . " ";
      $inPara = 1;
    }
  }
  return autoformat($newText, {break=>break_wrap, all=>1, left=>0, right=>72});

}

sub _handleDeHyphen {
  my($self, $origFirstLine, $origSecondLine) = @_;
  my ($firstLine, $secondLine) = ($origFirstLine, $origSecondLine);
  if ($firstLine =~ s/^(.*\s+[\[\(,]*)(\S+)\-\s*$/$1/) {
    my $word = $2;
    if ($secondLine =~ s/^\s*(\w+)([\s\.\,\)\]]+)(.*)$/$3/) {
      $word .= $1;
      my $buffer = $2;
      my $firstLineRebuild = "$firstLine$word";
      $firstLineRebuild .= $buffer unless ($buffer =~ /^\s*$/);
      return ("$firstLineRebuild", $secondLine)
        if ($self->{speller}->check($word));
    }
  }
  return ($origFirstLine, $origSecondLine);
}


my $data;
while (my $line = <>) {
  $data .= $line;
}
print getText($data);
###############################################################################
# Local variables:
# compile-command: "perl -c normalize-text.plx"
# End:
Minor fixes to normalize text. 2011-11-01 12:13:01 -04:00			`#!/usr/bin/perl`
Create normalize text script, which was ripped out from some work I did on unidiff. 2011-08-11 07:59:34 -04:00			`# Text.pm -- Perl --`
			`#`
			`# Copyright (C) 2008, 2011 Bradley M. Kuhn.`
			`#`
			`# This program is free software; you can redistribute it and/or modify it`
			`# under the terms of either the GNU General Public License; either Version`
			`# 1, or (at your option) any later version, or under the terms of the`
			`# Artistic License.`
			`#`
			`# This program is distributed in the hope that it will be useful, but`
			`# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY`
			`# or FITNESS FOR A PARTICULAR PURPOSE. See either the GNU General Public`
			`# License or the Artistic License for more details.`
			`#`
			`# Please see the LICENSE file that was shipped with this distribution for`
			`# more details about the licensing of this program.`
			`use strict;`
			`use warnings;`

			`use Text::Autoformat qw(autoformat break_wrap);;`

			`sub getText {`
			`my $text = shift;`

			`my @lines = split(/\n/, $text);`
			`my $lines = \@lines;`
			`my $newText;`

			`# First, fine out the average length of a line.`
			`my $count = 0;`
			`my $totLen = 0;`
			`for (my $ii = 0; $ii < @lines; $ii++) {`
			`$lines->[$ii] =~ s/^\s//; $lines->[$ii] =~ s/\s$//;`
			`if ($lines->[$ii] !~ /^\s*$/) {`
			`$count++;`
			`$totLen += length($lines->[$ii]);`
			`}`
			`}`
			`my $avgLen = $totLen / $count;`

			`# Now, the loop that:`
			`# (a) tries to find paragraphs`
			`# (b) attempts to un-hyphenate words`

			`my $inPara = 0;`
			`my $cutOffLen = $avgLen - 5;`
			`for (my $ii = 0; $ii < @lines; $ii++) {`
			`my $curLen = length($lines->[$ii]);`
			`if ($lines->[$ii] =~ /\s{10,}/ or`
			`($lines->[$ii] =~ /\s*\d+\.\s+/ and $curLen <= $cutOffLen)) {`
			`# Assume that any line that starts with ten spaces or more is a`
			`# title, heading or other stand alone unit of some sort.`

			`$newText .= "\n\n" if ($newText !~ /\n\n$/s or $inPara);`

			`$newText .= $lines->[$ii] . "\n";`
			`# Add another newline if one doesn't follow`
			`$newText .= "\n" unless $lines->[$ii+1] =~ /^\s*$/;`
			`$inPara = 0;`
			`next;`
			`}`
			`($lines->[$ii],$lines->[$ii+1]) =`
Minor fixes to normalize text. 2011-11-01 12:13:01 -04:00			`_handleDeHyphen($lines->[$ii],$lines->[$ii+1])`
Create normalize text script, which was ripped out from some work I did on unidiff. 2011-08-11 07:59:34 -04:00			`if ($lines->[$ii] =~ /\-$/);`

			`$curLen = length($lines->[$ii]); # May have changed`
			`if ($curLen <= $cutOffLen) {`
			`$newText .= $lines->[$ii] . "\n";`
			`# Add another newline if one doesn't follow so the para is separated`
			`$newText .= "\n" unless $lines->[$ii+1] =~ /^\s*$/;`
			`$inPara = 0;`
			`} else {`
			`$newText .= $lines->[$ii] . " ";`
			`$inPara = 1;`
			`}`
			`}`
			`return autoformat($newText, {break=>break_wrap, all=>1, left=>0, right=>72});`

			`}`

			`sub _handleDeHyphen {`
			`my($self, $origFirstLine, $origSecondLine) = @_;`
			`my ($firstLine, $secondLine) = ($origFirstLine, $origSecondLine);`
			`if ($firstLine =~ s/^(.\s+[\[\(,])(\S+)\-\s*$/$1/) {`
			`my $word = $2;`
			`if ($secondLine =~ s/^\s(\w+)([\s\.\,\)\]]+)(.)$/$3/) {`
			`$word .= $1;`
			`my $buffer = $2;`
			`my $firstLineRebuild = "$firstLine$word";`
			`$firstLineRebuild .= $buffer unless ($buffer =~ /^\s*$/);`
			`return ("$firstLineRebuild", $secondLine)`
			`if ($self->{speller}->check($word));`
			`}`
			`}`
			`return ($origFirstLine, $origSecondLine);`
			`}`


			`my $data;`
			`while (my $line = <>) {`
			`$data .= $line;`
			`}`
			`print getText($data);`
Minor fixes to normalize text. 2011-11-01 12:13:01 -04:00			`###############################################################################`
			`# Local variables:`
			`# compile-command: "perl -c normalize-text.plx"`
			`# End:`