Create normalize text script, which was ripped out from some work I did on unidiff.

2011-08-11 07:59:34 -04:00 · 2011-08-11 07:59:34 -04:00 · 2f2bfad041
commit 2f2bfad041
parent c00b12b037
1 changed files with 103 additions and 0 deletions
--- a/normalize-text.plx
+++ b/normalize-text.plx
@ -0,0 +1,103 @@
+# normalize-text.plx                                               -*- Perl -*-
+#
+#   Copyright (C) 2008, 2011 Bradley M. Kuhn.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of either the GNU General Public License; either Version
+# 1, or (at your option) any later version, or under the terms of the
+# Artistic License.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See either the GNU General Public
+# License or the Artistic License for more details.
+#
+# Please see the LICENSE file that was shipped with this distribution for
+# more details about the licensing of this program.
+use strict;
+use warnings;
+
+use Text::Autoformat  qw(autoformat break_wrap);;
+
+# getText($text)
+#
+# Re-flow a blob of raw text into paragraphs and return the result of
+# running it through Text::Autoformat's autoformat() (wrapped at column 72).
+#
+#   $text   - the whole input as a single string, lines separated by "\n".
+#
+# Returns the formatted text, or the empty string when $text is undefined
+# or contains no non-blank line (there is nothing to average or format).
+#
+# Heuristics used:
+#   * the average length of the non-blank (trimmed) lines is computed, and
+#     any line at least five characters shorter than that is taken to end
+#     a paragraph;
+#   * a line containing a run of ten or more whitespace characters, or a
+#     short line containing "<digits>. ", is treated as a stand-alone unit;
+#   * a line ending in "-" is handed to _handleDeHyphen() together with
+#     the line that follows it.
+sub getText {
+  my $text = shift;
+
+  return '' unless defined $text;
+
+  my @lines = split(/\n/, $text);
+
+  # First, find out the average length of a non-blank line.  Each line is
+  # trimmed in place here, so everything below sees lines without leading
+  # or trailing whitespace.
+  my $count = 0;
+  my $totLen = 0;
+  foreach my $line (@lines) {
+    $line =~ s/^\s+//;
+    $line =~ s/\s+$//;
+    next if $line eq '';
+    $count++;
+    $totLen += length($line);
+  }
+
+  # Without a single non-blank line the average below would divide by zero.
+  return '' unless $count;
+  my $avgLen = $totLen / $count;
+
+  # Now, the loop that:
+  #    (a) tries to find paragraphs
+  #    (b) attempts to un-hyphenate words
+
+  my $newText = '';
+  my $inPara = 0;
+  my $cutOffLen = $avgLen - 5;
+  for (my $ii = 0; $ii < @lines; $ii++) {
+    # Only look ahead when a following line really exists; touching
+    # $lines[$ii+1] on the last line would grow the array and cause a
+    # spurious extra iteration.
+    my $hasNext = ($ii < $#lines);
+    my $curLen = length($lines[$ii]);
+
+    if ($lines[$ii] =~ /\s{10,}/ or
+        ($lines[$ii] =~ /\s*\d+\.\s+/ and $curLen <= $cutOffLen)) {
+      # Assume this is a title, heading or other stand-alone unit of some
+      # sort.  Because the lines were trimmed above, the first test matches
+      # a run of ten or more whitespace characters inside the line.
+
+      $newText .= "\n\n" if ($newText !~ /\n\n$/s or $inPara);
+
+      $newText .= $lines[$ii] . "\n";
+      # Add another newline if a blank line doesn't follow
+      $newText .= "\n" if ($hasNext and $lines[$ii+1] !~ /^\s*$/);
+      $inPara = 0;
+      next;
+    }
+
+    # No object is available in this script, so no speller is passed.
+    ($lines[$ii], $lines[$ii+1]) =
+      _handleDeHyphen(undef, $lines[$ii], $lines[$ii+1])
+        if ($hasNext and $lines[$ii] =~ /\-$/);
+
+    $curLen = length($lines[$ii]);  # May have changed
+    if ($curLen <= $cutOffLen) {
+      # A short line ends the current paragraph.
+      $newText .= $lines[$ii] . "\n";
+      # Add another newline if one doesn't follow so the para is separated
+      $newText .= "\n" if ($hasNext and $lines[$ii+1] !~ /^\s*$/);
+      $inPara = 0;
+    } else {
+      # A full-length line: keep accumulating the paragraph on one line.
+      $newText .= $lines[$ii] . " ";
+      $inPara = 1;
+    }
+  }
+  return autoformat($newText, {break=>break_wrap, all=>1, left=>0, right=>72});
+
+}
+
+# _handleDeHyphen($self, $origFirstLine, $origSecondLine)
+#
+# Try to re-join a word that was split with a hyphen across two lines.
+#
+#   $self            - optional; when it is a reference holding a
+#                      {speller} entry, that object's check($word) method
+#                      must return true for the join to be accepted.
+#                      Pass undef to join without a spelling check.
+#   $origFirstLine   - line ending in "<fragment>-".
+#   $origSecondLine  - following line, expected to start with the rest of
+#                      the word followed by whitespace or one of . , ) ]
+#
+# Returns a two-element list (first line, second line).  On success the
+# joined word (plus any punctuation that directly followed it) is moved to
+# the end of the first line and removed from the second.  If either line
+# does not have the expected shape, or the speller rejects the joined
+# word, both lines are returned unmodified.
+#
+# NOTE: without a speller a genuinely hyphenated compound that happens to
+# break at the hyphen is joined as well; this is a best-effort heuristic.
+sub _handleDeHyphen {
+  my($self, $origFirstLine, $origSecondLine) = @_;
+  my ($firstLine, $secondLine) = ($origFirstLine, $origSecondLine);
+
+  return ($origFirstLine, $origSecondLine)
+    unless (defined $firstLine and defined $secondLine);
+
+  my $speller;
+  $speller = $self->{speller} if (ref($self) and defined $self->{speller});
+
+  # Peel the trailing "<fragment>-" off the first line.
+  if ($firstLine =~ s/^(.*\s+[\[\(,]*)(\S+)\-\s*$/$1/) {
+    my $word = $2;
+    # Peel the leading word and its trailing punctuation off the second.
+    if ($secondLine =~ s/^\s*(\w+)([\s\.\,\)\]]+)(.*)$/$3/) {
+      $word .= $1;
+      my $buffer = $2;
+      my $firstLineRebuild = "$firstLine$word";
+      # Keep punctuation that followed the word; drop pure whitespace.
+      $firstLineRebuild .= $buffer unless ($buffer =~ /^\s*$/);
+      return ($firstLineRebuild, $secondLine)
+        if (not defined $speller or $speller->check($word));
+    }
+  }
+  return ($origFirstLine, $origSecondLine);
+}
+
+
+# Read every line of the files named on the command line (or of STDIN when
+# none are given) into one string.  join() yields '' rather than undef when
+# there is no input at all.
+my $data = join('', <>);
+
+# Input without any non-whitespace character has nothing to normalize;
+# skipping it also keeps getText() from averaging over zero lines.
+print getText($data) if ($data =~ /\S/);