# Text.pm                                            -*- Perl -*-
#
#   Copyright (C) 2008, 2011 Bradley M. Kuhn.
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of either the GNU General Public License; either Version
# 1, or (at your option) any later version, or under the terms of the
# Artistic License.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See either the GNU General Public
# License or the Artistic License for more details.
#
# Please see the LICENSE file that was shipped with this distribution for
# more details about the licensing of this program.
use strict;
use warnings;

use Text::Autoformat qw(autoformat break_wrap);

# getText($text [, $speller])
#
# Re-flows raw text: lines are trimmed, grouped into paragraphs using the
# average line length as a heuristic, optionally de-hyphenated across line
# breaks, and finally handed to Text::Autoformat for wrapping at 72 columns.
#
# Parameters:
#   $text    - the text to normalize, lines separated by "\n".
#   $speller - optional; any object with a check($word) method that returns
#              true for a correctly spelled word.  When omitted, words
#              hyphenated across a line break are left untouched, because a
#              join cannot be verified.
#
# Returns the reformatted text, or the empty string when $text is undefined
# or contains no non-blank line.
sub getText {
    my ($text, $speller) = @_;

    return '' unless defined $text;

    my @lines = split /\n/, $text;

    # First, find out the average length of a non-blank line.  Each line is
    # trimmed in place ($line aliases the array element).
    my $count  = 0;
    my $totLen = 0;
    for my $line (@lines) {
        $line =~ s/^\s*//;
        $line =~ s/\s*$//;
        next unless length $line;
        $count++;
        $totLen += length $line;
    }

    # Nothing but blank lines: there is no average to compute (this used to
    # be a division by zero) and nothing to format.
    return '' if $count == 0;

    my $avgLen = $totLen / $count;

    # Lines at least five characters shorter than average are taken to end
    # a paragraph.
    my $cutOffLen = $avgLen - 5;

    # _handleDeHyphen expects a hash reference carrying the speller as its
    # first argument.
    my $context = { speller => $speller };

    # Now, the loop that:
    #    (a) tries to find paragraphs
    #    (b) attempts to un-hyphenate words
    my $newText = '';
    my $inPara  = 0;
    for my $ii (0 .. $#lines) {
        my $hasNext = $ii < $#lines;
        my $curLen  = length $lines[$ii];

        # A run of ten or more whitespace characters anywhere in the
        # (already trimmed) line, or a short line containing something like
        # "12. ", is assumed to be a title, heading or other stand-alone
        # unit of some sort.
        # NOTE(review): neither pattern is anchored, so they match anywhere
        # in the line, not only at its start -- confirm this is intended.
        if ($lines[$ii] =~ /\s{10,}/
            or ($lines[$ii] =~ /\s*\d+\.\s+/ and $curLen <= $cutOffLen)) {

            $newText .= "\n\n" if ($newText !~ /\n\n$/s or $inPara);

            $newText .= $lines[$ii] . "\n";
            # Add another newline if a non-blank line follows.
            $newText .= "\n" if ($hasNext and $lines[$ii + 1] !~ /^\s*$/);
            $inPara = 0;
            next;
        }

        # Only attempt a join when there is a following line; assigning to
        # an element past the end would silently grow @lines.
        if ($hasNext and $lines[$ii] =~ /\-$/) {
            ($lines[$ii], $lines[$ii + 1]) =
                _handleDeHyphen($context, $lines[$ii], $lines[$ii + 1]);
        }

        $curLen = length $lines[$ii];    # May have changed
        if ($curLen <= $cutOffLen) {
            $newText .= $lines[$ii] . "\n";
            # Add another newline if a non-blank line follows, so the
            # paragraph is separated from it.
            $newText .= "\n" if ($hasNext and $lines[$ii + 1] !~ /^\s*$/);
            $inPara = 0;
        }
        else {
            $newText .= $lines[$ii] . " ";
            $inPara = 1;
        }
    }

    return autoformat($newText,
                      {break => break_wrap, all => 1, left => 0, right => 72});
}

# _handleDeHyphen($self, $origFirstLine, $origSecondLine)
#
# Tries to join a word that was hyphenated across a line break.
#
# Parameters:
#   $self           - hash reference (or hash-based object) whose {speller}
#                     entry, if set, is an object with a check($word) method.
#   $origFirstLine  - line ending in the first half of the word plus "-".
#   $origSecondLine - line starting with the second half of the word.
#
# Returns a two-element list (first line, second line).  When the joined word
# is accepted by the speller, the whole word (plus any punctuation that
# directly followed it) is moved to the end of the first line and removed
# from the second.  In every other case -- no speller, no second line, the
# patterns do not match, or the speller rejects the word -- both lines are
# returned unchanged.
sub _handleDeHyphen {
    my ($self, $origFirstLine, $origSecondLine) = @_;

    my $speller = ref($self) ? $self->{speller} : undef;
    return ($origFirstLine, $origSecondLine)
        unless (defined $speller
                and defined $origFirstLine
                and defined $origSecondLine);

    my ($firstLine, $secondLine) = ($origFirstLine, $origSecondLine);

    # Strip the trailing "fragment-" from the first line, remembering the
    # fragment.  The line must contain whitespace before the fragment.
    if ($firstLine =~ s/^(.*\s+[\[\(,]*)(\S+)\-\s*$/$1/) {
        my $word = $2;

        # Strip the leading word characters and the whitespace/punctuation
        # that follows them from the second line.
        if ($secondLine =~ s/^\s*(\w+)([\s\.\,\)\]]+)(.*)$/$3/) {
            $word .= $1;
            my $buffer = $2;

            my $firstLineRebuild = "$firstLine$word";
            # Keep punctuation that followed the word; drop pure whitespace.
            $firstLineRebuild .= $buffer unless ($buffer =~ /^\s*$/);

            return ($firstLineRebuild, $secondLine)
                if ($speller->check($word));
        }
    }
    return ($origFirstLine, $origSecondLine);
}

# Run as a filter only when executed directly, so that loading this file
# from other code does not consume standard input.
unless (caller) {
    my $data = '';
    while (my $line = <>) {
        $data .= $line;
    }
    print getText($data);
}