2011-11-01 12:13:01 -04:00
|
|
|
#!/usr/bin/perl
|
2011-08-11 07:59:34 -04:00
|
|
|
# Text.pm -*- Perl -*-
|
|
|
|
#
|
|
|
|
# Copyright (C) 2008, 2011 Bradley M. Kuhn.
|
|
|
|
#
|
|
|
|
# This program is free software; you can redistribute it and/or modify it
|
|
|
|
# under the terms of either the GNU General Public License; either Version
|
|
|
|
# 1, or (at your option) any later version, or under the terms of the
|
|
|
|
# Artistic License.
|
|
|
|
#
|
|
|
|
# This program is distributed in the hope that it will be useful, but
|
|
|
|
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
|
|
# or FITNESS FOR A PARTICULAR PURPOSE. See either the GNU General Public
|
|
|
|
# License or the Artistic License for more details.
|
|
|
|
#
|
|
|
|
# Please see the LICENSE file that was shipped with this distribution for
|
|
|
|
# more details about the licensing of this program.
|
|
|
|
use strict;
|
|
|
|
use warnings;
|
|
|
|
|
|
|
|
use Text::Autoformat qw(autoformat break_wrap);;
|
|
|
|
|
|
|
|
# getText(TEXT)
#
# Re-flow TEXT, a string of newline-separated lines, into paragraphs and
# return the result after it has been wrapped by Text::Autoformat (left
# margin 0, right margin 72).
#
# Every line is first stripped of leading and trailing whitespace.  The
# average length of the non-blank lines is then used to derive a cut-off
# (average minus 5): a line no longer than the cut-off is taken to end a
# paragraph, a longer line is joined to the following one with a space.
# Lines ending in "-" are handed to _handleDeHyphen() together with the
# following line so a word split across the line break can be rejoined.
#
# Returns the empty string when TEXT is undefined or holds no non-blank
# line (there is nothing to average and nothing to format).
sub getText {
  my $text = shift;
  $text = '' unless defined $text;

  my @lines = split(/\n/, $text);
  my $newText = '';

  # First, find out the average length of a non-blank line.  The loop
  # variable aliases the array element, so the stripping is done in place.
  my $count = 0;
  my $totLen = 0;
  foreach my $line (@lines) {
    $line =~ s/^\s+//;
    $line =~ s/\s+$//;
    next if $line eq '';
    $count++;
    $totLen += length($line);
  }

  # No non-blank line at all: avoid dividing by zero below.
  return '' unless $count;

  my $avgLen = $totLen / $count;

  # Now, the loop that:
  #    (a) tries to find paragraphs
  #    (b) attempts to un-hyphenate words

  my $inPara = 0;
  my $cutOffLen = $avgLen - 5;
  foreach my $ii (0 .. $#lines) {
    # Only look at (or write to) the following line when there is one;
    # touching $lines[$ii+1] on the last line would read undef or grow
    # the array.
    my $hasNext = $ii < $#lines;
    my $curLen = length($lines[$ii]);

    if ($lines[$ii] =~ /\s{10,}/ or
        ($lines[$ii] =~ /\s*\d+\.\s+/ and $curLen <= $cutOffLen)) {
      # Assume this is a title, heading or other stand-alone unit of some
      # sort: either the line contains a run of ten or more whitespace
      # characters (leading indentation was already stripped above, so
      # this is a gap inside the line), or it is a short line containing
      # a number followed by a period and whitespace.

      $newText .= "\n\n" if ($newText !~ /\n\n$/s or $inPara);

      $newText .= $lines[$ii] . "\n";
      # Add another newline if one doesn't follow
      $newText .= "\n" if ($hasNext and $lines[$ii+1] !~ /^\s*$/);
      $inPara = 0;
      next;
    }

    if ($hasNext and $lines[$ii] =~ /\-$/) {
      ($lines[$ii], $lines[$ii+1]) =
        _handleDeHyphen($lines[$ii], $lines[$ii+1]);
      # Guard against the helper handing back an undefined line.
      $lines[$ii]   = '' unless defined $lines[$ii];
      $lines[$ii+1] = '' unless defined $lines[$ii+1];
    }

    $curLen = length($lines[$ii]);   # May have changed
    if ($curLen <= $cutOffLen) {
      # Short line: it ends the current paragraph.
      $newText .= $lines[$ii] . "\n";
      # Add another newline if one doesn't follow so the para is separated
      $newText .= "\n" if ($hasNext and $lines[$ii+1] !~ /^\s*$/);
      $inPara = 0;
    } else {
      # Full-length line: the paragraph continues on the next line.
      $newText .= $lines[$ii] . " ";
      $inPara = 1;
    }
  }
  return autoformat($newText, {break=>break_wrap, all=>1, left=>0, right=>72});
}
|
|
|
|
|
|
|
|
# _handleDeHyphen(FIRST_LINE, SECOND_LINE)
# _handleDeHyphen(SELF, FIRST_LINE, SECOND_LINE)
#
# Try to rejoin a word that was hyphenated across a line break.
# FIRST_LINE is expected to end in "<whitespace><word-start>-" and
# SECOND_LINE to begin with the rest of the word followed by whitespace
# or one of . , ) ].
#
# May be called as a plain function with the two lines, or method-style
# with a leading SELF reference.  When SELF is a hash reference holding
# a {speller} object, the joined word is accepted only if
# $self->{speller}->check(WORD) returns true.  When no speller is
# available the joined word is accepted without a dictionary check.
#
# Returns a two-element list (FIRST, SECOND): on success FIRST has the
# completed word (plus any punctuation that followed it) appended and
# SECOND holds what remained of the second line; otherwise both lines
# are returned exactly as they were passed in.
sub _handleDeHyphen {
  # A leading reference, or a third argument, means we were called
  # method-style; the script's own caller passes just the two lines.
  my $self = (@_ > 2 or ref $_[0]) ? shift : undef;
  my ($origFirstLine, $origSecondLine) = @_;

  # Nothing to join without both lines.
  return ($origFirstLine, $origSecondLine)
    unless (defined $origFirstLine and defined $origSecondLine);

  my ($firstLine, $secondLine) = ($origFirstLine, $origSecondLine);

  # Chop the trailing word fragment (and its hyphen) off the first line.
  if ($firstLine =~ s/^(.*\s+[\[\(,]*)(\S+)\-\s*$/$1/) {
    my $word = $2;
    # Pull the rest of the word, and the separator after it, off the
    # front of the second line.
    if ($secondLine =~ s/^\s*(\w+)([\s\.\,\)\]]+)(.*)$/$3/) {
      $word .= $1;
      my $buffer = $2;
      my $firstLineRebuild = "$firstLine$word";
      # Keep punctuation that followed the word; drop pure whitespace.
      $firstLineRebuild .= $buffer unless ($buffer =~ /^\s*$/);

      # eval so that a SELF that is not a hash reference simply yields
      # no speller instead of dying.
      my $speller = ref $self ? eval { $self->{speller} } : undef;
      return ($firstLineRebuild, $secondLine)
        if (not $speller or $speller->check($word));
    }
  }
  return ($origFirstLine, $origSecondLine);
}
|
|
|
|
|
|
|
|
|
|
|
|
# Driver: read every line from the files named on the command line (or
# from standard input when none are given), normalize the text and print
# the result.  join() yields the empty string for empty input, so
# getText() is never handed an undefined value.
my $data = join('', <>);
print getText($data);
|
2011-11-01 12:13:01 -04:00
|
|
|
###############################################################################
|
|
|
|
# Local variables:
|
|
|
|
# compile-command: "perl -c normalize-text.plx"
|
|
|
|
# End:
|