small-hacks/normalize-text.plx
2011-11-01 12:13:01 -04:00

108 lines
3.3 KiB
Perl
Executable file

#!/usr/bin/perl
# normalize-text.plx -*- Perl -*-
#
# Copyright (C) 2008, 2011 Bradley M. Kuhn.
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of either the GNU General Public License; either Version
# 1, or (at your option) any later version, or under the terms of the
# Artistic License.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See either the GNU General Public
# License or the Artistic License for more details.
#
# Please see the LICENSE file that was shipped with this distribution for
# more details about the licensing of this program.
use strict;
use warnings;
use Text::Autoformat qw(autoformat break_wrap);;
# getText($text)
#
# Re-flow a block of raw text into paragraphs.
#
# The text is split into lines and each line is trimmed.  The average length
# of the non-blank lines is used to derive a cut-off (average minus 5): a
# line at or below the cut-off is taken to end a paragraph, a longer line is
# taken to continue onto the next one.  Lines ending in a hyphen are handed
# to _handleDeHyphen() together with the following line so that a word split
# across the line break can be rejoined.  The result is re-wrapped with
# Text::Autoformat (left margin 0, right margin 72).
#
# Parameters:
#   $text - the text to normalize; undef is treated as the empty string.
#
# Returns the value of autoformat() for the rebuilt text, or the empty
# string when $text contains no non-blank line.
sub getText {
    my $text = shift;
    $text = '' unless defined $text;
    my @lines = split(/\n/, $text);

    # First, find out the average length of a non-blank line.  The loop
    # variable aliases the array element, so the trimming is done in place.
    my $count  = 0;
    my $totLen = 0;
    foreach my $line (@lines) {
        $line =~ s/^\s+//;
        $line =~ s/\s+$//;
        next unless length($line);
        $count++;
        $totLen += length($line);
    }

    # No non-blank line at all: nothing to average over and nothing to
    # format (this also avoids dividing by zero below).
    return '' unless $count;

    my $avgLen = $totLen / $count;

    # True when the line after index $_[0] is blank or does not exist.
    my $nextIsBlank = sub {
        my $next = $lines[ $_[0] + 1 ];
        return (!defined $next or $next =~ /^\s*$/);
    };

    # Now, the loop that:
    #   (a) tries to find paragraphs
    #   (b) attempts to un-hyphenate words
    my $newText   = '';
    my $inPara    = 0;
    my $cutOffLen = $avgLen - 5;
    for my $ii (0 .. $#lines) {
        # Defensive: treat a missing line as a blank one.
        $lines[$ii] = '' unless defined $lines[$ii];
        my $curLen = length($lines[$ii]);

        if ($lines[$ii] =~ /\s{10,}/ or
            ($lines[$ii] =~ /\s*\d+\.\s+/ and $curLen <= $cutOffLen)) {
            # Assume that a line containing a run of ten or more whitespace
            # characters, or a short line containing a "N." style number, is
            # a title, heading or other stand-alone unit of some sort.
            # NOTE(review): leading whitespace was already trimmed above, so
            # the ten-space test can only match a run inside the line.
            $newText .= "\n\n" if ($newText !~ /\n\n$/s or $inPara);
            $newText .= $lines[$ii] . "\n";
            # Add another newline if one doesn't follow
            $newText .= "\n" unless $nextIsBlank->($ii);
            $inPara = 0;
            next;
        }

        # Only try to de-hyphenate when there really is a following line;
        # assigning to $lines[$ii+1] on the last line would grow the array.
        if ($ii < $#lines and $lines[$ii] =~ /\-$/) {
            ($lines[$ii], $lines[$ii+1]) =
                _handleDeHyphen($lines[$ii], $lines[$ii+1]);
            $lines[$ii] = '' unless defined $lines[$ii];
        }
        $curLen = length($lines[$ii]);    # May have changed

        if ($curLen <= $cutOffLen) {
            $newText .= $lines[$ii] . "\n";
            # Add another newline if one doesn't follow so the para is
            # separated
            $newText .= "\n" unless $nextIsBlank->($ii);
            $inPara = 0;
        } else {
            $newText .= $lines[$ii] . " ";
            $inPara = 1;
        }
    }
    return autoformat($newText, {break=>break_wrap, all=>1, left=>0, right=>72});
}
# _handleDeHyphen([$self,] $firstLine, $secondLine)
#
# Try to rejoin a word that was split by a hyphen at the end of $firstLine
# and continued at the start of $secondLine.
#
# Parameters:
#   $self       - optional; only taken from the argument list when three or
#                 more arguments are passed.  If it is a reference, its
#                 {speller} entry (an object with a check($word) method) is
#                 used to validate the rejoined word.
#   $firstLine  - the line that ends with the hyphenated fragment.
#   $secondLine - the line that follows it.
#
# Returns a two-element list (first line, second line).  On success the
# first line ends with the rejoined word (plus any punctuation that
# immediately followed it) and the second line has that word removed.  The
# original lines are returned unchanged when either line is undefined, when
# the lines do not look like a split word, or when a speller is available
# and rejects the rejoined word.  When no speller is available the rejoined
# word is accepted without verification.
sub _handleDeHyphen {
    # The call in this file passes just the two lines, so only treat the
    # first argument as $self when a third argument is present.
    my $self = (@_ >= 3) ? shift : undef;
    my ($origFirstLine, $origSecondLine) = @_;

    # Nothing to join if either line is missing.
    return ($origFirstLine, $origSecondLine)
        unless (defined $origFirstLine and defined $origSecondLine);

    my $speller = ref($self) ? $self->{speller} : undef;
    my ($firstLine, $secondLine) = ($origFirstLine, $origSecondLine);

    # Strip the trailing "fragment-" from the first line, keeping any opening
    # bracket, parenthesis or comma that precedes the fragment.
    if ($firstLine =~ s/^(.*\s+[\[\(,]*)(\S+)\-\s*$/$1/) {
        my $word = $2;
        # Strip the leading word from the second line; $2 is the whitespace
        # and/or closing punctuation that directly followed it.
        if ($secondLine =~ s/^\s*(\w+)([\s\.\,\)\]]+)(.*)$/$3/) {
            $word .= $1;
            my $buffer = $2;
            my $firstLineRebuild = "$firstLine$word";
            # Keep punctuation with the word; drop a pure-whitespace gap.
            $firstLineRebuild .= $buffer unless ($buffer =~ /^\s*$/);
            return ($firstLineRebuild, $secondLine)
                if (!defined $speller or $speller->check($word));
        }
    }
    return ($origFirstLine, $origSecondLine);
}
# Read all input (the files named on the command line, or STDIN) into one
# string.  <> in list context returns every remaining line, so join()
# yields the empty string -- not undef -- when there is no input.
my $data = join('', <>);

# Only run the formatter when there is at least one non-blank character;
# otherwise there is nothing to reformat and nothing is printed.
print getText($data) if $data =~ /\S/;
###############################################################################
# Local variables:
# compile-command: "perl -c normalize-text.plx"
# End: