Create normalize text script, which was ripped out from some work I did on unidiff.
This commit is contained in:
parent
c00b12b037
commit
2f2bfad041
1 changed files with 103 additions and 0 deletions
103
normalize-text.plx
Normal file
103
normalize-text.plx
Normal file
|
@ -0,0 +1,103 @@
|
|||
# Text.pm -*- Perl -*-
|
||||
#
|
||||
# Copyright (C) 2008, 2011 Bradley M. Kuhn.
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify it
|
||||
# under the terms of either the GNU General Public License; either Version
|
||||
# 1, or (at your option) any later version, or under the terms of the
|
||||
# Artistic License.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but
|
||||
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||||
# or FITNESS FOR A PARTICULAR PURPOSE. See either the GNU General Public
|
||||
# License or the Artistic License for more details.
|
||||
#
|
||||
# Please see the LICENSE file that was shipped with this distribution for
|
||||
# more details about the licensing of this program.
|
||||
use strict;
|
||||
use warnings;
|
||||
|
||||
use Text::Autoformat qw(autoformat break_wrap);;
|
||||
|
||||
sub getText {
|
||||
my $text = shift;
|
||||
|
||||
my @lines = split(/\n/, $text);
|
||||
my $lines = \@lines;
|
||||
my $newText;
|
||||
|
||||
# First, fine out the average length of a line.
|
||||
my $count = 0;
|
||||
my $totLen = 0;
|
||||
for (my $ii = 0; $ii < @lines; $ii++) {
|
||||
$lines->[$ii] =~ s/^\s*//; $lines->[$ii] =~ s/\s*$//;
|
||||
if ($lines->[$ii] !~ /^\s*$/) {
|
||||
$count++;
|
||||
$totLen += length($lines->[$ii]);
|
||||
}
|
||||
}
|
||||
my $avgLen = $totLen / $count;
|
||||
|
||||
# Now, the loop that:
|
||||
# (a) tries to find paragraphs
|
||||
# (b) attempts to un-hyphenate words
|
||||
|
||||
my $inPara = 0;
|
||||
my $cutOffLen = $avgLen - 5;
|
||||
for (my $ii = 0; $ii < @lines; $ii++) {
|
||||
my $curLen = length($lines->[$ii]);
|
||||
if ($lines->[$ii] =~ /\s{10,}/ or
|
||||
($lines->[$ii] =~ /\s*\d+\.\s+/ and $curLen <= $cutOffLen)) {
|
||||
# Assume that any line that starts with ten spaces or more is a
|
||||
# title, heading or other stand alone unit of some sort.
|
||||
|
||||
$newText .= "\n\n" if ($newText !~ /\n\n$/s or $inPara);
|
||||
|
||||
$newText .= $lines->[$ii] . "\n";
|
||||
# Add another newline if one doesn't follow
|
||||
$newText .= "\n" unless $lines->[$ii+1] =~ /^\s*$/;
|
||||
$inPara = 0;
|
||||
next;
|
||||
}
|
||||
($lines->[$ii],$lines->[$ii+1]) =
|
||||
$self->_handleDeHyphen($lines->[$ii],$lines->[$ii+1])
|
||||
if ($lines->[$ii] =~ /\-$/);
|
||||
|
||||
$curLen = length($lines->[$ii]); # May have changed
|
||||
if ($curLen <= $cutOffLen) {
|
||||
$newText .= $lines->[$ii] . "\n";
|
||||
# Add another newline if one doesn't follow so the para is separated
|
||||
$newText .= "\n" unless $lines->[$ii+1] =~ /^\s*$/;
|
||||
$inPara = 0;
|
||||
} else {
|
||||
$newText .= $lines->[$ii] . " ";
|
||||
$inPara = 1;
|
||||
}
|
||||
}
|
||||
return autoformat($newText, {break=>break_wrap, all=>1, left=>0, right=>72});
|
||||
|
||||
}
|
||||
|
||||
sub _handleDeHyphen {
|
||||
my($self, $origFirstLine, $origSecondLine) = @_;
|
||||
my ($firstLine, $secondLine) = ($origFirstLine, $origSecondLine);
|
||||
if ($firstLine =~ s/^(.*\s+[\[\(,]*)(\S+)\-\s*$/$1/) {
|
||||
my $word = $2;
|
||||
if ($secondLine =~ s/^\s*(\w+)([\s\.\,\)\]]+)(.*)$/$3/) {
|
||||
$word .= $1;
|
||||
my $buffer = $2;
|
||||
my $firstLineRebuild = "$firstLine$word";
|
||||
$firstLineRebuild .= $buffer unless ($buffer =~ /^\s*$/);
|
||||
return ("$firstLineRebuild", $secondLine)
|
||||
if ($self->{speller}->check($word));
|
||||
}
|
||||
}
|
||||
return ($origFirstLine, $origSecondLine);
|
||||
}
|
||||
|
||||
|
||||
my $data;
|
||||
while (my $line = <>) {
|
||||
$data .= $line;
|
||||
}
|
||||
print getText($data);
|
Loading…
Add table
Reference in a new issue