summaryrefslogtreecommitdiffstats
path: root/third_party/hyphen/substrings.pl
diff options
context:
space:
mode:
authorrolandsteiner@chromium.org <rolandsteiner@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2010-10-29 06:05:56 +0000
committerrolandsteiner@chromium.org <rolandsteiner@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2010-10-29 06:05:56 +0000
commitdff8c1108d0b414ea73ae0e0d0d904e09856cdd6 (patch)
treeadae2223b4034afb2f017346f2cec11bd32bb4e3 /third_party/hyphen/substrings.pl
parent906eed0d5bb752a29adbfa86054df1bebd507a80 (diff)
downloadchromium_src-dff8c1108d0b414ea73ae0e0d0d904e09856cdd6.zip
chromium_src-dff8c1108d0b414ea73ae0e0d0d904e09856cdd6.tar.gz
chromium_src-dff8c1108d0b414ea73ae0e0d0d904e09856cdd6.tar.bz2
Add Hunspell 'Hyphen' hyphenation library.
BUG=60895 TEST=none Review URL: http://codereview.chromium.org/4143003 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@64368 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'third_party/hyphen/substrings.pl')
-rwxr-xr-xthird_party/hyphen/substrings.pl169
1 files changed, 169 insertions, 0 deletions
diff --git a/third_party/hyphen/substrings.pl b/third_party/hyphen/substrings.pl
new file mode 100755
index 0000000..00a02c2
--- /dev/null
+++ b/third_party/hyphen/substrings.pl
@@ -0,0 +1,169 @@
+#!/usr/bin/perl
+# convert TeX (Patgen) hyphenation patterns to Libhnj format
+# (A utility for finding substring embeddings in patterns)
+# usage: substrings.pl infile outfile [encoding [lefthyphenmin [righthyphenmin]]]
+
+# Check the command line, open the pattern file (HYPH) and the result file
+# (OUT), and write the optional header lines to the result file.
+#   $ARGV[0]  input pattern file; "hyphen.us" is used when it does not exist
+#   $ARGV[1]  output file (required)
+#   $ARGV[2]  encoding name, copied verbatim as the first output line
+#   $ARGV[3]  value for the LEFTHYPHENMIN header line
+#   $ARGV[4]  value for the RIGHTHYPHENMIN header line
+if (!defined $ARGV[1]) {
+    print "" .
+"substrings.pl - convert TeX (Patgen) hyphenation patterns to Libhnj format\n" .
+"(A utility for finding substring embeddings in patterns)\n" .
+"usage: substrings.pl infile outfile [encoding [lefthyphenmin [righthyphenmin]]]\n";
+    exit 1;
+}
+$fn = $ARGV[0];
+if (!-e $fn) { $fn = "hyphen.us"; }
+# Three-argument open: the file name is never parsed for a mode character.
+# Stop with a message instead of silently producing an empty result when a
+# file cannot be opened.
+open(HYPH, '<', $fn)
+    or die "substrings.pl: cannot open input file '$fn': $!\n";
+open(OUT, '>', $ARGV[1])
+    or die "substrings.pl: cannot open output file '$ARGV[1]': $!\n";
+# $encoding stays a package variable because enclen() reads it.
+$encoding = $ARGV[2];
+$lhmin = $ARGV[3];
+$rhmin = $ARGV[4];
+if (defined $encoding) { print OUT "$encoding\n"; }
+if (defined $lhmin) { print OUT "LEFTHYPHENMIN $lhmin\n"; }
+if (defined $rhmin) { print OUT "RIGHTHYPHENMIN $rhmin\n"; }
+
+# Read the pattern file line by line and fill the lookup tables:
+#   @patlist  patterns with their digits removed, in input order
+#   %pattab   digit-free pattern -> original pattern (digits included)
+#   %repltab, %replbeg, %repllen
+#             replacement text, zero-based start and length for lines of
+#             the form "pattern/replacement" or
+#             "pattern/replacement,start,length"
+while (<HYPH>)
+{
+    # Remove a "% comment" (and the blanks in front of it) from the current
+    # line.  A line that holds nothing but a comment becomes empty and is
+    # skipped, because none of the branches below match an empty line.
+    s/\s*%.*$//;
+    if (/^(.+)\/([^,]+),([0-9]+),([0-9]+)$/) {
+        # pattern/replacement,start,length
+        $origpat = $1;
+        $pat = $1;
+        $repl = $2;
+        $beg = $3;
+        $len = $4;
+        $pat =~ s/\d//g;
+        # A replacement only makes sense at a hyphenation point, i.e. the
+        # pattern must contain at least one digit.
+        if ($origpat eq $pat) {
+            print "error - missing hyphenation point: $_";
+            exit 1;
+        }
+        push @patlist, $pat;
+        $pattab{$pat} = $origpat;
+        $repltab{$pat} = $repl;
+        # The file counts from 1; the tables store the start zero-based.
+        $replbeg{$pat} = $beg - 1;
+        $repllen{$pat} = $len;
+    } elsif (/^(.+)\/(.+)$/) {
+        # pattern/replacement without explicit start and length: the
+        # replacement covers the whole pattern.
+        $origpat = $1;
+        $pat = $1;
+        $repl = $2;
+        $pat =~ s/\d//g;
+        if ($origpat eq $pat) {
+            print "error - missing hyphenation point: $_";
+            exit 1;
+        }
+        push @patlist, $pat;
+        $pattab{$pat} = $origpat;
+        $repltab{$pat} = $repl;
+        $replbeg{$pat} = 0;
+        $repllen{$pat} = enclen($pat);
+    } elsif (/^(.+)$/) {
+        # plain pattern
+        $origpat = $1;
+        $pat = $1;
+        $pat =~ s/\d//g;
+        push @patlist, $pat;
+        $pattab{$pat} = $origpat;
+    }
+}
+
+# For every pattern, examine each of its substrings.  Whenever a substring is
+# itself a key of %pattab, the prefix of the pattern that ends with that
+# substring is recorded as an output pattern in %newpattab / @newpatlist.
+# The entries of @patlist (and so $pat here) have had their digits removed
+# by the reading loop; %pattab maps them back to the original spelling.
+# Progress messages are printed to standard output.
+foreach $pat (@patlist) {
+ $patsize = length $pat;
+ # $i is the start offset of the substring, $j its length.
+ for $i (0..$patsize - 1) {
+ for $j (1..$patsize - $i) {
+ $subpat = substr ($pat, $i, $j);
+ if (defined $pattab{$subpat}) {
+ print "$pattab{$subpat} is embedded in $pattab{$pat}\n";
+ # Key of the output pattern: everything up to the end of the substring.
+ $newpat = substr $pat, 0, $i + $j;
+ if (!defined $newpattab{$newpat}) {
+ # First hit for this key: digit-free prefix followed by the embedded
+ # pattern in its original spelling (with digits).
+ $newpattab{$newpat} =
+ substr ($pat, 0, $i).$pattab{$subpat};
+ $ss = substr $pat, 0, $i;
+ print "$ss+$pattab{$subpat}\n";
+ push @newpatlist, $newpat;
+ if (defined $repltab{$subpat}) {
+ # Carry the replacement over, shifting its start by the length of the
+ # prefix as measured by enclen().  One is subtracted when the pattern
+ # starts with '.' and the embedded pattern does not.
+ $begcorr = (($pat =~ /^[.]/) && !($subpat =~ /^[.]/)) ? 1 : 0;
+ $newrepltab{$newpat} = $repltab{$subpat};
+ $newreplbeg{$newpat} = $replbeg{$subpat} + enclen($ss) - $begcorr;
+ $newrepllen{$newpat} = $repllen{$subpat};
+ }
+ } else {
+ # The key already exists: merge the embedded pattern into the stored
+ # one with combine() and report the before/after values.
+ $tmp = $newpattab{$newpat};
+ $newpattab{$newpat} =
+ combine ($newpattab{$newpat}, $pattab{$subpat});
+ print "$tmp + $pattab{$subpat} -> $newpattab{$newpat}\n";
+ }
+ }
+ }
+ }
+}
+
+# Write the generated patterns to OUT, one per line, in the order in which
+# they were added to @newpatlist.  A pattern that has a replacement gets
+# "/replacement,start,length" appended; the start is stored zero-based and
+# written one-based.
+for my $key (@newpatlist) {
+    my $line = $newpattab{$key};
+    if (defined $newrepltab{$key}) {
+        $line .= "/" . $newrepltab{$key}
+               . "," . ($newreplbeg{$key} + 1)
+               . "," . $newrepllen{$key};
+    }
+    print OUT $line . "\n";
+}
+
+# Convert 'n1im' to 0n1i0m0, expressed as a list.
+# Takes one pattern string and returns its characters as a list in which
+# a 0 has been inserted wherever two non-digit characters are adjacent, as
+# well as before a leading and after a trailing non-digit.  An empty
+# pattern yields the one-element list (0).
+sub expand {
+    my ($pat) = @_;
+    my @cells;
+    # The position before the first character counts as a non-digit.
+    my $prev_nondigit = 1;
+
+    for my $ch (split (//, $pat)) {
+        my $nondigit = ($ch =~ /[\D]/) ? 1 : 0;
+        push @cells, 0 if $prev_nondigit && $nondigit;
+        push @cells, $ch;
+        $prev_nondigit = $nondigit;
+    }
+    push @cells, 0 if $prev_nondigit;
+    return @cells;
+}
+
+# Combine two patterns, i.e. .ad4der + a2d becomes .a2d4der
+# The second pattern needs to be a substring of the first (modulo digits)
+#
+# Arguments: the containing pattern, then the embedded pattern, both in
+# their spelling with digits.
+# Returns the containing pattern in which, at every place where the
+# embedded pattern occurs, each hyphenation value has been raised to the
+# larger of the two.
+# Side effects: prints one line per occurrence to standard output, and may
+# copy the embedded pattern's entry in %newrepltab / %newreplbeg /
+# %newrepllen to the containing pattern.
+sub combine {
+    my @exp = expand(shift);
+    my @subexp = expand(shift);
+    # Parenthesised so that both names are lexical ("my $a, $b" only
+    # declares the first one).
+    my ($pat1, $pat2);
+
+    # Digit-free spellings; this also drops the zeros added by expand().
+    $pat1 = join ('', map { $_ =~ /\d/ ? () : $_ } @exp);
+    $pat2 = join ('', map { $_ =~ /\d/ ? () : $_ } @subexp);
+
+    # Kept lexical so the caller's variable of the same name is untouched.
+    my $begcorr = ($pat1 =~ /^[.]/) ? 1 : 0;
+
+    for my $i (0..length ($pat1) - length ($pat2)) {
+        # Compare against this call's own second argument, not against a
+        # variable that happens to be set by the caller.
+        if (substr ($pat1, $i, length $pat2) eq $pat2) {
+            # Even indexes of the expanded lists hold the hyphenation values.
+            for (my $j = 0; $j < @subexp; $j += 2) {
+                if ($subexp[$j] > $exp[2 * $i + $j]) {
+                    $exp[2 * $i + $j] = $subexp[$j];
+                    if (defined $newrepltab{$pat2} && !defined $newrepltab{$pat1}) {
+                        my $ss = substr ($pat1, 0, $i);
+                        $newrepltab{$pat1} = $newrepltab{$pat2};
+                        $newreplbeg{$pat1} = $newreplbeg{$pat2} + enclen($ss) - $begcorr;
+                        $newrepllen{$pat1} = $newrepllen{$pat2};
+                    }
+                }
+            }
+            print ("$pat1 includes $pat2 at pos $i\n");
+        }
+    }
+    # Drop the zero entries again to get back to the compact spelling.
+    return join ('', map { $_ eq '0' ? () : $_ } @exp);
+}
+
+# 8 bit or UTF-8 character length (calculating right start position for discretionary hyphenation)
+#
+# Takes one string and returns its length in characters.  When the package
+# variable $encoding is "UTF-8", bytes whose two top bits are '10' are not
+# counted; for any other (or no) encoding the result is length() of the
+# string.
+sub enclen {
+    my $nonchar = 0;
+    my $len = length($_[0]);
+    if (defined $encoding && $encoding eq "UTF-8") {
+        # length of an UTF-8 string equals to the count of the characters not started with '10' bits
+        # The index is lexical: this sub is called from inside loops that use
+        # the package variable $i as their own loop variable.
+        for my $i (0 .. $len - 1) {
+            if ((ord(substr($_[0], $i, 1)) >> 6) == 2) { $nonchar++; }
+        }
+    }
+    return $len - $nonchar;
+}