diff options
author | rolandsteiner@chromium.org <rolandsteiner@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-10-29 06:05:56 +0000 |
---|---|---|
committer | rolandsteiner@chromium.org <rolandsteiner@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-10-29 06:05:56 +0000 |
commit | dff8c1108d0b414ea73ae0e0d0d904e09856cdd6 (patch) | |
tree | adae2223b4034afb2f017346f2cec11bd32bb4e3 /third_party/hyphen/substrings.pl | |
parent | 906eed0d5bb752a29adbfa86054df1bebd507a80 (diff) | |
download | chromium_src-dff8c1108d0b414ea73ae0e0d0d904e09856cdd6.zip chromium_src-dff8c1108d0b414ea73ae0e0d0d904e09856cdd6.tar.gz chromium_src-dff8c1108d0b414ea73ae0e0d0d904e09856cdd6.tar.bz2 |
Add Hunspell 'Hyphen' hyphenation library.
BUG=60895
TEST=none
Review URL: http://codereview.chromium.org/4143003
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@64368 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'third_party/hyphen/substrings.pl')
-rwxr-xr-x | third_party/hyphen/substrings.pl | 169 |
1 files changed, 169 insertions, 0 deletions
diff --git a/third_party/hyphen/substrings.pl b/third_party/hyphen/substrings.pl new file mode 100755 index 0000000..00a02c2 --- /dev/null +++ b/third_party/hyphen/substrings.pl @@ -0,0 +1,169 @@ +#!/usr/bin/perl +# convert TeX (Patgen) hyphenation patterns to Libhnj format +# (A utility for finding substring embeddings in patterns) +# usage: substrings.pl inputfile outputfile [encoding] + +if (!defined $ARGV[1]) { + print "" . +"substrings.pl - convert TeX (Patgen) hyphenation patterns to Libhnj format\n" . +"(A utility for finding substring embeddings in patterns)\n" . +"usage: substrings.pl infile outfile [encoding [lefthyphenmin [righthyphenmin]]]\n"; + exit 1; +} +$fn = $ARGV[0]; +if (!-e $fn) { $fn = "hyphen.us"; } +open HYPH, $fn; +open OUT, ">$ARGV[1]"; +$encoding = $ARGV[2]; +$lhmin = $ARGV[3]; +$rhmin = $ARGV[4]; +if (defined $encoding) { print OUT "$encoding\n"; } +if (defined $lhmin) { print OUT "LEFTHYPHENMIN $lhmin\n"; } +if (defined $rhmin) { print OUT "RIGHTHYPHENMIN $rhmin\n"; } + +while (<HYPH>) +{ + $pat =~ s/%.*$//g; + if (/^\%/) { + #comment, ignore + } elsif (/^(.+)\/([^,]+),([0-9]+),([0-9]+)$/) { + $origpat = $1; + $pat = $1; + $repl = $2; + $beg = $3; + $len = $4; + $pat =~ s/\d//g; + if ($origpat eq $pat) { + print "error - missing hyphenation point: $_"; + exit 1; + } + push @patlist, $pat; + $pattab{$pat} = $origpat; + $repltab{$pat} = $repl; + $replbeg{$pat} = $beg - 1; + $repllen{$pat} = $len; + } elsif (/^(.+)\/(.+)$/) { + $origpat = $1; + $pat = $1; + $repl = $2; + $pat =~ s/\d//g; + if ($origpat eq $pat) { + print "error - missing hyphenation point: $_"; + exit 1; + } + push @patlist, $pat; + $pattab{$pat} = $origpat; + $repltab{$pat} = $repl; + $replbeg{$pat} = 0; + $repllen{$pat} = enclen($pat); + } elsif (/^(.+)$/) { + $origpat = $1; + $pat = $1; + $pat =~ s/\d//g; + push @patlist, $pat; + $pattab{$pat} = $origpat; + } +} + +foreach $pat (@patlist) { + $patsize = length $pat; + for $i (0..$patsize - 1) { + for $j (1..$patsize - $i) { + $subpat = substr ($pat, $i, $j); + if (defined $pattab{$subpat}) { + print "$pattab{$subpat} is embedded in $pattab{$pat}\n"; + $newpat = substr $pat, 0, $i + $j; + if (!defined $newpattab{$newpat}) { + $newpattab{$newpat} = + substr ($pat, 0, $i).$pattab{$subpat}; + $ss = substr $pat, 0, $i; + print "$ss+$pattab{$subpat}\n"; + push @newpatlist, $newpat; + if (defined $repltab{$subpat}) { + $begcorr = (($pat =~ /^[.]/) && !($subpat =~ /^[.]/)) ? 1 : 0; + $newrepltab{$newpat} = $repltab{$subpat}; + $newreplbeg{$newpat} = $replbeg{$subpat} + enclen($ss) - $begcorr; + $newrepllen{$newpat} = $repllen{$subpat}; + } + } else { + $tmp = $newpattab{$newpat}; + $newpattab{$newpat} = + combine ($newpattab{$newpat}, $pattab{$subpat}); + print "$tmp + $pattab{$subpat} -> $newpattab{$newpat}\n"; + } + } + } + } +} + +foreach $pat (@newpatlist) { + if (defined $newrepltab{$pat}) { + print OUT $newpattab{$pat}."/".$newrepltab{$pat}.",".($newreplbeg{$pat}+1).",".$newrepllen{$pat}."\n"; + } else { + print OUT $newpattab{$pat}."\n"; + } +} + +#convert 'n1im' to 0n1i0m0 expresed as a list +sub expand { + my ($pat) = @_; + my $last = '.'; + my @exp = (); + + foreach $c (split (//, $pat)) { + if ($last =~ /[\D]/ && $c =~ /[\D]/) { + push @exp, 0; + } + push @exp, $c; + $last = $c; + } + if ($last =~ /[\D]/) { + push @exp, 0; + } + return @exp; +} + +# Combine two patterns, i.e. .ad4der + a2d becomes .a2d4der +# The second pattern needs to be a substring of the first (modulo digits) +sub combine { + my @exp = expand shift; + my @subexp = expand shift; + my $pat1, $pat2; + my $i; + + $pat1 = join ('', map { $_ =~ /\d/ ? () : $_ } @exp); + $pat2 = join ('', map { $_ =~ /\d/ ? () : $_ } @subexp); + + $begcorr = ($pat1 =~ /^[.]/) ? 1 : 0; + + for $i (0..length ($pat1) - length ($pat2)) { + if (substr ($pat1, $i, length $pat2) eq $subpat) { + for ($j = 0; $j < @subexp; $j += 2) { + if ($subexp[$j] > $exp[2 * $i + $j]) { + $exp[2 * $i + $j] = $subexp[$j]; + if (defined $newrepltab{$pat2} && !defined $newrepltab{$pat1}) { + $ss = substr ($pat1, 0, $i); + $newrepltab{$pat1} = $newrepltab{$pat2}; + $newreplbeg{$pat1} = $newreplbeg{$pat2} + enclen($ss) - $begcorr; + $newrepllen{$pat1} = $newrepllen{$pat2}; + } + } + } + print ("$pat1 includes $pat2 at pos $i\n"); + } + } + return join ('', map { $_ eq '0' ? () : $_ } @exp); +} + +# 8 bit or UTF-8 character length (calculating right start position for discretionary hyphenation) +sub enclen { + my $nonchar = 0; + my $len = length($_[0]); + if ($encoding eq "UTF-8") { + # length of an UTF-8 string equals to the count of the characters not started with '10' bits + for ($i = 0; $i < $len; $i++) { + if ((ord(substr($_[0], $i, 1)) >> 6) == 2) { $nonchar++; } + } + } + return $len - $nonchar; +} |