diff options
Diffstat (limited to 'third_party/hyphen/hyphen.c')
-rw-r--r-- | third_party/hyphen/hyphen.c | 1084 |
1 files changed, 0 insertions, 1084 deletions
diff --git a/third_party/hyphen/hyphen.c b/third_party/hyphen/hyphen.c deleted file mode 100644 index ebae510..0000000 --- a/third_party/hyphen/hyphen.c +++ /dev/null @@ -1,1084 +0,0 @@ -/* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both - * licenses follows. - */ - -/* LibHnj - a library for high quality hyphenation and justification - * Copyright (C) 1998 Raph Levien, - * (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org), - * (C) 2001 Peter Novodvorsky (nidd@cs.msu.su) - * (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo) - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Library General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Library General Public License for more details. - * - * You should have received a copy of the GNU Library General Public - * License along with this library; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 02111-1307 USA. -*/ - -/* - * The contents of this file are subject to the Mozilla Public License - * Version 1.0 (the "MPL"); you may not use this file except in - * compliance with the MPL. You may obtain a copy of the MPL at - * http://www.mozilla.org/MPL/ - * - * Software distributed under the MPL is distributed on an "AS IS" basis, - * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL - * for the specific language governing rights and limitations under the - * MPL. - * - */ -#include <stdlib.h> /* for NULL, malloc */ -#include <stdio.h> /* for fprintf */ -#include <string.h> /* for strdup */ - -#ifdef UNX -#include <unistd.h> /* for exit */ -#endif - -#define noVERBOSE - -/* calculate hyphenmin values with long ligature length (2 or 3 characters - * instead of 1 or 2) for comparison with hyphenation without ligatures */ -#define noLONG_LIGATURE - -#ifdef LONG_LIGATURE -#define LIG_xx 1 -#define LIG_xxx 2 -#else -#define LIG_xx 0 -#define LIG_xxx 1 -#endif - -#include "hnjalloc.h" -#include "hyphen.h" - -static char * -hnj_strdup (const char *s) -{ - char *new; - int l; - - l = strlen (s); - new = hnj_malloc (l + 1); - memcpy (new, s, l); - new[l] = 0; - return new; -} - -/* remove cross-platform text line end characters */ -void hnj_strchomp(char * s) -{ - int k = strlen(s); - if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0'; - if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0'; -} - -/* a little bit of a hash table implementation. This simply maps strings - to state numbers */ - -typedef struct _HashTab HashTab; -typedef struct _HashEntry HashEntry; - -/* A cheap, but effective, hack. */ -#define HASH_SIZE 31627 - -struct _HashTab { - HashEntry *entries[HASH_SIZE]; -}; - -struct _HashEntry { - HashEntry *next; - char *key; - int val; -}; - -/* a char* hash function from ASU - adapted from Gtk+ */ -static unsigned int -hnj_string_hash (const char *s) -{ - const char *p; - unsigned int h=0, g; - for(p = s; *p != '\0'; p += 1) { - h = ( h << 4 ) + *p; - if ( ( g = h & 0xf0000000 ) ) { - h = h ^ (g >> 24); - h = h ^ g; - } - } - return h /* % M */; -} - -static HashTab * -hnj_hash_new (void) -{ - HashTab *hashtab; - int i; - - hashtab = hnj_malloc (sizeof(HashTab)); - for (i = 0; i < HASH_SIZE; i++) - hashtab->entries[i] = NULL; - - return hashtab; -} - -static void -hnj_hash_free (HashTab *hashtab) -{ - int i; - HashEntry *e, *next; - - for (i = 0; i < HASH_SIZE; i++) - for (e = hashtab->entries[i]; e; e = next) - { - next = e->next; - hnj_free (e->key); - hnj_free (e); - } - - hnj_free (hashtab); -} - -/* assumes that key is not already present! */ -static void -hnj_hash_insert (HashTab *hashtab, const char *key, int val) -{ - int i; - HashEntry *e; - - i = hnj_string_hash (key) % HASH_SIZE; - e = hnj_malloc (sizeof(HashEntry)); - e->next = hashtab->entries[i]; - e->key = hnj_strdup (key); - e->val = val; - hashtab->entries[i] = e; -} - -/* return val if found, otherwise -1 */ -static int -hnj_hash_lookup (HashTab *hashtab, const char *key) -{ - int i; - HashEntry *e; - i = hnj_string_hash (key) % HASH_SIZE; - for (e = hashtab->entries[i]; e; e = e->next) - if (!strcmp (key, e->key)) - return e->val; - return -1; -} - -/* Get the state number, allocating a new state if necessary. */ -static int -hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string) -{ - int state_num; - - state_num = hnj_hash_lookup (hashtab, string); - - if (state_num >= 0) - return state_num; - - hnj_hash_insert (hashtab, string, dict->num_states); - /* predicate is true if dict->num_states is a power of two */ - if (!(dict->num_states & (dict->num_states - 1))) - { - dict->states = hnj_realloc (dict->states, - (dict->num_states << 1) * - sizeof(HyphenState)); - } - dict->states[dict->num_states].match = NULL; - dict->states[dict->num_states].repl = NULL; - dict->states[dict->num_states].fallback_state = -1; - dict->states[dict->num_states].num_trans = 0; - dict->states[dict->num_states].trans = NULL; - return dict->num_states++; -} - -/* add a transition from state1 to state2 through ch - assumes that the - transition does not already exist */ -static void -hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch) -{ - int num_trans; - - num_trans = dict->states[state1].num_trans; - if (num_trans == 0) - { - dict->states[state1].trans = hnj_malloc (sizeof(HyphenTrans)); - } - else if (!(num_trans & (num_trans - 1))) - { - dict->states[state1].trans = hnj_realloc (dict->states[state1].trans, - (num_trans << 1) * - sizeof(HyphenTrans)); - } - dict->states[state1].trans[num_trans].ch = ch; - dict->states[state1].trans[num_trans].new_state = state2; - dict->states[state1].num_trans++; -} - -#ifdef VERBOSE -HashTab *global; - -static char * -get_state_str (int state) -{ - int i; - HashEntry *e; - - for (i = 0; i < HASH_SIZE; i++) - for (e = global->entries[i]; e; e = e->next) - if (e->val == state) - return e->key; - return NULL; -} -#endif - -HyphenDict * -hnj_hyphen_load (const char *fn) -{ - HyphenDict *result; - FILE *f; - f = fopen (fn, "r"); - if (f == NULL) - return NULL; - - result = hnj_hyphen_load_file(f); - - fclose(f); - return result; -} - -HyphenDict * -hnj_hyphen_load_file (FILE *f) -{ - HyphenDict *dict[2]; - HashTab *hashtab; - char buf[MAX_CHARS]; - char word[MAX_CHARS]; - char pattern[MAX_CHARS]; - char * repl; - signed char replindex; - signed char replcut; - int state_num = 0, last_state; - int i, j, k; - char ch; - int found; - HashEntry *e; - int nextlevel = 0; - -// loading one or two dictionaries (separated by NEXTLEVEL keyword) -for (k = 0; k == 0 || (k == 1 && nextlevel); k++) { - hashtab = hnj_hash_new (); -#ifdef VERBOSE - global = hashtab; -#endif - hnj_hash_insert (hashtab, "", 0); - dict[k] = hnj_malloc (sizeof(HyphenDict)); - dict[k]->num_states = 1; - dict[k]->states = hnj_malloc (sizeof(HyphenState)); - dict[k]->states[0].match = NULL; - dict[k]->states[0].repl = NULL; - dict[k]->states[0].fallback_state = -1; - dict[k]->states[0].num_trans = 0; - dict[k]->states[0].trans = NULL; - dict[k]->nextlevel = NULL; - dict[k]->lhmin = 0; - dict[k]->rhmin = 0; - dict[k]->clhmin = 0; - dict[k]->crhmin = 0; - - /* read in character set info */ - if (k == 0) { - for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0; - if (fgets(dict[k]->cset, sizeof(dict[k]->cset),f) != NULL) { - for (i=0;i<MAX_NAME;i++) - if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n')) - dict[k]->cset[i] = 0; - } else { - dict[k]->cset[0] = 0; - } - dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0); - } else { - strcpy(dict[k]->cset, dict[0]->cset); - dict[k]->utf8 = dict[0]->utf8; - } - - while (fgets (buf, sizeof(buf), f) != NULL) - { - if (buf[0] != '%') - { - if (strncmp(buf, "NEXTLEVEL", 9) == 0) { - nextlevel = 1; - break; - } else if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) { - dict[k]->lhmin = atoi(buf + 13); - continue; - } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) { - dict[k]->rhmin = atoi(buf + 14); - continue; - } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) { - dict[k]->clhmin = atoi(buf + 21); - continue; - } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) { - dict[k]->crhmin = atoi(buf + 22); - continue; - } - j = 0; - pattern[j] = '0'; - repl = strchr(buf, '/'); - replindex = 0; - replcut = 0; - if (repl) { - char * index = strchr(repl + 1, ','); - *repl = '\0'; - if (index) { - char * index2 = strchr(index + 1, ','); - *index = '\0'; - if (index2) { - *index2 = '\0'; - replindex = (signed char) atoi(index + 1) - 1; - replcut = (signed char) atoi(index2 + 1); - } - } else { - hnj_strchomp(repl + 1); - replindex = 0; - replcut = (signed char) strlen(buf); - } - repl = hnj_strdup(repl + 1); - } - for (i = 0; ((buf[i] > ' ') || (buf[i] < 0)); i++) - { - if (buf[i] >= '0' && buf[i] <= '9') - pattern[j] = buf[i]; - else - { - word[j] = buf[i]; - pattern[++j] = '0'; - } - } - word[j] = '\0'; - pattern[j + 1] = '\0'; - - i = 0; - if (!repl) { - /* Optimize away leading zeroes */ - for (; pattern[i] == '0'; i++); - } else { - if (*word == '.') i++; - /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */ - if (dict[k]->utf8) { - int pu = -1; /* unicode character position */ - int ps = -1; /* unicode start position (original replindex) */ - int pc = (*word == '.') ? 1: 0; /* 8-bit character position */ - for (; pc < (strlen(word) + 1); pc++) { - /* beginning of an UTF-8 character (not '10' start bits) */ - if ((((unsigned char) word[pc]) >> 6) != 2) pu++; - if ((ps < 0) && (replindex == pu)) { - ps = replindex; - replindex = (signed char) pc; - } - if ((ps >= 0) && ((pu - ps) == replcut)) { - replcut = (signed char) (pc - replindex); - break; - } - } - if (*word == '.') replindex--; - } - } - -#ifdef VERBOSE - printf ("word %s pattern %s, j = %d repl: %s\n", word, pattern + i, j, repl); -#endif - found = hnj_hash_lookup (hashtab, word); - state_num = hnj_get_state (dict[k], hashtab, word); - dict[k]->states[state_num].match = hnj_strdup (pattern + i); - dict[k]->states[state_num].repl = repl; - dict[k]->states[state_num].replindex = replindex; - if (!replcut) { - dict[k]->states[state_num].replcut = (signed char) strlen(word); - } else { - dict[k]->states[state_num].replcut = replcut; - } - - /* now, put in the prefix transitions */ - for (; found < 0 ;j--) - { - last_state = state_num; - ch = word[j - 1]; - word[j - 1] = '\0'; - found = hnj_hash_lookup (hashtab, word); - state_num = hnj_get_state (dict[k], hashtab, word); - hnj_add_trans (dict[k], state_num, last_state, ch); - } - } - } - - /* Could do unioning of matches here (instead of the preprocessor script). - If we did, the pseudocode would look something like this: - - foreach state in the hash table - foreach i = [1..length(state) - 1] - state to check is substr (state, i) - look it up - if found, and if there is a match, union the match in. - - It's also possible to avoid the quadratic blowup by doing the - search in order of increasing state string sizes - then you - can break the loop after finding the first match. - - This step should be optional in any case - if there is a - preprocessed rule table, it's always faster to use that. - -*/ - - /* put in the fallback states */ - for (i = 0; i < HASH_SIZE; i++) - for (e = hashtab->entries[i]; e; e = e->next) - { - if (*(e->key)) for (j = 1; 1; j++) - { - state_num = hnj_hash_lookup (hashtab, e->key + j); - if (state_num >= 0) - break; - } - /* KBH: FIXME state 0 fallback_state should always be -1? */ - if (e->val) - dict[k]->states[e->val].fallback_state = state_num; - } -#ifdef VERBOSE - for (i = 0; i < HASH_SIZE; i++) - for (e = hashtab->entries[i]; e; e = e->next) - { - printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val, - dict[k]->states[e->val].fallback_state); - for (j = 0; j < dict[k]->states[e->val].num_trans; j++) - printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch, - dict[k]->states[e->val].trans[j].new_state); - } -#endif - -#ifndef VERBOSE - hnj_hash_free (hashtab); -#endif - state_num = 0; -} - if (k == 2) dict[0]->nextlevel = dict[1]; - return dict[0]; -} - -void hnj_hyphen_free (HyphenDict *dict) -{ - int state_num; - HyphenState *hstate; - - for (state_num = 0; state_num < dict->num_states; state_num++) - { - hstate = &dict->states[state_num]; - if (hstate->match) - hnj_free (hstate->match); - if (hstate->repl) - hnj_free (hstate->repl); - if (hstate->trans) - hnj_free (hstate->trans); - } - if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel); - - hnj_free (dict->states); - - hnj_free (dict); -} - -#define MAX_WORD 256 - -int hnj_hyphen_hyphenate (HyphenDict *dict, - const char *word, int word_size, - char *hyphens) -{ - char prep_word_buf[MAX_WORD]; - char *prep_word; - int i, j, k; - int state; - char ch; - HyphenState *hstate; - char *match; - int offset; - - if (word_size + 3 < MAX_WORD) - prep_word = prep_word_buf; - else - prep_word = hnj_malloc (word_size + 3); - - j = 0; - prep_word[j++] = '.'; - - for (i = 0; i < word_size; i++) - prep_word[j++] = word[i]; - - prep_word[j++] = '.'; - prep_word[j] = '\0'; - - for (i = 0; i < word_size + 5; i++) - hyphens[i] = '0'; - -#ifdef VERBOSE - printf ("prep_word = %s\n", prep_word); -#endif - - /* now, run the finite state machine */ - state = 0; - for (i = 0; i < j; i++) - { - ch = prep_word[i]; - for (;;) - { - - if (state == -1) { - /* return 1; */ - /* KBH: FIXME shouldn't this be as follows? */ - state = 0; - goto try_next_letter; - } - -#ifdef VERBOSE - char *state_str; - state_str = get_state_str (state); - - for (k = 0; k < i - strlen (state_str); k++) - putchar (' '); - printf ("%s", state_str); -#endif - - hstate = &dict->states[state]; - for (k = 0; k < hstate->num_trans; k++) - if (hstate->trans[k].ch == ch) - { - state = hstate->trans[k].new_state; - goto found_state; - } - state = hstate->fallback_state; -#ifdef VERBOSE - printf (" falling back, fallback_state %d\n", state); -#endif - } - found_state: -#ifdef VERBOSE - printf ("found state %d\n",state); -#endif - /* Additional optimization is possible here - especially, - elimination of trailing zeroes from the match. Leading zeroes - have already been optimized. */ - match = dict->states[state].match; - /* replacing rules not handled by hyphen_hyphenate() */ - if (match && !dict->states[state].repl) - { - offset = i + 1 - strlen (match); -#ifdef VERBOSE - for (k = 0; k < offset; k++) - putchar (' '); - printf ("%s\n", match); -#endif - /* This is a linear search because I tried a binary search and - found it to be just a teeny bit slower. */ - for (k = 0; match[k]; k++) - if (hyphens[offset + k] < match[k]) - hyphens[offset + k] = match[k]; - } - - /* KBH: we need this to make sure we keep looking in a word */ - /* for patterns even if the current character is not known in state 0 */ - /* since patterns for hyphenation may occur anywhere in the word */ - try_next_letter: ; - - } -#ifdef VERBOSE - for (i = 0; i < j; i++) - putchar (hyphens[i]); - putchar ('\n'); -#endif - - for (i = 0; i < j - 4; i++) -#if 0 - if (hyphens[i + 1] & 1) - hyphens[i] = '-'; -#else - hyphens[i] = hyphens[i + 1]; -#endif - hyphens[0] = '0'; - for (; i < word_size; i++) - hyphens[i] = '0'; - hyphens[word_size] = '\0'; - - if (prep_word != prep_word_buf) - hnj_free (prep_word); - - return 0; -} - -/* Unicode ligature length */ -int hnj_ligature(unsigned char c) { - switch (c) { - case 0x80: /* ff */ - case 0x81: /* fi */ - case 0x82: return LIG_xx; /* fl */ - case 0x83: /* ffi */ - case 0x84: return LIG_xxx; /* ffl */ - case 0x85: /* long st */ - case 0x86: return LIG_xx; /* st */ - } - return 0; -} - -/* character length of the first n byte of the input word */ -int hnj_hyphen_strnlen(const char * word, int n, int utf8) -{ - int i = 0; - int j = 0; - while (j < n && word[j] != '\0') { - i++; - // Unicode ligature support - if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC)) { - i += hnj_ligature(word[j + 2]); - } - for (j++; utf8 && (word[j] & 0xc0) == 0x80; j++); - } - return i; -} - -int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens, - char *** rep, int ** pos, int ** cut, int lhmin) -{ - int i = 1, j; - - // Unicode ligature support - if (utf8 && ((unsigned char) word[0] == 0xEF) && ((unsigned char) word[1] == 0xAC)) { - i += hnj_ligature(word[2]); - } - - for (j = 0; i < lhmin && word[j] != '\0'; i++) do { - // check length of the non-standard part - if (*rep && *pos && *cut && (*rep)[j]) { - char * rh = strchr((*rep)[j], '='); - if (rh && (hnj_hyphen_strnlen(word, j - (*pos)[j] + 1, utf8) + - hnj_hyphen_strnlen((*rep)[j], rh - (*rep)[j], utf8)) < lhmin) { - free((*rep)[j]); - (*rep)[j] = NULL; - hyphens[j] = '0'; - } - } else { - hyphens[j] = '0'; - } - j++; - - // Unicode ligature support - if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC)) { - i += hnj_ligature(word[j + 2]); - } - } while (utf8 && (word[j] & 0xc0) == 0x80); - return 0; -} - -int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens, - char *** rep, int ** pos, int ** cut, int rhmin) -{ - int i; - int j = word_size - 2; - for (i = 1; i < rhmin && j > 0; j--) { - // check length of the non-standard part - if (*rep && *pos && *cut && (*rep)[j]) { - char * rh = strchr((*rep)[j], '='); - if (rh && (hnj_hyphen_strnlen(word + j - (*pos)[j] + (*cut)[j] + 1, 100, utf8) + - hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) { - free((*rep)[j]); - (*rep)[j] = NULL; - hyphens[j] = '0'; - } - } else { - hyphens[j] = '0'; - } - if (!utf8 || (word[j] & 0xc0) != 0xc0) i++; - } - return 0; -} - -// recursive function for compound level hyphenation -int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size, - char * hyphens, char *** rep, int ** pos, int ** cut, - int clhmin, int crhmin, int lend, int rend) -{ - char prep_word_buf[MAX_WORD]; - char *prep_word; - int i, j, k; - int state; - char ch; - HyphenState *hstate; - char *match; - char *repl; - signed char replindex; - signed char replcut; - int offset; - int matchlen_buf[MAX_CHARS]; - int matchindex_buf[MAX_CHARS]; - char * matchrepl_buf[MAX_CHARS]; - int * matchlen; - int * matchindex; - char ** matchrepl; - int isrepl = 0; - int nHyphCount; - - if (word_size + 3 < MAX_CHARS) { - prep_word = prep_word_buf; - matchlen = matchlen_buf; - matchindex = matchindex_buf; - matchrepl = matchrepl_buf; - } else { - prep_word = hnj_malloc (word_size + 3); - matchlen = hnj_malloc ((word_size + 3) * sizeof(int)); - matchindex = hnj_malloc ((word_size + 3) * sizeof(int)); - matchrepl = hnj_malloc ((word_size + 3) * sizeof(char *)); - } - - j = 0; - prep_word[j++] = '.'; - - for (i = 0; i < word_size; i++) - prep_word[j++] = word[i]; - - prep_word[j++] = '.'; - prep_word[j] = '\0'; - - for (i = 0; i < j; i++) - hyphens[i] = '0'; - -#ifdef VERBOSE - printf ("prep_word = %s\n", prep_word); -#endif - - /* now, run the finite state machine */ - state = 0; - for (i = 0; i < j; i++) - { - ch = prep_word[i]; - for (;;) - { - - if (state == -1) { - /* return 1; */ - /* KBH: FIXME shouldn't this be as follows? */ - state = 0; - goto try_next_letter; - } - -#ifdef VERBOSE - char *state_str; - state_str = get_state_str (state); - - for (k = 0; k < i - strlen (state_str); k++) - putchar (' '); - printf ("%s", state_str); -#endif - - hstate = &dict->states[state]; - for (k = 0; k < hstate->num_trans; k++) - if (hstate->trans[k].ch == ch) - { - state = hstate->trans[k].new_state; - goto found_state; - } - state = hstate->fallback_state; -#ifdef VERBOSE - printf (" falling back, fallback_state %d\n", state); -#endif - } - found_state: -#ifdef VERBOSE - printf ("found state %d\n",state); -#endif - /* Additional optimization is possible here - especially, - elimination of trailing zeroes from the match. Leading zeroes - have already been optimized. */ - match = dict->states[state].match; - repl = dict->states[state].repl; - replindex = dict->states[state].replindex; - replcut = dict->states[state].replcut; - /* replacing rules not handled by hyphen_hyphenate() */ - if (match) - { - offset = i + 1 - strlen (match); -#ifdef VERBOSE - for (k = 0; k < offset; k++) - putchar (' '); - printf ("%s (%s)\n", match, repl); -#endif - if (repl) { - if (!isrepl) for(; isrepl < word_size; isrepl++) { - matchrepl[isrepl] = NULL; - matchindex[isrepl] = -1; - } - matchlen[offset + replindex] = replcut; - } - /* This is a linear search because I tried a binary search and - found it to be just a teeny bit slower. */ - for (k = 0; match[k]; k++) { - if ((hyphens[offset + k] < match[k])) { - hyphens[offset + k] = match[k]; - if (match[k]&1) { - matchrepl[offset + k] = repl; - if (repl && (k >= replindex) && (k <= replindex + replcut)) { - matchindex[offset + replindex] = offset + k; - } - } - } - } - - } - - /* KBH: we need this to make sure we keep looking in a word */ - /* for patterns even if the current character is not known in state 0 */ - /* since patterns for hyphenation may occur anywhere in the word */ - try_next_letter: ; - - } -#ifdef VERBOSE - for (i = 0; i < j; i++) - putchar (hyphens[i]); - putchar ('\n'); -#endif - - for (i = 0; i < j - 3; i++) -#if 0 - if (hyphens[i + 1] & 1) - hyphens[i] = '-'; -#else - hyphens[i] = hyphens[i + 1]; -#endif - for (; i < word_size; i++) - hyphens[i] = '0'; - hyphens[word_size] = '\0'; - - /* now create a new char string showing hyphenation positions */ - /* count the hyphens and allocate space for the new hyphenated string */ - nHyphCount = 0; - for (i = 0; i < word_size; i++) - if (hyphens[i]&1) - nHyphCount++; - j = 0; - for (i = 0; i < word_size; i++) { - if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) { - if (rep && pos && cut) { - if (!*rep && !*pos && !*cut) { - int k; - *rep = (char **) malloc(sizeof(char *) * word_size); - *pos = (int *) malloc(sizeof(int) * word_size); - *cut = (int *) malloc(sizeof(int) * word_size); - for (k = 0; k < word_size; k++) { - (*rep)[k] = NULL; - (*pos)[k] = 0; - (*cut)[k] = 0; - } - } - (*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[i]]); - (*pos)[matchindex[i] - 1] = matchindex[i] - i; - (*cut)[matchindex[i] - 1] = matchlen[i]; - } - j += strlen(matchrepl[matchindex[i]]); - i += matchlen[i] - 1; - } - } - - if (matchrepl != matchrepl_buf) { - hnj_free (matchrepl); - hnj_free (matchlen); - hnj_free (matchindex); - } - - // recursive hyphenation of the first (compound) level segments - if (dict->nextlevel) { - char * rep2_buf[MAX_WORD]; - int pos2_buf[MAX_WORD]; - int cut2_buf[MAX_WORD]; - char hyphens2_buf[MAX_WORD]; - char ** rep2; - int * pos2; - int * cut2; - char * hyphens2; - int begin = 0; - if (word_size < MAX_CHARS) { - rep2 = rep2_buf; - pos2 = pos2_buf; - cut2 = cut2_buf; - hyphens2 = hyphens2_buf; - } else { - rep2 = hnj_malloc (word_size * sizeof(char *)); - pos2 = hnj_malloc (word_size * sizeof(int)); - cut2 = hnj_malloc (word_size * sizeof(int)); - hyphens2 = hnj_malloc (word_size); - } - for (i = 0; i < word_size; i++) rep2[i] = NULL; - for (i = 0; i < word_size; i++) if - (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) { - if (i - begin > 1) { - int hyph = 0; - prep_word[i + 2] = '\0'; - /* non-standard hyphenation at compound boundary (Schiffahrt) */ - if (*rep && *pos && *cut && (*rep)[i]) { - char * l = strchr((*rep)[i], '='); - strcpy(prep_word + 2 + i - (*pos)[i], (*rep)[i]); - if (l) { - hyph = (l - (*rep)[i]) - (*pos)[i]; - prep_word[2 + i + hyph] = '\0'; - } - } - hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph, - hyphens2, &rep2, &pos2, &cut2, clhmin, - crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend)); - for (j = 0; j < i - begin - 1; j++) { - hyphens[begin + j] = hyphens2[j]; - if (rep2[j] && rep && pos && cut) { - if (!*rep && !*pos && !*cut) { - int k; - *rep = (char **) malloc(sizeof(char *) * word_size); - *pos = (int *) malloc(sizeof(int) * word_size); - *cut = (int *) malloc(sizeof(int) * word_size); - for (k = 0; k < word_size; k++) { - (*rep)[k] = NULL; - (*pos)[k] = 0; - (*cut)[k] = 0; - } - } - (*rep)[begin + j] = rep2[j]; - (*pos)[begin + j] = pos2[j]; - (*cut)[begin + j] = cut2[j]; - } - } - prep_word[i + 2] = word[i + 1]; - if (*rep && *pos && *cut && (*rep)[i]) { - strcpy(prep_word + 1, word); - } - } - begin = i + 1; - for (j = 0; j < word_size; j++) rep2[j] = NULL; - } - - // non-compound - if (begin == 0) { - hnj_hyphen_hyph_(dict->nextlevel, word, word_size, - hyphens, rep, pos, cut, clhmin, crhmin, lend, rend); - if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens, - rep, pos, cut, clhmin); - if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens, - rep, pos, cut, crhmin); - } - - if (rep2 != rep2_buf) { - free(rep2); - free(cut2); - free(pos2); - free(hyphens2); - } - } - - if (prep_word != prep_word_buf) hnj_free (prep_word); - return 0; -} - -/* UTF-8 normalization of hyphen and non-standard positions */ -int hnj_hyphen_norm(const char *word, int word_size, char * hyphens, - char *** rep, int ** pos, int ** cut) -{ - int i, j, k; - if ((((unsigned char) word[0]) >> 6) == 2) { - fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word); - return 1; - } - - /* calculate UTF-8 character positions */ - for (i = 0, j = -1; i < word_size; i++) { - /* beginning of an UTF-8 character (not '10' start bits) */ - if ((((unsigned char) word[i]) >> 6) != 2) j++; - hyphens[j] = hyphens[i]; - if (rep && pos && cut && *rep && *pos && *cut) { - int l = (*pos)[i]; - (*pos)[j] = 0; - for (k = 0; k < l; k++) { - if ((((unsigned char) word[i - k]) >> 6) != 2) (*pos)[j]++; - } - k = i - l + 1; - l = k + (*cut)[i]; - (*cut)[j] = 0; - for (; k < l; k++) { - if ((((unsigned char) word[k]) >> 6) != 2) (*cut)[j]++; - } - (*rep)[j] = (*rep)[i]; - if (j < i) { - (*rep)[i] = NULL; - (*pos)[i] = 0; - (*cut)[i] = 0; - } - } - } - hyphens[j + 1] = '\0'; - return 0; -} - -/* get the word with all possible hyphenations (output: hyphword) */ -void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens, - char * hyphword, char *** rep, int ** pos, int ** cut) -{ - int i, j; - for (i = 0, j = 0; i < l; i++, j++) { - if (hyphens[i]&1) { - hyphword[j] = word[i]; - if (*rep && *pos && *cut && (*rep)[i]) { - strcpy(hyphword + j - (*pos)[i] + 1, (*rep)[i]); - j += strlen((*rep)[i]) - (*pos)[i]; - i += (*cut)[i] - (*pos)[i]; - } else hyphword[++j] = '='; - } else hyphword[j] = word[i]; - } - hyphword[j] = '\0'; -} - - -/* main api function with default hyphenmin parameters */ -int hnj_hyphen_hyphenate2 (HyphenDict *dict, - const char *word, int word_size, char * hyphens, - char *hyphword, char *** rep, int ** pos, int ** cut) -{ - hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut, - dict->clhmin, dict->crhmin, 1, 1); - hnj_hyphen_lhmin(dict->utf8, word, word_size, - hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2)); - hnj_hyphen_rhmin(dict->utf8, word, word_size, - hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2)); - if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut); - if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut); - return 0; -} - -/* previous main api function with hyphenmin parameters */ -int hnj_hyphen_hyphenate3 (HyphenDict *dict, - const char *word, int word_size, char * hyphens, - char *hyphword, char *** rep, int ** pos, int ** cut, - int lhmin, int rhmin, int clhmin, int crhmin) -{ - lhmin = (lhmin > 0 ? lhmin : dict->lhmin); - rhmin = (rhmin > 0 ? rhmin : dict->rhmin); - hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut, - clhmin, crhmin, 1, 1); - hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens, - rep, pos, cut, (lhmin > 0 ? lhmin : 2)); - hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens, - rep, pos, cut, (rhmin > 0 ? rhmin : 2)); - if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut); - if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut); - return 0; -} |