diff options
Diffstat (limited to 'gettext-tools/src/sentence.c')
-rw-r--r-- | gettext-tools/src/sentence.c | 194 |
1 files changed, 194 insertions, 0 deletions
diff --git a/gettext-tools/src/sentence.c b/gettext-tools/src/sentence.c new file mode 100644 index 0000000..a5ae35e --- /dev/null +++ b/gettext-tools/src/sentence.c @@ -0,0 +1,194 @@ +/* Sentence handling. + Copyright (C) 2015 Free Software Foundation, Inc. + Written by Daiki Ueno <ueno@gnu.org>, 2015. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#ifdef HAVE_CONFIG_H +# include <config.h> +#endif + +/* Specification. */ +#include "sentence.h" + +#include <stdlib.h> +#include <string.h> +#include "unistr.h" + + +/* The minimal number of white spaces which should follow after the + end of sentence. */ +int sentence_end_required_spaces = 1; + +/* This function works in a similar way to 'forward-sentence' in + Emacs, which basically does a regular expression matching of: + + [.?!\u2026] + []"'\u201d)}]* + \($\|[ \u00a0]$\|\t\|[ \u00a0]\{REQUIRED_SPACES\}\) + + Since we are lacking a regular expression routine capable of + Unicode (though gnulib-lib/lib/regex.c provides locale-dependent + version, we would rather avoid depending on wchar_t), apply a + manually constructed DFA, which consists of 8 states where 4 of + them are a terminal. */ +const char * +sentence_end (const char *string, ucs4_t *ending_charp) +{ + const char *str = string; + const char *str_limit = string + strlen (str); + /* States of the DFA, 0 to 7, where 3, 5, 6, and 7 are a terminal. */ + int state = 0; + /* Previous character before an end marker. */ + ucs4_t ending_char = 0xfffd; + /* Possible starting position of the match, and the next starting + position if the current match fails. */ + const char *match_start, *match_next; + /* Number of spaces. */ + int spaces; + + while (str <= str_limit) + { + ucs4_t uc; + size_t length; + + length = u8_mbtouc (&uc, (const unsigned char *) str, str_limit - str); + + if (state == 0) + { + switch (uc) + { + case '.': case '?': case '!': case 0x2026: + state = 1; + match_start = str; + match_next = str + length; + ending_char = uc; + spaces = 0; + break; + + default: + break; + } + + str += length; + continue; + } + + if (state == 1) + { + switch (uc) + { + case ']': case '"': case '\'': case ')': case '}': case 0x201d: + state = 2; + break; + + case '\0': case '\n': + /* State 3. */ + *ending_charp = ending_char; + return match_start; + + case ' ': case 0x00a0: + if (++spaces == sentence_end_required_spaces) + { + /* State 7. */ + *ending_charp = ending_char; + return match_start; + } + state = 4; + break; + + case '\t': + /* State 5. */ + *ending_charp = ending_char; + return match_start; + + default: + str = match_next; + state = 0; + continue; + } + + str += length; + continue; + } + + if (state == 2) + { + switch (uc) + { + case ']': case '"': case '\'': case ')': case '}': case 0x201d: + break; + + case '\0': case '\n': + /* State 3. */ + *ending_charp = ending_char; + return match_start; + + case ' ': case 0x00a0: + if (++spaces == sentence_end_required_spaces) + { + /* State 7. */ + *ending_charp = ending_char; + return match_start; + } + state = 4; + break; + + case '\t': + /* State 5. */ + *ending_charp = ending_char; + return match_start; + + default: + state = 0; + str = match_next; + continue; + } + + str += length; + continue; + } + + if (state == 4) + { + switch (uc) + { + case '\0': case '\n': + /* State 6. */ + *ending_charp = ending_char; + return match_start; + + case ' ': case 0x00a0: + if (++spaces == sentence_end_required_spaces) + { + /* State 7. */ + *ending_charp = ending_char; + return match_start; + } + break; + + default: + state = 0; + str = match_next; + continue; + } + + str += length; + continue; + } + } + + *ending_charp = 0xfffd; + return str_limit; +} |