diff options
author | Daiki Ueno <ueno@gnu.org> | 2015-12-09 17:35:34 +0900 |
---|---|---|
committer | Daiki Ueno <ueno@gnu.org> | 2015-12-09 19:07:06 +0900 |
commit | 898e184a596c43abf1067089a03df3e79b4e4527 (patch) | |
tree | e9f5596bb75f8a0ba47f9b34d26346f53d981613 /gnulib-local | |
parent | f6dde6baeef8e6cb5ec92bc6c67c5c0304ba4396 (diff) | |
download | external_gettext-898e184a596c43abf1067089a03df3e79b4e4527.zip external_gettext-898e184a596c43abf1067089a03df3e79b4e4527.tar.gz external_gettext-898e184a596c43abf1067089a03df3e79b4e4527.tar.bz2 |
build: Remove expat dependency
* DEPENDENCIES: Suggest libxml2 instead of expat.
* gnulib-local/lib/markup.c: New file.
* gnulib-local/lib/markup.h: New file.
* gnulib-local/modules/markup: New file.
* autogen.sh (GNULIB_MODULES_LIBGETTEXTPO): Add markup module.
* gettext-tools/configure.ac: Remove checks for expat.
* gettext-tools/gnulib-lib/.gitignore: Ignore modules pulled by
gnulib-tool due to the markup module usage.
* gettext-tools/gnulib-tests/.gitignore: Likewise.
* gettext-tools/libgettextpo/.gitignore: Likewise.
* gettext-tools/libgettextpo/Makefile.am (libgettextpo_la_AUXSOURCES):
Remove ../src/libexpat-compat.c.
(libgettextpo_la_LDFLAGS): Remove @LTLIBEXPAT@.
* gettext-tools/src/Makefile.am (noinst_HEADERS): Remove
libexpat-compat.h.
(libgettextsrc_la_SOURCES): Remove libexpat-compat.c.
(libgettextsrc_la_LDFLAGS): Remove @LTLIBEXPAT@.
* gettext-tools/src/format-kde-kuit.c: Use functions from markup.h, when
the file is being compiled as part of libgettextpo. Otherwise use
libxml2.
* gettext-tools/src/libexpat-compat.c: Remove.
* gettext-tools/src/libexpat-compat.h: Remove.
Diffstat (limited to 'gnulib-local')
-rw-r--r-- | gnulib-local/lib/markup.c | 1523 | ||||
-rw-r--r-- | gnulib-local/lib/markup.h | 164 | ||||
-rw-r--r-- | gnulib-local/modules/markup | 31 |
3 files changed, 1718 insertions, 0 deletions
diff --git a/gnulib-local/lib/markup.c b/gnulib-local/lib/markup.c new file mode 100644 index 0000000..a0f6856 --- /dev/null +++ b/gnulib-local/lib/markup.c @@ -0,0 +1,1523 @@ +/* markup.c -- simple XML-like parser + Copyright (C) 2015 Free Software Foundation, Inc. + + This file is not part of the GNU gettext program, but is used with + GNU gettext. + + This is a stripped down version of GLib's gmarkup.c. The original + copyright notice is as follows: +*/ + +/* gmarkup.c - Simple XML-like parser + * + * Copyright 2000, 2003 Red Hat, Inc. + * Copyright 2007, 2008 Ryan Lortie <desrt@desrt.ca> + * + * GLib is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 3 of the + * License, or (at your option) any later version. + * + * GLib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with GLib; see the file COPYING.LIB. If not, + * see <http://www.gnu.org/licenses/>. + */ + +#include "config.h" + +#include <assert.h> +#include <stdarg.h> +#include <string.h> +#include <stdio.h> +#include <stdlib.h> +#include <errno.h> + +/* Specification */ +#include "markup.h" + +#include "c-ctype.h" +#include "gettext.h" +#include "gl_linked_list.h" +#include "gl_xlist.h" +#include "unictype.h" +#include "unistr.h" +#include "xalloc.h" +#include "xvasprintf.h" + +#define _(s) gettext(s) + +/** + * The "markup" parser is intended to parse a simple markup format + * that's a subset of XML. This is a small, efficient, easy-to-use + * parser. It should not be used if you expect to interoperate with + * other applications generating full-scale XML. 
However, it's very + * useful for application data files, config files, etc. where you + * know your application will be the only one writing the file. + * Full-scale XML parsers should be able to parse the subset used by + * markup, so you can easily migrate to full-scale XML at a later + * time if the need arises. + * + * The parser is not guaranteed to signal an error on all invalid XML; + * the parser may accept documents that an XML parser would not. + * However, XML documents which are not well-formed (which is a weaker + * condition than being valid. See the XML specification + * <http://www.w3.org/TR/REC-xml/> for definitions of these terms.) + * are not considered valid GMarkup documents. + * + * Simplifications to XML include: + * + * - Only UTF-8 encoding is allowed + * + * - No user-defined entities + * + * - Processing instructions, comments and the doctype declaration + * are "passed through" but are not interpreted in any way + * + * - No DTD or validation + * + * The markup format does support: + * + * - Elements + * + * - Attributes + * + * - 5 standard entities: & < > " ' + * + * - Character references + * + * - Sections marked as CDATA + */ + +typedef enum +{ + STATE_START, + STATE_AFTER_OPEN_ANGLE, + STATE_AFTER_CLOSE_ANGLE, + STATE_AFTER_ELISION_SLASH, /* the slash that obviates need for end element */ + STATE_INSIDE_OPEN_TAG_NAME, + STATE_INSIDE_ATTRIBUTE_NAME, + STATE_AFTER_ATTRIBUTE_NAME, + STATE_BETWEEN_ATTRIBUTES, + STATE_AFTER_ATTRIBUTE_EQUALS_SIGN, + STATE_INSIDE_ATTRIBUTE_VALUE_SQ, + STATE_INSIDE_ATTRIBUTE_VALUE_DQ, + STATE_INSIDE_TEXT, + STATE_AFTER_CLOSE_TAG_SLASH, + STATE_INSIDE_CLOSE_TAG_NAME, + STATE_AFTER_CLOSE_TAG_NAME, + STATE_INSIDE_PASSTHROUGH, + STATE_ERROR +} markup_parse_state_ty; + +typedef struct +{ + const char *prev_element; + const markup_parser_ty *prev_parser; + void *prev_user_data; +} markup_recursion_tracker_ty; + +typedef struct +{ + char *buffer; + size_t bufmax; + size_t buflen; +} markup_string_ty; + +struct 
_markup_parse_context_ty +{ + const markup_parser_ty *parser; + + markup_parse_flags_ty flags; + + int line_number; + int char_number; + + markup_parse_state_ty state; + + void *user_data; + + /* A piece of character data or an element that + * hasn't "ended" yet so we haven't yet called + * the callback for it. + */ + markup_string_ty *partial_chunk; + + gl_list_t tag_stack; /* <markup_string_ty> */ + + char **attr_names; + char **attr_values; + int cur_attr; + int alloc_attrs; + + const char *current_text; + ssize_t current_text_len; + const char *current_text_end; + + /* used to save the start of the last interesting thingy */ + const char *start; + + const char *iter; + + char *error_text; + + unsigned int document_empty : 1; + unsigned int parsing : 1; + unsigned int awaiting_pop : 1; + int balance; + + /* subparser support */ + gl_list_t subparser_stack; /* <markup_recursion_tracker_ty *> */ + const char *subparser_element; +}; + +/* Allocates an empty string. The buffer is always allocated and + NUL-terminated, so that an empty chunk (for example an empty + attribute value) is handed out as "" rather than NULL; a NULL + attribute value would prematurely terminate the NULL-terminated + attr_values array. */ +static markup_string_ty * +markup_string_new (void) +{ + markup_string_ty *string = XZALLOC (markup_string_ty); + + string->bufmax = 1; + string->buffer = xmalloc (string->bufmax); + string->buffer[0] = '\0'; + return string; +} + +/* Frees STRING. If FREE_SEGMENT is true the character buffer is + freed too and NULL is returned; otherwise the buffer is returned + and the caller becomes responsible for freeing it. */ +static char * +markup_string_free (markup_string_ty *string, bool free_segment) +{ + if (free_segment) + { + free (string->buffer); + free (string); + return NULL; + } + else + { + char *result = string->buffer; + free (string); + return result; + } +} + +/* Element dispose function for the tag stack: frees STRING together + with its buffer. */ +static void +markup_string_free1 (markup_string_ty *string) +{ + markup_string_free (string, true); +} + +/* Shortens STRING to LENGTH bytes. LENGTH must not exceed the + current length; shrinking by a single byte (as happens when "\r\n" + is normalized) is valid. */ +static void +markup_string_truncate (markup_string_ty *string, size_t length) +{ + assert (string && string->buffer && length <= string->buflen); + string->buffer[length] = '\0'; + string->buflen = length; +} + +/* Appends the LENGTH bytes at TO_APPEND to the end of STRING, growing + the buffer as needed and keeping it NUL-terminated. */ +static void +markup_string_append (markup_string_ty *string, const char *to_append, + size_t length) +{ + if (string->buflen + length + 1 > string->bufmax) + { + string->bufmax *= 2; + if (string->buflen + length + 1 > string->bufmax) + string->bufmax = string->buflen + length + 1; + string->buffer = xrealloc (string->buffer, string->bufmax); + } + memcpy (string->buffer + string->buflen, 
to_append, length); + /* The new bytes follow the BUFLEN bytes already stored, so the + length grows by LENGTH and the terminator goes at the new end. */ + string->buflen += length; + string->buffer[string->buflen] = '\0'; +} + +static inline void +string_blank (markup_string_ty *string) +{ + if (string->bufmax > 0) + { + *string->buffer = '\0'; + string->buflen = 0; + } +} + +/* Creates a new parse context. A parse context is used to parse + marked-up documents. You can feed any number of documents into a + context, as long as no errors occur; once an error occurs, the + parse context can't continue to parse text (you have to free it and + create a new parse context). */ +markup_parse_context_ty * +markup_parse_context_new (const markup_parser_ty *parser, + markup_parse_flags_ty flags, + void *user_data) +{ + markup_parse_context_ty *context; + + assert (parser != NULL); + + context = XMALLOC (markup_parse_context_ty); + + context->parser = parser; + context->flags = flags; + context->user_data = user_data; + + context->line_number = 1; + context->char_number = 1; + + context->partial_chunk = NULL; + + context->state = STATE_START; + context->tag_stack = + gl_list_create_empty (GL_LINKED_LIST, + NULL, NULL, + (gl_listelement_dispose_fn) markup_string_free1, + true); + context->attr_names = NULL; + context->attr_values = NULL; + context->cur_attr = -1; + context->alloc_attrs = 0; + + context->current_text = NULL; + context->current_text_len = -1; + context->current_text_end = NULL; + + context->start = NULL; + context->iter = NULL; + + context->error_text = NULL; + + context->document_empty = true; + context->parsing = false; + + context->awaiting_pop = false; + context->subparser_stack = + gl_list_create_empty (GL_LINKED_LIST, + NULL, NULL, + (gl_listelement_dispose_fn) free, + true); + context->subparser_element = NULL; + + context->balance = 0; + + return context; +} + +static void clear_attributes (markup_parse_context_ty *context); + +/* Frees a parse context. This function can't be called from inside + one of the markup_parser_ty functions or while a subparser is + pushed. 
*/ +void +markup_parse_context_free (markup_parse_context_ty *context) +{ + assert (context != NULL); + assert (!context->parsing); + assert (gl_list_size (context->subparser_stack) == 0); + assert (!context->awaiting_pop); + + clear_attributes (context); + free (context->attr_names); + free (context->attr_values); + + gl_list_free (context->tag_stack); + gl_list_free (context->subparser_stack); + + if (context->partial_chunk) + markup_string_free (context->partial_chunk, true); + + free (context->error_text); + + free (context); +} + +static void pop_subparser_stack (markup_parse_context_ty *context); + +static void +emit_error (markup_parse_context_ty *context, const char *error_text) +{ + context->state = STATE_ERROR; + + if (context->parser->error) + (*context->parser->error) (context, error_text, context->user_data); + + /* report the error all the way up to free all the user-data */ + while (gl_list_size (context->subparser_stack) > 0) + { + pop_subparser_stack (context); + context->awaiting_pop = false; /* already been freed */ + + if (context->parser->error) + (*context->parser->error) (context, error_text, context->user_data); + } + + if (context->error_text) + free (context->error_text); + context->error_text = xstrdup (error_text); +} + +#define IS_COMMON_NAME_END_CHAR(c) \ + ((c) == '=' || (c) == '/' || (c) == '>' || (c) == ' ') + +static bool +slow_name_validate (markup_parse_context_ty *context, const char *name) +{ + const char *p = name; + ucs4_t uc; + + if (u8_check ((uint8_t *) name, strlen (name)) != NULL) + { + emit_error (context, _("invalid UTF-8 sequence")); + return false; + } + + if (!(c_isalpha (*p) + || (!IS_COMMON_NAME_END_CHAR (*p) + && (*p == '_' + || *p == ':' + || (u8_mbtouc (&uc, (uint8_t *) name, strlen (name)) > 0 + && uc_is_alpha (uc)))))) + { + char *error_text = xasprintf (_("'%s' is not a valid name"), name); + emit_error (context, error_text); + free (error_text); + return false; + } + + for (p = (char *) u8_next (&uc, 
(uint8_t *) name); + p != NULL; + p = (char *) u8_next (&uc, (uint8_t *) p)) + { + /* is_name_char */ + if (!(c_isalnum (*p) || + (!IS_COMMON_NAME_END_CHAR (*p) && + (*p == '.' || + *p == '-' || + *p == '_' || + *p == ':' || + uc_is_alpha (uc))))) + { + char *error_text = xasprintf (_("'%s' is not a valid name: '%c'"), + name, *p); + emit_error (context, error_text); + free (error_text); + return false; + } + } + return true; +} + +/* + * Use me for elements, attributes etc. + */ +static bool +name_validate (markup_parse_context_ty *context, const char *name) +{ + char mask; + const char *p; + + /* name start char */ + p = name; + if (IS_COMMON_NAME_END_CHAR (*p) + || !(c_isalpha (*p) || *p == '_' || *p == ':')) + goto slow_validate; + + for (mask = *p++; *p != '\0'; p++) + { + mask |= *p; + + /* is_name_char */ + if (!(c_isalnum (*p) + || (!IS_COMMON_NAME_END_CHAR (*p) + && (*p == '.' || *p == '-' || *p == '_' || *p == ':')))) + goto slow_validate; + } + + if (mask & 0x80) /* un-common / non-ascii */ + goto slow_validate; + + return true; + + slow_validate: + return slow_name_validate (context, name); +} + +static bool +text_validate (markup_parse_context_ty *context, + const char *p, + int len) +{ + if (u8_check ((const uint8_t *) p, len) != NULL) + { + emit_error (context, _("invalid UTF-8 sequence")); + return false; + } + else + return true; +} + +/* + * re-write the GString in-place, unescaping anything that escaped. + * most XML does not contain entities, or escaping. + */ +static bool +unescape_string_inplace (markup_parse_context_ty *context, + markup_string_ty *string, + bool *is_ascii) +{ + char mask, *to; + const char *from; + bool normalize_attribute; + + if (string->buflen == 0) + return true; + + *is_ascii = false; + + /* are we unescaping an attribute or not ? 
*/ + if (context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ + || context->state == STATE_INSIDE_ATTRIBUTE_VALUE_DQ) + normalize_attribute = true; + else + normalize_attribute = false; + + /* + * Meeks' theorem: unescaping can only shrink text. + * for < etc. this is obvious, for  more + * thought is required, but this is patently so. + */ + mask = 0; + for (from = to = string->buffer; *from != '\0'; from++, to++) + { + *to = *from; + + mask |= *to; + if (normalize_attribute && (*to == '\t' || *to == '\n')) + *to = ' '; + if (*to == '\r') + { + *to = normalize_attribute ? ' ' : '\n'; + if (from[1] == '\n') + from++; + } + if (*from == '&') + { + from++; + if (*from == '#') + { + int base = 10; + unsigned long l; + char *end = NULL; + + from++; + + if (*from == 'x') + { + base = 16; + from++; + } + + errno = 0; + l = strtoul (from, &end, base); + + if (end == from || errno != 0) + { + emit_error (context, + _("out of range when resolving character ref")); + return false; + } + else if (*end != ';') + { + emit_error (context, + _("character reference does not end with a ';'")); + return false; + } + else + { + /* characters XML 1.1 permits */ + if ((0 < l && l <= 0xD7FF) || + (0xE000 <= l && l <= 0xFFFD) || + (0x10000 <= l && l <= 0x10FFFF)) + { + char buf[8]; + int length; + length = u8_uctomb ((uint8_t *) buf, l, 8); + memcpy (to, buf, length); + to += length - 1; + from = end; + if (l >= 0x80) /* not ascii */ + mask |= 0x80; + } + else + { + emit_error (context, _("invalid character reference")); + return false; + } + } + } + + else if (strncmp (from, "lt;", 3) == 0) + { + *to = '<'; + from += 2; + } + else if (strncmp (from, "gt;", 3) == 0) + { + *to = '>'; + from += 2; + } + else if (strncmp (from, "amp;", 4) == 0) + { + *to = '&'; + from += 3; + } + else if (strncmp (from, "quot;", 5) == 0) + { + *to = '"'; + from += 4; + } + else if (strncmp (from, "apos;", 5) == 0) + { + *to = '\''; + from += 4; + } + else + { + if (*from == ';') + emit_error (context, _("empty 
entity '&;'")); + else + { + const char *end = strchr (from, ';'); + if (end) + emit_error (context, _("unknown entity name")); + else + emit_error (context, _("entity does not end with a ';'")); + } + return false; + } + } + } + + assert (to - string->buffer <= string->buflen); + if (to - string->buffer != string->buflen) + markup_string_truncate (string, to - string->buffer); + + *is_ascii = !(mask & 0x80); + + return true; +} + +static inline bool +advance_char (markup_parse_context_ty *context) +{ + context->iter++; + context->char_number++; + + if (context->iter == context->current_text_end) + return false; + + else if (*context->iter == '\n') + { + context->line_number++; + context->char_number = 1; + } + + return true; +} + +static inline bool +xml_isspace (char c) +{ + return c == ' ' || c == '\t' || c == '\n' || c == '\r'; +} + +static void +skip_spaces (markup_parse_context_ty *context) +{ + do + { + if (!xml_isspace (*context->iter)) + return; + } + while (advance_char (context)); +} + +static void +advance_to_name_end (markup_parse_context_ty *context) +{ + do + { + if (IS_COMMON_NAME_END_CHAR (*(context->iter))) + return; + if (xml_isspace (*(context->iter))) + return; + } + while (advance_char (context)); +} + +static void +add_to_partial (markup_parse_context_ty *context, + const char *text_start, + const char *text_end) +{ + if (context->partial_chunk == NULL) + { /* allocate a new chunk to parse into */ + + context->partial_chunk = markup_string_new (); + } + + if (text_start != text_end) + markup_string_append (context->partial_chunk, + text_start, text_end - text_start); +} + +static inline void +truncate_partial (markup_parse_context_ty *context) +{ + if (context->partial_chunk != NULL) + string_blank (context->partial_chunk); +} + +static inline const char* +current_element (markup_parse_context_ty *context) +{ + const markup_string_ty *string = gl_list_get_at (context->tag_stack, 0); + return string->buffer; +} + +static void 
+pop_subparser_stack (markup_parse_context_ty *context) +{ + markup_recursion_tracker_ty *tracker; + + assert (gl_list_size (context->subparser_stack) > 0); + + tracker = (markup_recursion_tracker_ty *) gl_list_get_at (context->subparser_stack, 0); + + context->awaiting_pop = true; + + /* Restore the parser state that was saved when the subparser was + pushed. */ + context->user_data = tracker->prev_user_data; + context->parser = tracker->prev_parser; + context->subparser_element = tracker->prev_element; + + /* Removing the element also releases TRACKER: the subparser stack + is created with free() as its element dispose function, so an + explicit free() here would release the tracker twice. */ + gl_list_remove_at (context->subparser_stack, 0); +} + +/* Pushes the partial chunk (the tag name just read) onto the tag + stack; the stack takes ownership of it. */ +static void +push_partial_as_tag (markup_parse_context_ty *context) +{ + gl_list_add_first (context->tag_stack, context->partial_chunk); + context->partial_chunk = NULL; +} + +/* Removes the innermost open tag from the tag stack. */ +static void +pop_tag (markup_parse_context_ty *context) +{ + gl_list_remove_at (context->tag_stack, 0); +} + +/* Pops the subparser if the current element is the very element + (compared by pointer) on which the subparser was pushed. */ +static void +possibly_finish_subparser (markup_parse_context_ty *context) +{ + if (current_element (context) == context->subparser_element) + pop_subparser_stack (context); +} + +static void +ensure_no_outstanding_subparser (markup_parse_context_ty *context) +{ + context->awaiting_pop = false; +} + +/* Starts a new attribute named by STRING; its value stays NULL until + it has been parsed. Both arrays are kept NULL-terminated. */ +static void +add_attribute (markup_parse_context_ty *context, markup_string_ty *string) +{ + if (context->cur_attr + 2 >= context->alloc_attrs) + { + context->alloc_attrs += 5; /* silly magic number */ + context->attr_names = xrealloc (context->attr_names, sizeof (char *) * context->alloc_attrs); + context->attr_values = xrealloc (context->attr_values, sizeof(char *) * context->alloc_attrs); + } + context->cur_attr++; + context->attr_names[context->cur_attr] = xstrdup (string->buffer); + context->attr_values[context->cur_attr] = NULL; + context->attr_names[context->cur_attr+1] = NULL; + context->attr_values[context->cur_attr+1] = NULL; +} + +static void +clear_attributes (markup_parse_context_ty *context) +{ + /* Go ahead and free the attributes. 
*/ + for (; context->cur_attr >= 0; context->cur_attr--) + { + int pos = context->cur_attr; + free (context->attr_names[pos]); + free (context->attr_values[pos]); + context->attr_names[pos] = context->attr_values[pos] = NULL; + } + assert (context->cur_attr == -1); + assert (context->attr_names == NULL || + context->attr_names[0] == NULL); + assert (context->attr_values == NULL || + context->attr_values[0] == NULL); +} + +static void +markup_parse_context_push (markup_parse_context_ty *context, + const markup_parser_ty *parser, + void *user_data) +{ + markup_recursion_tracker_ty *tracker; + + tracker = XMALLOC (markup_recursion_tracker_ty); + tracker->prev_element = context->subparser_element; + tracker->prev_parser = context->parser; + tracker->prev_user_data = context->user_data; + + context->subparser_element = current_element (context); + context->parser = parser; + context->user_data = user_data; + + gl_list_add_first (context->subparser_stack, tracker); +} + +static void +markup_parse_context_pop (markup_parse_context_ty *context) +{ + if (!context->awaiting_pop) + possibly_finish_subparser (context); + + assert (context->awaiting_pop); + + context->awaiting_pop = false; +} + +/* This has to be a separate function to ensure the alloca's + * are unwound on exit - otherwise we grow & blow the stack + * with large documents + */ +static inline void +emit_start_element (markup_parse_context_ty *context) +{ + int i, j = 0; + const char *start_name; + const char **attr_names; + const char **attr_values; + + /* In case we want to ignore qualified tags and we see that we have + * one here, we push a subparser. This will ignore all tags inside of + * the qualified tag. + * + * We deal with the end of the subparser from emit_end_element. 
+ */ + if ((context->flags & MARKUP_IGNORE_QUALIFIED) + && strchr (current_element (context), ':')) + { + static const markup_parser_ty ignore_parser; + markup_parse_context_push (context, &ignore_parser, NULL); + clear_attributes (context); + return; + } + + attr_names = XCALLOC (context->cur_attr + 2, const char *); + attr_values = XCALLOC (context->cur_attr + 2, const char *); + for (i = 0; i < context->cur_attr + 1; i++) + { + /* Possibly omit qualified attribute names from the list */ + if ((context->flags & MARKUP_IGNORE_QUALIFIED) + && strchr (context->attr_names[i], ':')) + continue; + + attr_names[j] = context->attr_names[i]; + attr_values[j] = context->attr_values[i]; + j++; + } + attr_names[j] = NULL; + attr_values[j] = NULL; + + /* Call user callback for element start */ + start_name = current_element (context); + + if (context->parser->start_element && name_validate (context, start_name)) + (* context->parser->start_element) (context, + start_name, + (const char **)attr_names, + (const char **)attr_values, + context->user_data); + free (attr_names); + free (attr_values); + clear_attributes (context); +} + +static void +emit_end_element (markup_parse_context_ty *context) +{ + assert (gl_list_size (context->tag_stack) != 0); + + possibly_finish_subparser (context); + + /* We might have just returned from our ignore subparser */ + if ((context->flags & MARKUP_IGNORE_QUALIFIED) + && strchr (current_element (context), ':')) + { + markup_parse_context_pop (context); + pop_tag (context); + return; + } + + if (context->parser->end_element) + (* context->parser->end_element) (context, + current_element (context), + context->user_data); + + ensure_no_outstanding_subparser (context); + + pop_tag (context); +} + +/* Feed some data to the parse context. The data need not be valid + UTF-8; an error will be signaled if it's invalid. The data need + not be an entire document; you can feed a document into the parser + incrementally, via multiple calls to this function. 
Typically, as + you receive data from a network connection or file, you feed each + received chunk of data into this function, aborting the process if + an error occurs. Once an error is reported, no further data may be + fed to the parse context; all errors are fatal. */ +bool +markup_parse_context_parse (markup_parse_context_ty *context, + const char *text, + ssize_t text_len) +{ + assert (context != NULL); + assert (text != NULL); + assert (context->state != STATE_ERROR); + assert (!context->parsing); + + if (text_len < 0) + text_len = strlen (text); + + if (text_len == 0) + return true; + + context->parsing = true; + + + context->current_text = text; + context->current_text_len = text_len; + context->current_text_end = context->current_text + text_len; + context->iter = context->current_text; + context->start = context->iter; + + while (context->iter != context->current_text_end) + { + switch (context->state) + { + case STATE_START: + /* Possible next state: AFTER_OPEN_ANGLE */ + + assert (gl_list_size (context->tag_stack) == 0); + + /* whitespace is ignored outside of any elements */ + skip_spaces (context); + + if (context->iter != context->current_text_end) + { + if (*context->iter == '<') + { + /* Move after the open angle */ + advance_char (context); + + context->state = STATE_AFTER_OPEN_ANGLE; + + /* this could start a passthrough */ + context->start = context->iter; + + /* document is now non-empty */ + context->document_empty = false; + } + else + { + emit_error (context, + _("document must begin with an element")); + } + } + break; + + case STATE_AFTER_OPEN_ANGLE: + /* Possible next states: INSIDE_OPEN_TAG_NAME, + * AFTER_CLOSE_TAG_SLASH, INSIDE_PASSTHROUGH + */ + if (*context->iter == '?' 
|| + *context->iter == '!') + { + /* include < in the passthrough */ + const char *openangle = "<"; + add_to_partial (context, openangle, openangle + 1); + context->start = context->iter; + context->balance = 1; + context->state = STATE_INSIDE_PASSTHROUGH; + } + else if (*context->iter == '/') + { + /* move after it */ + advance_char (context); + + context->state = STATE_AFTER_CLOSE_TAG_SLASH; + } + else if (!IS_COMMON_NAME_END_CHAR (*(context->iter))) + { + context->state = STATE_INSIDE_OPEN_TAG_NAME; + + /* start of tag name */ + context->start = context->iter; + } + else + { + emit_error (context, _("invalid character after '<'")); + } + break; + + /* The AFTER_CLOSE_ANGLE state is actually sort of + * broken, because it doesn't correspond to a range + * of characters in the input stream as the others do, + * and thus makes things harder to conceptualize + */ + case STATE_AFTER_CLOSE_ANGLE: + /* Possible next states: INSIDE_TEXT, STATE_START */ + if (gl_list_size (context->tag_stack) == 0) + { + context->start = NULL; + context->state = STATE_START; + } + else + { + context->start = context->iter; + context->state = STATE_INSIDE_TEXT; + } + break; + + case STATE_AFTER_ELISION_SLASH: + /* Possible next state: AFTER_CLOSE_ANGLE */ + if (*context->iter == '>') + { + /* move after the close angle */ + advance_char (context); + context->state = STATE_AFTER_CLOSE_ANGLE; + emit_end_element (context); + } + else + { + emit_error (context, _("missing '>'")); + } + break; + + case STATE_INSIDE_OPEN_TAG_NAME: + /* Possible next states: BETWEEN_ATTRIBUTES */ + + /* if there's a partial chunk then it's the first part of the + * tag name. If there's a context->start then it's the start + * of the tag name in current_text, the partial chunk goes + * before that start though. + */ + advance_to_name_end (context); + + if (context->iter == context->current_text_end) + { + /* The name hasn't necessarily ended. Merge with + * partial chunk, leave state unchanged. 
+ */ + add_to_partial (context, context->start, context->iter); + } + else + { + /* The name has ended. Combine it with the partial chunk + * if any; push it on the stack; enter next state. + */ + add_to_partial (context, context->start, context->iter); + push_partial_as_tag (context); + + context->state = STATE_BETWEEN_ATTRIBUTES; + context->start = NULL; + } + break; + + case STATE_INSIDE_ATTRIBUTE_NAME: + /* Possible next states: AFTER_ATTRIBUTE_NAME */ + + advance_to_name_end (context); + add_to_partial (context, context->start, context->iter); + + /* read the full name, if we enter the equals sign state + * then add the attribute to the list (without the value), + * otherwise store a partial chunk to be prepended later. + */ + if (context->iter != context->current_text_end) + context->state = STATE_AFTER_ATTRIBUTE_NAME; + break; + + case STATE_AFTER_ATTRIBUTE_NAME: + /* Possible next states: AFTER_ATTRIBUTE_EQUALS_SIGN */ + + skip_spaces (context); + + if (context->iter != context->current_text_end) + { + /* The name has ended. Combine it with the partial chunk + * if any; push it on the stack; enter next state. 
+ */ + if (!name_validate (context, context->partial_chunk->buffer)) + break; + + add_attribute (context, context->partial_chunk); + + markup_string_free (context->partial_chunk, true); + context->partial_chunk = NULL; + context->start = NULL; + + if (*context->iter == '=') + { + advance_char (context); + context->state = STATE_AFTER_ATTRIBUTE_EQUALS_SIGN; + } + else + { + emit_error (context, _("missing '='")); + } + } + break; + + case STATE_BETWEEN_ATTRIBUTES: + /* Possible next states: AFTER_CLOSE_ANGLE, + * AFTER_ELISION_SLASH, INSIDE_ATTRIBUTE_NAME + */ + skip_spaces (context); + + if (context->iter != context->current_text_end) + { + if (*context->iter == '/') + { + advance_char (context); + context->state = STATE_AFTER_ELISION_SLASH; + } + else if (*context->iter == '>') + { + advance_char (context); + context->state = STATE_AFTER_CLOSE_ANGLE; + } + else if (!IS_COMMON_NAME_END_CHAR (*(context->iter))) + { + context->state = STATE_INSIDE_ATTRIBUTE_NAME; + /* start of attribute name */ + context->start = context->iter; + } + else + { + emit_error (context, _("missing '>' or '/'")); + } + + /* If we're done with attributes, invoke + * the start_element callback + */ + if (context->state == STATE_AFTER_ELISION_SLASH || + context->state == STATE_AFTER_CLOSE_ANGLE) + emit_start_element (context); + } + break; + + case STATE_AFTER_ATTRIBUTE_EQUALS_SIGN: + /* Possible next state: INSIDE_ATTRIBUTE_VALUE_[SQ/DQ] */ + + skip_spaces (context); + + if (context->iter != context->current_text_end) + { + if (*context->iter == '"') + { + advance_char (context); + context->state = STATE_INSIDE_ATTRIBUTE_VALUE_DQ; + context->start = context->iter; + } + else if (*context->iter == '\'') + { + advance_char (context); + context->state = STATE_INSIDE_ATTRIBUTE_VALUE_SQ; + context->start = context->iter; + } + else + { + emit_error (context, _("missing opening quote")); + } + } + break; + + case STATE_INSIDE_ATTRIBUTE_VALUE_SQ: + case STATE_INSIDE_ATTRIBUTE_VALUE_DQ: + /* 
Possible next states: BETWEEN_ATTRIBUTES */ + { + char delim; + + if (context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ) + { + delim = '\''; + } + else + { + delim = '"'; + } + + do + { + if (*context->iter == delim) + break; + } + while (advance_char (context)); + } + if (context->iter == context->current_text_end) + { + /* The value hasn't necessarily ended. Merge with + * partial chunk, leave state unchanged. + */ + add_to_partial (context, context->start, context->iter); + } + else + { + bool is_ascii; + /* The value has ended at the quote mark. Combine it + * with the partial chunk if any; set it for the current + * attribute. + */ + add_to_partial (context, context->start, context->iter); + + assert (context->cur_attr >= 0); + + if (unescape_string_inplace (context, context->partial_chunk, + &is_ascii) + && (is_ascii + || text_validate (context, + context->partial_chunk->buffer, + context->partial_chunk->buflen))) + { + /* success, advance past quote and set state. */ + context->attr_values[context->cur_attr] = + markup_string_free (context->partial_chunk, false); + context->partial_chunk = NULL; + advance_char (context); + context->state = STATE_BETWEEN_ATTRIBUTES; + context->start = NULL; + } + + truncate_partial (context); + } + break; + + case STATE_INSIDE_TEXT: + /* Possible next states: AFTER_OPEN_ANGLE */ + do + { + if (*context->iter == '<') + break; + } + while (advance_char (context)); + + /* The text hasn't necessarily ended. Merge with + * partial chunk, leave state unchanged. + */ + + add_to_partial (context, context->start, context->iter); + + if (context->iter != context->current_text_end) + { + bool is_ascii; + + /* The text has ended at the open angle. Call the text + * callback. 
+ */ + if (unescape_string_inplace (context, context->partial_chunk, + &is_ascii) + && (is_ascii + || text_validate (context, + context->partial_chunk->buffer, + context->partial_chunk->buflen))) + { + if (context->parser->text) + (*context->parser->text) (context, + context->partial_chunk->buffer, + context->partial_chunk->buflen, + context->user_data); + + /* advance past open angle and set state. */ + advance_char (context); + context->state = STATE_AFTER_OPEN_ANGLE; + /* could begin a passthrough */ + context->start = context->iter; + } + + truncate_partial (context); + } + break; + + case STATE_AFTER_CLOSE_TAG_SLASH: + /* Possible next state: INSIDE_CLOSE_TAG_NAME */ + if (!IS_COMMON_NAME_END_CHAR (*(context->iter))) + { + context->state = STATE_INSIDE_CLOSE_TAG_NAME; + + /* start of tag name */ + context->start = context->iter; + } + else + { + emit_error (context, _("invalid character after '</'")); + } + break; + + case STATE_INSIDE_CLOSE_TAG_NAME: + /* Possible next state: AFTER_CLOSE_TAG_NAME */ + advance_to_name_end (context); + add_to_partial (context, context->start, context->iter); + + if (context->iter != context->current_text_end) + context->state = STATE_AFTER_CLOSE_TAG_NAME; + break; + + case STATE_AFTER_CLOSE_TAG_NAME: + /* Possible next state: AFTER_CLOSE_TAG_SLASH */ + + skip_spaces (context); + + if (context->iter != context->current_text_end) + { + markup_string_ty *close_name; + + close_name = context->partial_chunk; + context->partial_chunk = NULL; + + if (*context->iter != '>') + { + emit_error (context, + _("invalid character after a close element name")); + } + else if (gl_list_size (context->tag_stack) == 0) + { + emit_error (context, _("element is closed")); + } + else if (strcmp (close_name->buffer, current_element (context)) != 0) + { + emit_error (context, _("element is closed")); + } + else + { + advance_char (context); + context->state = STATE_AFTER_CLOSE_ANGLE; + context->start = NULL; + + emit_end_element (context); + } + 
context->partial_chunk = close_name; + truncate_partial (context); + } + break; + + case STATE_INSIDE_PASSTHROUGH: + /* Possible next state: AFTER_CLOSE_ANGLE */ + do + { + if (*context->iter == '<') + context->balance++; + if (*context->iter == '>') + { + char *str; + size_t len; + + context->balance--; + add_to_partial (context, context->start, context->iter); + context->start = context->iter; + + str = context->partial_chunk->buffer; + len = context->partial_chunk->buflen; + + if (str[1] == '?' && str[len - 1] == '?') + break; + if (strncmp (str, "<!--", 4) == 0 && + strcmp (str + len - 2, "--") == 0) + break; + if (strncmp (str, "<![CDATA[", 9) == 0 && + strcmp (str + len - 2, "]]") == 0) + break; + if (strncmp (str, "<!DOCTYPE", 9) == 0 && + context->balance == 0) + break; + } + } + while (advance_char (context)); + + if (context->iter == context->current_text_end) + { + /* The passthrough hasn't necessarily ended. Merge with + * partial chunk, leave state unchanged. + */ + add_to_partial (context, context->start, context->iter); + } + else + { + /* The passthrough has ended at the close angle. Combine + * it with the partial chunk if any. Call the passthrough + * callback. Note that the open/close angles are + * included in the text of the passthrough. 
+ */ + advance_char (context); /* advance past close angle */ + add_to_partial (context, context->start, context->iter); + + if (context->flags & MARKUP_TREAT_CDATA_AS_TEXT && + strncmp (context->partial_chunk->buffer, "<![CDATA[", 9) == 0) + { + if (context->parser->text && + text_validate (context, + context->partial_chunk->buffer + 9, + context->partial_chunk->buflen - 12)) + (*context->parser->text) (context, + context->partial_chunk->buffer + 9, + context->partial_chunk->buflen - 12, + context->user_data); + } + else if (context->parser->passthrough && + text_validate (context, + context->partial_chunk->buffer, + context->partial_chunk->buflen)) + (*context->parser->passthrough) (context, + context->partial_chunk->buffer, + context->partial_chunk->buflen, + context->user_data); + + truncate_partial (context); + + context->state = STATE_AFTER_CLOSE_ANGLE; + context->start = context->iter; /* could begin text */ + } + break; + + case STATE_ERROR: + goto finished; + break; + + default: + abort (); + break; + } + } + + finished: + context->parsing = false; + + return context->state != STATE_ERROR; +} + +/* Signals to the parse context that all data has been fed into the + * parse context with markup_parse_context_parse. + * + * This function reports an error if the document isn't complete, + * for example if elements are still open. 
*/ +bool +markup_parse_context_end_parse (markup_parse_context_ty *context) +{ + assert (context != NULL); + assert (!context->parsing); + assert (context->state != STATE_ERROR); + + if (context->partial_chunk != NULL) + { + markup_string_free (context->partial_chunk, true); + context->partial_chunk = NULL; + } + + if (context->document_empty) + { + emit_error (context, _("empty document")); + return false; + } + + context->parsing = true; + + switch (context->state) + { + case STATE_START: + /* Nothing to do */ + break; + + case STATE_AFTER_OPEN_ANGLE: + emit_error (context, + _("document ended unexpectedly just after '<'")); + break; + + case STATE_AFTER_CLOSE_ANGLE: + if (gl_list_size (context->tag_stack) > 0) + { + /* Error message the same as for INSIDE_TEXT */ + emit_error (context, + _("document ended unexpectedly with elements still open")); + } + break; + + case STATE_AFTER_ELISION_SLASH: + emit_error (context, _("document ended unexpectedly without '>'")); + break; + + case STATE_INSIDE_OPEN_TAG_NAME: + emit_error (context, + _("document ended unexpectedly inside an element name")); + break; + + case STATE_INSIDE_ATTRIBUTE_NAME: + case STATE_AFTER_ATTRIBUTE_NAME: + emit_error (context, + _("document ended unexpectedly inside an attribute name")); + break; + + case STATE_BETWEEN_ATTRIBUTES: + emit_error (context, + _("document ended unexpectedly inside an open tag")); + break; + + case STATE_AFTER_ATTRIBUTE_EQUALS_SIGN: + emit_error (context, _("document ended unexpectedly after '='")); + break; + + case STATE_INSIDE_ATTRIBUTE_VALUE_SQ: + case STATE_INSIDE_ATTRIBUTE_VALUE_DQ: + emit_error (context, + _("document ended unexpectedly inside an attribute value")); + break; + + case STATE_INSIDE_TEXT: + assert (gl_list_size (context->tag_stack) > 0); + emit_error (context, + _("document ended unexpectedly with elements still open")); + break; + + case STATE_AFTER_CLOSE_TAG_SLASH: + case STATE_INSIDE_CLOSE_TAG_NAME: + case STATE_AFTER_CLOSE_TAG_NAME: + 
emit_error (context, + _("document ended unexpectedly inside the close tag")); + break; + + case STATE_INSIDE_PASSTHROUGH: + emit_error (context, + _("document ended unexpectedly inside a comment or " + "processing instruction")); + break; + + case STATE_ERROR: + default: + abort (); + break; + } + + context->parsing = false; + + return context->state != STATE_ERROR; +} + +const char * +markup_parse_context_get_error (markup_parse_context_ty *context) +{ + return context->error_text; +} diff --git a/gnulib-local/lib/markup.h b/gnulib-local/lib/markup.h new file mode 100644 index 0000000..61e5b0e --- /dev/null +++ b/gnulib-local/lib/markup.h @@ -0,0 +1,164 @@ +/* markup.h -- simple XML-like string parser + Copyright (C) 2015 Free Software Foundation, Inc. + + This file is not part of the GNU gettext program, but is used with + GNU gettext. + + This is a stripped down version of GLib's gmarkup.h. The original + copyright notice is as follows: + */ + +/* gmarkup.h - Simple XML-like string parser/writer + * + * Copyright 2000 Red Hat, Inc. + * + * GLib is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 3 of the + * License, or (at your option) any later version. + * + * GLib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with GLib; see the file COPYING.LIB. If not, + * see <http://www.gnu.org/licenses/>. 
+ */ + +#ifndef __MARKUP_H__ +#define __MARKUP_H__ 1 + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdbool.h> +#include <stddef.h> +#include <sys/types.h> + +/** + * markup_parse_flags_ty: + * @MARKUP_DO_NOT_USE_THIS_UNSUPPORTED_FLAG: flag you should not use + * @MARKUP_TREAT_CDATA_AS_TEXT: When this flag is set, CDATA marked + * sections are not passed literally to the @passthrough function of + * the parser. Instead, the content of the section (without the + * `<![CDATA[` and `]]>`) is + * passed to the @text function. This flag was added in GLib 2.12 + * @MARKUP_PREFIX_ERROR_POSITION: Normally errors caught by the parser + * itself have line/column information prefixed to them to let the + * caller know the location of the error. When this flag is set the + * location information is also prefixed to errors generated by the + * #markup_parser_ty implementation functions + * @MARKUP_IGNORE_QUALIFIED: Ignore (don't report) qualified + * attributes and tags, along with their contents. A qualified + * attribute or tag is one that contains ':' in its name (ie: is in + * another namespace). Since: 2.40. + * + * Flags that affect the behaviour of the parser. + */ +typedef enum + { + MARKUP_DO_NOT_USE_THIS_UNSUPPORTED_FLAG = 1 << 0, + MARKUP_TREAT_CDATA_AS_TEXT = 1 << 1, + MARKUP_PREFIX_ERROR_POSITION = 1 << 2, + MARKUP_IGNORE_QUALIFIED = 1 << 3 + } markup_parse_flags_ty; + +/** + * markup_parse_context_ty: + * + * A parse context is used to parse a stream of bytes that + * you expect to contain marked-up text. + * + * See markup_parse_context_new(), #markup_parser_ty, and so + * on for more details. + */ +typedef struct _markup_parse_context_ty markup_parse_context_ty; +typedef struct _markup_parser_ty markup_parser_ty; + +/** + * markup_parser_ty: + * @start_element: Callback to invoke when the opening tag of an element + * is seen. The callback's @attribute_names and @attribute_values parameters + * are %NULL-terminated. 
+ * @end_element: Callback to invoke when the closing tag of an element + * is seen. Note that this is also called for empty tags like + * `<empty/>`. + * @text: Callback to invoke when some text is seen (text is always + * inside an element). Note that the text of an element may be spread + * over multiple calls of this function. If the + * %MARKUP_TREAT_CDATA_AS_TEXT flag is set, this function is also + * called for the content of CDATA marked sections. + * @passthrough: Callback to invoke for comments, processing instructions + * and doctype declarations; if you're re-writing the parsed document, + * write the passthrough text back out in the same position. If the + * %MARKUP_TREAT_CDATA_AS_TEXT flag is not set, this function is also + * called for CDATA marked sections. + * @error: Callback to invoke when an error occurs. + * + * Any of the fields in #markup_parser_ty can be %NULL, in which case they + * will be ignored. Except for the @error function, any of these callbacks + * can set an error; in particular the %MARKUP_ERROR_UNKNOWN_ELEMENT, + * %MARKUP_ERROR_UNKNOWN_ATTRIBUTE, and %MARKUP_ERROR_INVALID_CONTENT + * errors are intended to be set from these callbacks. If you set an error + * from a callback, markup_parse_context_parse() will report that error + * back to its caller. 
+ */ +struct _markup_parser_ty +{ + /* Called for open tags <foo bar="baz"> */ + bool (*start_element) (markup_parse_context_ty *context, + const char *element_name, + const char **attribute_names, + const char **attribute_values, + void *user_data); + + /* Called for close tags </foo> */ + bool (*end_element) (markup_parse_context_ty *context, + const char *element_name, + void *user_data); + + /* Called for character data */ + /* text is not nul-terminated */ + bool (*text) (markup_parse_context_ty *context, + const char *text, + size_t text_len, + void *user_data); + + /* Called for strings that should be re-saved verbatim in this same + * position, but are not otherwise interpretable. At the moment + * this includes comments and processing instructions. + */ + /* text is not nul-terminated. */ + bool (*passthrough) (markup_parse_context_ty *context, + const char *passthrough_text, + size_t text_len, + void *user_data); + + /* Called on error, including one set by other + * methods in the vtable. The error_text string should not be freed. 
+ */ + void (*error) (markup_parse_context_ty *context, + const char *error_text, + void *user_data); +}; + +extern markup_parse_context_ty * + markup_parse_context_new (const markup_parser_ty *parser, + markup_parse_flags_ty flags, + void *user_data); +extern void markup_parse_context_free (markup_parse_context_ty *context); +extern bool markup_parse_context_parse (markup_parse_context_ty *context, + const char *text, + ssize_t text_len); +extern bool markup_parse_context_end_parse (markup_parse_context_ty *context); +extern const char * + markup_parse_context_get_error (markup_parse_context_ty *context); + +#ifdef __cplusplus +} +#endif + +#endif /* __MARKUP_H__ */ diff --git a/gnulib-local/modules/markup b/gnulib-local/modules/markup new file mode 100644 index 0000000..8d969d4 --- /dev/null +++ b/gnulib-local/modules/markup @@ -0,0 +1,31 @@ +Description: +Simple XML-like parser + +Files: +lib/markup.h +lib/markup.c + +Depends-on: +c-ctype +linked-list +unistr/u8-mbtouc +unistr/u8-next +unictype/ctype-alpha +xalloc +xlist +xvasprintf + +configure.ac: + +Makefile.am: +lib_SOURCES += markup.h markup.c + +Include: +"markup.h" + +License: +LGPL + +Maintainer: +Daiki Ueno + |