diff options
Diffstat (limited to 'gnulib-local/lib/libxml/xmlregexp.c')
-rw-r--r-- | gnulib-local/lib/libxml/xmlregexp.c | 750 |
1 files changed, 479 insertions, 271 deletions
diff --git a/gnulib-local/lib/libxml/xmlregexp.c b/gnulib-local/lib/libxml/xmlregexp.c index e7d519e..3e912ab 100644 --- a/gnulib-local/lib/libxml/xmlregexp.c +++ b/gnulib-local/lib/libxml/xmlregexp.c @@ -1,7 +1,7 @@ /* * regexp.c: generic and extensible Regular Expression engine * - * Basically designed with the purpose of compiling regexps for + * Basically designed with the purpose of compiling regexps for * the variety of validation/shemas mechanisms now available in * XML related specifications these include: * - XML-1.0 DTD validation @@ -44,6 +44,9 @@ #define MAX_PUSH 10000000 +#ifdef ERROR +#undef ERROR +#endif #define ERROR(str) \ ctxt->error = XML_REGEXP_COMPILE_ERROR; \ xmlRegexpErrCompile(ctxt, str); @@ -54,21 +57,26 @@ #define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l) #define NEXTL(l) ctxt->cur += l; #define XML_REG_STRING_SEPARATOR '|' +/* + * Need PREV to check on a '-' within a Character Group. May only be used + * when it's guaranteed that cur is not at the beginning of ctxt->string! + */ +#define PREV (ctxt->cur[-1]) /** * TODO: * * macro to flag unimplemented blocks */ -#define TODO \ +#define TODO \ xmlGenericError(xmlGenericErrorContext, \ "Unimplemented block at %s:%d\n", \ __FILE__, __LINE__); /************************************************************************ - * * - * Datatypes and structures * - * * + * * + * Datatypes and structures * + * * ************************************************************************/ /* @@ -145,7 +153,8 @@ typedef enum { XML_REGEXP_START_STATE = 1, XML_REGEXP_FINAL_STATE, XML_REGEXP_TRANS_STATE, - XML_REGEXP_SINK_STATE + XML_REGEXP_SINK_STATE, + XML_REGEXP_UNREACH_STATE } xmlRegStateType; typedef enum { @@ -183,6 +192,7 @@ struct _xmlRegAtom { int neg; int codepoint; xmlRegStatePtr start; + xmlRegStatePtr start0; xmlRegStatePtr stop; int maxRanges; int nbRanges; @@ -212,6 +222,7 @@ struct _xmlRegTrans { struct _xmlAutomataState { xmlRegStateType type; xmlRegMarkedType mark; + xmlRegMarkedType markd; xmlRegMarkedType reached; int no; int maxTrans; @@ -226,6 +237,8 @@ struct _xmlAutomataState { typedef struct _xmlAutomata xmlRegParserCtxt; typedef xmlRegParserCtxt *xmlRegParserCtxtPtr; +#define AM_AUTOMATA_RNG 1 + struct _xmlAutomata { xmlChar *string; xmlChar *cur; @@ -253,6 +266,7 @@ struct _xmlAutomata { int determinist; int negs; + int flags; }; struct _xmlRegexp { @@ -264,6 +278,7 @@ struct _xmlRegexp { int nbCounters; xmlRegCounter *counters; int determinist; + int flags; /* * That's the compact form for determinists automatas */ @@ -346,9 +361,11 @@ static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint); static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg, int start, int end, const xmlChar *blockName); +void xmlAutomataSetFlags(xmlAutomataPtr am, int flags); + /************************************************************************ * * - * Regexp memory error handler * + * Regexp memory error handler * * * ************************************************************************/ /** @@ -395,9 +412,9 @@ xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra) } /************************************************************************ - * * - * Allocation/Deallocation * - * * + * * + * Allocation/Deallocation * + * * ************************************************************************/ static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt); @@ -427,6 +444,7 @@ xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) { ret->nbCounters = ctxt->nbCounters; ret->counters = ctxt->counters; ret->determinist = ctxt->determinist; + ret->flags = ctxt->flags; if (ret->determinist == -1) { xmlRegexpIsDeterminist(ret); } @@ -727,11 +745,41 @@ xmlRegFreeRange(xmlRegRangePtr range) { } /** + * xmlRegCopyRange: + * @range: the regexp range + * + * Copy a regexp range + * + * Returns the new copy or NULL in case of error. + */ +static xmlRegRangePtr +xmlRegCopyRange(xmlRegParserCtxtPtr ctxt, xmlRegRangePtr range) { + xmlRegRangePtr ret; + + if (range == NULL) + return(NULL); + + ret = xmlRegNewRange(ctxt, range->neg, range->type, range->start, + range->end); + if (ret == NULL) + return(NULL); + if (range->blockName != NULL) { + ret->blockName = xmlStrdup(range->blockName); + if (ret->blockName == NULL) { + xmlRegexpErrMemory(ctxt, "allocating range"); + xmlRegFreeRange(ret); + return(NULL); + } + } + return(ret); +} + +/** * xmlRegNewAtom: * @ctxt: the regexp parser context * @type: the type of atom * - * Allocate a new regexp range + * Allocate a new atom * * Returns the new atom or NULL in case of error */ @@ -778,6 +826,52 @@ xmlRegFreeAtom(xmlRegAtomPtr atom) { xmlFree(atom); } +/** + * xmlRegCopyAtom: + * @ctxt: the regexp parser context + * @atom: the oiginal atom + * + * Allocate a new regexp range + * + * Returns the new atom or NULL in case of error + */ +static xmlRegAtomPtr +xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) { + xmlRegAtomPtr ret; + + ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom)); + if (ret == NULL) { + xmlRegexpErrMemory(ctxt, "copying atom"); + return(NULL); + } + memset(ret, 0, sizeof(xmlRegAtom)); + ret->type = atom->type; + ret->quant = atom->quant; + ret->min = atom->min; + ret->max = atom->max; + if (atom->nbRanges > 0) { + int i; + + ret->ranges = (xmlRegRangePtr *) xmlMalloc(sizeof(xmlRegRangePtr) * + atom->nbRanges); + if (ret->ranges == NULL) { + xmlRegexpErrMemory(ctxt, "copying atom"); + goto error; + } + for (i = 0;i < atom->nbRanges;i++) { + ret->ranges[i] = xmlRegCopyRange(ctxt, atom->ranges[i]); + if (ret->ranges[i] == NULL) + goto error; + ret->nbRanges = i + 1; + } + } + return(ret); + +error: + xmlRegFreeAtom(ret); + return(NULL); +} + static xmlRegStatePtr xmlRegNewState(xmlRegParserCtxtPtr ctxt) { xmlRegStatePtr ret; @@ -841,9 +935,9 @@ xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) { } /************************************************************************ - * * - * Display of Data structures * - * * + * * + * Display of Data structures * + * * ************************************************************************/ static void @@ -1050,7 +1144,7 @@ xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) { fprintf(output, "char %c ", trans->atom->codepoint); fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to); } - + static void xmlRegPrintState(FILE *output, xmlRegStatePtr state) { int i; @@ -1064,7 +1158,7 @@ xmlRegPrintState(FILE *output, xmlRegStatePtr state) { fprintf(output, "START "); if (state->type == XML_REGEXP_FINAL_STATE) fprintf(output, "FINAL "); - + fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans); for (i = 0;i < state->nbTrans; i++) { xmlRegPrintTrans(output, &(state->trans[i])); @@ -1114,12 +1208,12 @@ xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) { #endif /************************************************************************ - * * + * * * Finite Automata structures manipulations * - * * + * * ************************************************************************/ -static void +static void xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom, int neg, xmlRegAtomType type, int start, int end, xmlChar *blockName) { @@ -1159,7 +1253,7 @@ xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom, return; range->blockName = blockName; atom->ranges[atom->nbRanges++] = range; - + } static int @@ -1190,7 +1284,7 @@ xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) { return(ctxt->nbCounters++); } -static int +static int xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) { if (atom == NULL) { ERROR("atom push: atom is NULL"); @@ -1222,7 +1316,7 @@ xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) { return(0); } -static void +static void xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target, int from) { if (target->maxTransTo == 0) { @@ -1250,7 +1344,7 @@ xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target, target->nbTransTo++; } -static void +static void xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, xmlRegAtomPtr atom, xmlRegStatePtr target, int counter, int count) { @@ -1316,7 +1410,7 @@ xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, printf("counted %d\n", counter); else if (atom == NULL) printf("epsilon transition\n"); - else if (atom != NULL) + else if (atom != NULL) xmlRegPrintAtom(stdout, atom); #endif @@ -1449,6 +1543,8 @@ xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt, static int xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, xmlRegStatePtr to, xmlRegAtomPtr atom) { + xmlRegStatePtr end; + if (atom == NULL) { ERROR("genrate transition: atom == NULL"); return(-1); @@ -1468,7 +1564,7 @@ xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, */ xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); #ifdef DV - } else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) && + } else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) && (atom->quant != XML_REGEXP_QUANT_ONCE)) { to = xmlRegNewState(ctxt); xmlRegStatePush(ctxt, to); @@ -1482,10 +1578,15 @@ xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, /* * transition done to the state after end of atom. * 1. set transition from atom start to new state - * 2. set transition from atom end to this state. + * 2. set transition from atom end to this state. */ - xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0); - xmlFAGenerateEpsilonTransition(ctxt, atom->stop, ctxt->state); + if (to == NULL) { + xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0); + xmlFAGenerateEpsilonTransition(ctxt, atom->stop, + ctxt->state); + } else { + xmlFAGenerateEpsilonTransition(ctxt, atom->start, to); + } break; case XML_REGEXP_QUANT_MULT: atom->quant = XML_REGEXP_QUANT_ONCE; @@ -1498,58 +1599,89 @@ xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, break; case XML_REGEXP_QUANT_RANGE: { int counter; - xmlRegStatePtr newstate; + xmlRegStatePtr inter, newstate; /* - * This one is nasty: - * 1/ if range has minOccurs == 0, create a new state - * and create epsilon transitions from atom->start - * to atom->stop, as well as atom->start to the new - * state - * 2/ register a new counter - * 3/ register an epsilon transition associated to - * this counter going from atom->stop to atom->start - * 4/ create a new state - * 5/ generate a counted transition from atom->stop to - * that state + * create the final state now if needed */ - if (atom->min == 0) { - xmlFAGenerateEpsilonTransition(ctxt, atom->start, - atom->stop); + if (to != NULL) { + newstate = to; + } else { newstate = xmlRegNewState(ctxt); xmlRegStatePush(ctxt, newstate); - ctxt->state = newstate; + } + + /* + * The principle here is to use counted transition + * to avoid explosion in the number of states in the + * graph. This is clearly more complex but should not + * be exploitable at runtime. + */ + if ((atom->min == 0) && (atom->start0 == NULL)) { + xmlRegAtomPtr copy; + /* + * duplicate a transition based on atom to count next + * occurences after 1. We cannot loop to atom->start + * directly because we need an epsilon transition to + * newstate. + */ + /* ???? For some reason it seems we never reach that + case, I suppose this got optimized out before when + building the automata */ + copy = xmlRegCopyAtom(ctxt, atom); + if (copy == NULL) + return(-1); + copy->quant = XML_REGEXP_QUANT_ONCE; + copy->min = 0; + copy->max = 0; + + if (xmlFAGenerateTransitions(ctxt, atom->start, NULL, copy) + < 0) + return(-1); + inter = ctxt->state; + counter = xmlRegGetCounter(ctxt); + ctxt->counters[counter].min = atom->min - 1; + ctxt->counters[counter].max = atom->max - 1; + /* count the number of times we see it again */ + xmlFAGenerateCountedEpsilonTransition(ctxt, inter, + atom->stop, counter); + /* allow a way out based on the count */ + xmlFAGenerateCountedTransition(ctxt, inter, + newstate, counter); + /* and also allow a direct exit for 0 */ xmlFAGenerateEpsilonTransition(ctxt, atom->start, - newstate); + newstate); + } else { + /* + * either we need the atom at least once or there + * is an atom->start0 allowing to easilly plug the + * epsilon transition. + */ + counter = xmlRegGetCounter(ctxt); + ctxt->counters[counter].min = atom->min - 1; + ctxt->counters[counter].max = atom->max - 1; + /* count the number of times we see it again */ + xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop, + atom->start, counter); + /* allow a way out based on the count */ + xmlFAGenerateCountedTransition(ctxt, atom->stop, + newstate, counter); + /* and if needed allow a direct exit for 0 */ + if (atom->min == 0) + xmlFAGenerateEpsilonTransition(ctxt, atom->start0, + newstate); + } - counter = xmlRegGetCounter(ctxt); - ctxt->counters[counter].min = atom->min - 1; - ctxt->counters[counter].max = atom->max - 1; atom->min = 0; atom->max = 0; atom->quant = XML_REGEXP_QUANT_ONCE; - if (to != NULL) { - newstate = to; - } else { - newstate = xmlRegNewState(ctxt); - xmlRegStatePush(ctxt, newstate); - } ctxt->state = newstate; - xmlFAGenerateCountedTransition(ctxt, atom->stop, - newstate, counter); - - /* - * first check count and if OK jump forward; - * if checking fail increment count and jump back - */ - xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop, - atom->start, counter); } default: break; } return(0); - } + } if ((atom->min == 0) && (atom->max == 0) && (atom->quant == XML_REGEXP_QUANT_RANGE)) { /* @@ -1576,11 +1708,30 @@ xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, return(-1); } } + end = to; + if ((atom->quant == XML_REGEXP_QUANT_MULT) || + (atom->quant == XML_REGEXP_QUANT_PLUS)) { + /* + * Do not pollute the target state by adding transitions from + * it as it is likely to be the shared target of multiple branches. + * So isolate with an epsilon transition. + */ + xmlRegStatePtr tmp; + + tmp = xmlRegNewState(ctxt); + if (tmp != NULL) + xmlRegStatePush(ctxt, tmp); + else { + return(-1); + } + xmlFAGenerateEpsilonTransition(ctxt, tmp, to); + to = tmp; + } if (xmlRegAtomPush(ctxt, atom) < 0) { return(-1); } xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1); - ctxt->state = to; + ctxt->state = end; switch (atom->quant) { case XML_REGEXP_QUANT_OPT: atom->quant = XML_REGEXP_QUANT_ONCE; @@ -1595,6 +1746,13 @@ xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, atom->quant = XML_REGEXP_QUANT_ONCE; xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); break; + case XML_REGEXP_QUANT_RANGE: +#if DV_test + if (atom->min == 0) { + xmlFAGenerateEpsilonTransition(ctxt, from, to); + } +#endif + break; default: break; } @@ -1605,7 +1763,7 @@ xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, * xmlFAReduceEpsilonTransitions: * @ctxt: a regexp parser context * @fromnr: the from state - * @tonr: the to state + * @tonr: the to state * @counter: should that transition be associated to a counted * */ @@ -1649,7 +1807,7 @@ xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr, int newto = to->trans[transnr].to; xmlRegStateAddTrans(ctxt, from, NULL, - ctxt->states[newto], + ctxt->states[newto], -1, to->trans[transnr].count); } else { #ifdef DEBUG_REGEXP_GRAPH @@ -1671,11 +1829,11 @@ xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr, int newto = to->trans[transnr].to; if (to->trans[transnr].counter >= 0) { - xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, - ctxt->states[newto], + xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, + ctxt->states[newto], to->trans[transnr].counter, -1); } else { - xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, + xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, ctxt->states[newto], counter, -1); } } @@ -1687,7 +1845,7 @@ xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr, * xmlFAEliminateSimpleEpsilonTransitions: * @ctxt: a regexp parser context * - * Eliminating general epsilon transitions can get costly in the general + * Eliminating general epsilon transitions can get costly in the general * algorithm due to the large amount of generated new transitions and * associated comparisons. However for simple epsilon transition used just * to separate building blocks when generating the automata this can be @@ -1709,6 +1867,8 @@ xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { continue; if (state->nbTrans != 1) continue; + if (state->type == XML_REGEXP_UNREACH_STATE) + continue; /* is the only transition out a basic transition */ if ((state->trans[0].atom == NULL) && (state->trans[0].to >= 0) && @@ -1721,48 +1881,37 @@ xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { #ifdef DEBUG_REGEXP_GRAPH printf("Found simple epsilon trans from start %d to %d\n", statenr, newto); -#endif +#endif } else { #ifdef DEBUG_REGEXP_GRAPH printf("Found simple epsilon trans from %d to %d\n", statenr, newto); -#endif +#endif for (i = 0;i < state->nbTransTo;i++) { tmp = ctxt->states[state->transTo[i]]; for (j = 0;j < tmp->nbTrans;j++) { if (tmp->trans[j].to == statenr) { - tmp->trans[j].to = newto; -#ifdef DEBUG_REGEXP_GRAPH - printf("Changed transition %d on %d to go to %d\n", - j, tmp->no, newto); -#endif - xmlRegStateAddTransTo(ctxt, ctxt->states[newto], - tmp->no); - } - } - } -#if 0 - for (i = 0;i < ctxt->nbStates;i++) { - tmp = ctxt->states[i]; - for (j = 0;j < tmp->nbTrans;j++) { - if (tmp->trans[j].to == statenr) { - tmp->trans[j].to = newto; #ifdef DEBUG_REGEXP_GRAPH printf("Changed transition %d on %d to go to %d\n", j, tmp->no, newto); -#endif +#endif + tmp->trans[j].to = -1; + xmlRegStateAddTrans(ctxt, tmp, tmp->trans[j].atom, + ctxt->states[newto], + tmp->trans[j].counter, + tmp->trans[j].count); } } } -#endif if (state->type == XML_REGEXP_FINAL_STATE) ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE; /* eliminate the transition completely */ state->nbTrans = 0; + state->type = XML_REGEXP_UNREACH_STATE; } - + } } } @@ -1779,16 +1928,33 @@ xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { if (ctxt->states == NULL) return; + /* + * Eliminate simple epsilon transition and the associated unreachable + * states. + */ xmlFAEliminateSimpleEpsilonTransitions(ctxt); + for (statenr = 0;statenr < ctxt->nbStates;statenr++) { + state = ctxt->states[statenr]; + if ((state != NULL) && (state->type == XML_REGEXP_UNREACH_STATE)) { +#ifdef DEBUG_REGEXP_GRAPH + printf("Removed unreachable state %d\n", statenr); +#endif + xmlRegFreeState(state); + ctxt->states[statenr] = NULL; + } + } has_epsilon = 0; /* - * build the completed transitions bypassing the epsilons + * Build the completed transitions bypassing the epsilons * Use a marking algorithm to avoid loops - * mark sink states too. + * Mark sink states too. + * Process from the latests states backward to the start when + * there is long cascading epsilon chains this minimize the + * recursions and transition compares when adding the new ones */ - for (statenr = 0;statenr < ctxt->nbStates;statenr++) { + for (statenr = ctxt->nbStates - 1;statenr >= 0;statenr--) { state = ctxt->states[statenr]; if (state == NULL) continue; @@ -1812,8 +1978,9 @@ xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { printf("Found epsilon trans %d from %d to %d\n", transnr, statenr, newto); #endif - state->mark = XML_REGEXP_MARK_START; has_epsilon = 1; + state->trans[transnr].to = -2; + state->mark = XML_REGEXP_MARK_START; xmlFAReduceEpsilonTransitions(ctxt, statenr, newto, state->trans[transnr].counter); state->mark = XML_REGEXP_MARK_NORMAL; @@ -1932,12 +2099,13 @@ xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) { (range2->type == XML_REGEXP_EPSILON)) { return(0); } else if (range1->type == range2->type) { - if ((range1->type != XML_REGEXP_CHARVAL) || - (range1->end < range2->start) || - (range2->end < range1->start)) - ret = 1; - else + if (range1->type != XML_REGEXP_CHARVAL) + ret = 1; + else if ((range1->end < range2->start) || + (range2->end < range1->start)) ret = 0; + else + ret = 1; } else if (range1->type == XML_REGEXP_CHARVAL) { int codepoint; int neg = 0; @@ -1945,7 +2113,7 @@ xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) { /* * just check all codepoints in the range for acceptance, * this is usually way cheaper since done only once at - * compilation than testing over and over at runtime or + * compilation than testing over and over at runtime or * pushing too many states when evaluating. */ if (((range1->neg == 0) && (range2->neg != 0)) || @@ -2064,7 +2232,7 @@ xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) { if (((range1->neg == 0) && (range2->neg != 0)) || ((range1->neg != 0) && (range2->neg == 0))) ret = !ret; - return(1); + return(ret); } /** @@ -2272,6 +2440,7 @@ xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) { * xmlFAEqualAtoms: * @atom1: an atom * @atom2: an atom + * @deep: if not set only compare string pointers * * Compares two atoms to check whether they are the same exactly * this is used to remove equivalent transitions @@ -2279,7 +2448,7 @@ xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) { * Returns 1 if same and 0 otherwise */ static int -xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) { +xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) { int ret = 0; if (atom1 == atom2) @@ -2294,8 +2463,11 @@ xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) { ret = 0; break; case XML_REGEXP_STRING: - ret = xmlStrEqual((xmlChar *)atom1->valuep, - (xmlChar *)atom2->valuep); + if (!deep) + ret = (atom1->valuep == atom2->valuep); + else + ret = xmlStrEqual((xmlChar *)atom1->valuep, + (xmlChar *)atom2->valuep); break; case XML_REGEXP_CHARVAL: ret = (atom1->codepoint == atom2->codepoint); @@ -2313,6 +2485,7 @@ xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) { * xmlFACompareAtoms: * @atom1: an atom * @atom2: an atom + * @deep: if not set only compare string pointers * * Compares two atoms to check whether they intersect in some ways, * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only @@ -2320,7 +2493,7 @@ xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) { * Returns 1 if yes and 0 otherwise */ static int -xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) { +xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) { int ret = 1; if (atom1 == atom2) @@ -2346,8 +2519,11 @@ xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) { } switch (atom1->type) { case XML_REGEXP_STRING: - ret = xmlRegStrEqualWildcard((xmlChar *)atom1->valuep, - (xmlChar *)atom2->valuep); + if (!deep) + ret = (atom1->valuep != atom2->valuep); + else + ret = xmlRegStrEqualWildcard((xmlChar *)atom1->valuep, + (xmlChar *)atom2->valuep); break; case XML_REGEXP_EPSILON: goto not_determinist; @@ -2410,9 +2586,16 @@ xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, int res; int transnr, nbTrans; xmlRegTransPtr t1; + int deep = 1; if (state == NULL) return(ret); + if (state->markd == XML_REGEXP_MARK_VISITED) + return(ret); + + if (ctxt->flags & AM_AUTOMATA_RNG) + deep = 0; + /* * don't recurse on transitions potentially added in the course of * the elimination. @@ -2424,10 +2607,12 @@ xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, * check transitions conflicting with the one looked at */ if (t1->atom == NULL) { - if (t1->to == -1) + if (t1->to < 0) continue; + state->markd = XML_REGEXP_MARK_VISITED; res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], to, atom); + state->markd = 0; if (res == 0) { ret = 0; /* t1->nd = 1; */ @@ -2436,7 +2621,7 @@ xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, } if (t1->to != to) continue; - if (xmlFACompareAtoms(t1->atom, atom)) { + if (xmlFACompareAtoms(t1->atom, atom, deep)) { ret = 0; /* mark the transition as non-deterministic */ t1->nd = 1; @@ -2460,6 +2645,7 @@ xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) { xmlRegTransPtr t1, t2, last; int i; int ret = 1; + int deep = 1; #ifdef DEBUG_REGEXP_GRAPH printf("xmlFAComputesDeterminism\n"); @@ -2468,6 +2654,9 @@ xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) { if (ctxt->determinist != -1) return(ctxt->determinist); + if (ctxt->flags & AM_AUTOMATA_RNG) + deep = 0; + /* * First cleanup the automata removing cancelled transitions */ @@ -2495,7 +2684,13 @@ xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) { continue; if (t2->atom != NULL) { if (t1->to == t2->to) { - if (xmlFAEqualAtoms(t1->atom, t2->atom)) + /* + * Here we use deep because we want to keep the + * transitions which indicate a conflict + */ + if (xmlFAEqualAtoms(t1->atom, t2->atom, deep) && + (t1->counter == t2->counter) && + (t1->count == t2->count)) t2->to = -1; /* eliminated */ } } @@ -2530,8 +2725,11 @@ xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) { if (t2->to == -1) /* eliminated */ continue; if (t2->atom != NULL) { - /* not determinist ! */ - if (xmlFACompareAtoms(t1->atom, t2->atom)) { + /* + * But here we don't use deep because we want to + * find transitions which indicate a conflict + */ + if (xmlFACompareAtoms(t1->atom, t2->atom, 1)) { ret = 0; /* mark the transitions as non-deterministic ones */ t1->nd = 1; @@ -2583,9 +2781,9 @@ xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) { } /************************************************************************ - * * + * * * Routines to check input against transition atoms * - * * + * * ************************************************************************/ static int @@ -2614,7 +2812,7 @@ xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg, case XML_REGEXP_NOTINITNAME: neg = !neg; case XML_REGEXP_INITNAME: - ret = (IS_LETTER(codepoint) || + ret = (IS_LETTER(codepoint) || (codepoint == '_') || (codepoint == ':')); break; case XML_REGEXP_NOTNAMECHAR: @@ -2862,9 +3060,9 @@ xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) { } /************************************************************************ - * * + * * * Saving and restoring state of an execution context * - * * + * * ************************************************************************/ #ifdef DEBUG_REGEXP_EXEC @@ -2875,7 +3073,8 @@ xmlFARegDebugExec(xmlRegExecCtxtPtr exec) { int i; printf(": "); for (i = 0;(i < 3) && (i < exec->inputStackNr);i++) - printf("%s ", exec->inputStack[exec->inputStackNr - (i + 1)]); + printf("%s ", (const char *) + exec->inputStack[exec->inputStackNr - (i + 1)].value); } else { printf(": %s", &(exec->inputString[exec->index])); } @@ -2963,8 +3162,10 @@ xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) { exec->status = -6; return; } - memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts, + if (exec->counts) { + memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts, exec->comp->nbCounters * sizeof(int)); + } } #ifdef DEBUG_REGEXP_EXEC @@ -2974,9 +3175,9 @@ xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) { } /************************************************************************ - * * + * * * Verifier, running an input against a compiled regexp * - * * + * * ************************************************************************/ static int @@ -3008,9 +3209,10 @@ xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) { memset(exec->counts, 0, comp->nbCounters * sizeof(int)); } else exec->counts = NULL; - while ((exec->status == 0) && + while ((exec->status == 0) && (exec->state != NULL) && ((exec->inputString[exec->index] != 0) || - (exec->state->type != XML_REGEXP_FINAL_STATE))) { + ((exec->state != NULL) && + (exec->state->type != XML_REGEXP_FINAL_STATE)))) { xmlRegTransPtr trans; xmlRegAtomPtr atom; @@ -3081,12 +3283,22 @@ xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) { * this is a multiple input sequence * If there is a counter associated increment it now. * before potentially saving and rollback + * do not increment if the counter is already over the + * maximum limit in which case get to next transition */ if (trans->counter >= 0) { - if (exec->counts == NULL) { + xmlRegCounterPtr counter; + + if ((exec->counts == NULL) || + (exec->comp == NULL) || + (exec->comp->counters == NULL)) { exec->status = -1; goto error; } + counter = &exec->comp->counters[trans->counter]; + if (exec->counts[trans->counter] >= counter->max) + continue; /* for loop on transitions */ + #ifdef DEBUG_REGEXP_EXEC printf("Increasing count %d\n", trans->counter); #endif @@ -3182,10 +3394,18 @@ xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) { xmlFARegExecSave(exec); } if (trans->counter >= 0) { - if (exec->counts == NULL) { - exec->status = -1; + xmlRegCounterPtr counter; + + /* make sure we don't go over the counter maximum value */ + if ((exec->counts == NULL) || + (exec->comp == NULL) || + (exec->comp->counters == NULL)) { + exec->status = -1; goto error; } + counter = &exec->comp->counters[trans->counter]; + if (exec->counts[trans->counter] >= counter->max) + continue; /* for loop on transitions */ #ifdef DEBUG_REGEXP_EXEC printf("Increasing count %d\n", trans->counter); #endif @@ -3243,6 +3463,8 @@ error: } xmlFree(exec->rollbacks); } + if (exec->state == NULL) + return(-1); if (exec->counts != NULL) xmlFree(exec->counts); if (exec->status == 0) @@ -3256,9 +3478,9 @@ error: } /************************************************************************ - * * + * * * Progressive interface to the verifier one atom at a time * - * * + * * ************************************************************************/ #ifdef DEBUG_ERR static void testerr(xmlRegExecCtxtPtr exec); @@ -3375,7 +3597,7 @@ xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value, #endif if (exec->inputStackMax == 0) { exec->inputStackMax = 4; - exec->inputStack = (xmlRegInputTokenPtr) + exec->inputStack = (xmlRegInputTokenPtr) xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken)); if (exec->inputStack == NULL) { xmlRegexpErrMemory(NULL, "pushing input string"); @@ -3404,11 +3626,11 @@ xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value, /** * xmlRegStrEqualWildcard: - * @expStr: the string to be evaluated + * @expStr: the string to be evaluated * @valStr: the validation string * * Checks if both strings are equal or have the same content. "*" - * can be used as a wildcard in @valStr; "|" is used as a seperator of + * can be used as a wildcard in @valStr; "|" is used as a seperator of * substrings in both @expStr and @valStr. * * Returns 1 if the comparison is satisfied and the number of substrings @@ -3474,7 +3696,7 @@ xmlRegCompactPushString(xmlRegExecCtxtPtr exec, if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL)) return(-1); - + if (value == NULL) { /* * are we at a final state ? @@ -3495,9 +3717,9 @@ xmlRegCompactPushString(xmlRegExecCtxtPtr exec, for (i = 0;i < comp->nbstrings;i++) { target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; if ((target > 0) && (target <= comp->nbstates)) { - target--; /* to avoid 0 */ + target--; /* to avoid 0 */ if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) { - exec->index = target; + exec->index = target; if ((exec->callback != NULL) && (comp->transdata != NULL)) { exec->callback(exec->data, value, comp->transdata[state * comp->nbstrings + i], data); @@ -3631,7 +3853,7 @@ xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value, continue; counter = &exec->comp->counters[t->counter]; count = exec->counts[t->counter]; - if ((count < counter->max) && + if ((count < counter->max) && (t->atom != NULL) && (xmlStrEqual(value, t->atom->valuep))) { ret = 0; @@ -3871,7 +4093,7 @@ rollback: */ exec->determinist = 0; xmlFARegExecRollBack(exec); - if (exec->status == 0) { + if ((exec->inputStack != NULL ) && (exec->status == 0)) { value = exec->inputStack[exec->index].value; data = exec->inputStack[exec->index].data; #ifdef DEBUG_PUSH @@ -3989,7 +4211,7 @@ xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err, int maxval; int nb = 0; - if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) || + if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) || (values == NULL) || (*nbval <= 0)) return(-1); @@ -4086,7 +4308,7 @@ xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err, (*nbval)++; } } else { - if ((exec->comp->states[trans->to] != NULL) && + if ((exec->comp != NULL) && (exec->comp->states[trans->to] != NULL) && (exec->comp->states[trans->to]->type != XML_REGEXP_SINK_STATE)) { if (atom->neg) @@ -4095,7 +4317,7 @@ xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err, values[nb++] = (xmlChar *) atom->valuep; (*nbval)++; } - } + } } for (transno = 0; (transno < state->nbTrans) && (nb < maxval); @@ -4122,7 +4344,7 @@ xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err, values[nb++] = (xmlChar *) atom->valuep; (*nbneg)++; } - } + } } } return(0); @@ -4353,10 +4575,10 @@ progress: } #endif /************************************************************************ - * * + * * * Parser for the Schemas Datatype Regular Expressions * * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs * - * * + * * ************************************************************************/ /** @@ -4385,7 +4607,7 @@ xmlFAIsChar(xmlRegParserCtxtPtr ctxt) { * * [27] charProp ::= IsCategory | IsBlock * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation | - * Separators | Symbols | Others + * Separators | Symbols | Others * [29] Letters ::= 'L' [ultmo]? * [30] Marks ::= 'M' [nce]? * [31] Numbers ::= 'N' [dlo]? @@ -4400,7 +4622,7 @@ xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) { int cur; xmlRegAtomType type = (xmlRegAtomType) 0; xmlChar *blockName = NULL; - + cur = CUR; if (cur == 'L') { NEXT; @@ -4572,15 +4794,15 @@ xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) { NEXT; start = ctxt->cur; cur = CUR; - if (((cur >= 'a') && (cur <= 'z')) || - ((cur >= 'A') && (cur <= 'Z')) || - ((cur >= '0') && (cur <= '9')) || + if (((cur >= 'a') && (cur <= 'z')) || + ((cur >= 'A') && (cur <= 'Z')) || + ((cur >= '0') && (cur <= '9')) || (cur == 0x2D)) { NEXT; cur = CUR; - while (((cur >= 'a') && (cur <= 'z')) || - ((cur >= 'A') && (cur <= 'Z')) || - ((cur >= '0') && (cur <= '9')) || + while (((cur >= 'a') && (cur <= 'z')) || + ((cur >= 'A') && (cur <= 'Z')) || + ((cur >= '0') && (cur <= '9')) || (cur == 0x2D)) { NEXT; cur = CUR; @@ -4606,7 +4828,7 @@ xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) { * xmlFAParseCharClassEsc: * @ctxt: a regexp parser context * - * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc ) + * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc ) * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E] * [25] catEsc ::= '\p{' charProp '}' * [26] complEsc ::= '\P{' charProp '}' @@ -4682,6 +4904,17 @@ xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) { } } } else if (ctxt->atom->type == XML_REGEXP_RANGES) { + switch (cur) { + case 'n': + cur = '\n'; + break; + case 'r': + cur = '\r'; + break; + case 't': + cur = '\t'; + break; + } xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, XML_REGEXP_CHARVAL, cur, cur, NULL); } @@ -4692,34 +4925,34 @@ xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) { xmlRegAtomType type = XML_REGEXP_ANYSPACE; switch (cur) { - case 's': + case 's': type = XML_REGEXP_ANYSPACE; break; - case 'S': + case 'S': type = XML_REGEXP_NOTSPACE; break; - case 'i': + case 'i': type = XML_REGEXP_INITNAME; break; - case 'I': + case 'I': type = XML_REGEXP_NOTINITNAME; break; - case 'c': + case 'c': type = XML_REGEXP_NAMECHAR; break; - case 'C': + case 'C': type = XML_REGEXP_NOTNAMECHAR; break; - case 'd': + case 'd': type = XML_REGEXP_DECIMAL; break; - case 'D': + case 'D': type = XML_REGEXP_NOTDECIMAL; break; - case 'w': + case 'w': type = XML_REGEXP_REALCHAR; break; - case 'W': + case 'W': type = XML_REGEXP_NOTREALCHAR; break; } @@ -4730,72 +4963,16 @@ xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) { xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, type, 0, 0, NULL); } - } -} - -/** - * xmlFAParseCharRef: - * @ctxt: a regexp parser context - * - * [19] XmlCharRef ::= ( '&#' [0-9]+ ';' ) | (' &#x' [0-9a-fA-F]+ ';' ) - */ -static int -xmlFAParseCharRef(xmlRegParserCtxtPtr ctxt) { - int ret = 0, cur; - - if ((CUR != '&') || (NXT(1) != '#')) - return(-1); - NEXT; - NEXT; - cur = CUR; - if (cur == 'x') { - NEXT; - cur = CUR; - if (((cur >= '0') && (cur <= '9')) || - ((cur >= 'a') && (cur <= 'f')) || - ((cur >= 'A') && (cur <= 'F'))) { - while (((cur >= '0') && (cur <= '9')) || - ((cur >= 'a') && (cur <= 'f')) || - ((cur >= 'A') && (cur <= 'F'))) { - if ((cur >= '0') && (cur <= '9')) - ret = ret * 16 + cur - '0'; - else if ((cur >= 'a') && (cur <= 'f')) - ret = ret * 16 + 10 + (cur - 'a'); - else - ret = ret * 16 + 10 + (cur - 'A'); - NEXT; - cur = CUR; - } - } else { - ERROR("Char ref: expecting [0-9A-F]"); - return(-1); - } - } else { - if ((cur >= '0') && (cur <= '9')) { - while ((cur >= '0') && (cur <= '9')) { - ret = ret * 10 + cur - '0'; - NEXT; - cur = CUR; - } - } else { - ERROR("Char ref: expecting [0-9]"); - return(-1); - } - } - if (cur != ';') { - ERROR("Char ref: expecting ';'"); - return(-1); } else { - NEXT; + ERROR("Wrong escape sequence, misuse of character '\\'"); } - return(ret); } /** * xmlFAParseCharRange: * @ctxt: a regexp parser context * - * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash + * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash * [18] seRange ::= charOrEsc '-' charOrEsc * [20] charOrEsc ::= XmlChar | SingleCharEsc * [21] XmlChar ::= [^\#x2D#x5B#x5D] @@ -4812,12 +4989,6 @@ xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) { return; } - if ((CUR == '&') && (NXT(1) == '#')) { - end = start = xmlFAParseCharRef(ctxt); - xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, - XML_REGEXP_CHARVAL, start, end, NULL); - return; - } cur = CUR; if (cur == '\\') { NEXT; @@ -4842,10 +5013,15 @@ xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) { ERROR("Expecting a char range"); return; } - NEXTL(len); - if (start == '-') { + /* + * Since we are "inside" a range, we can assume ctxt->cur is past + * the start of ctxt->string, and PREV should be safe + */ + if ((start == '-') && (NXT(1) != ']') && (PREV != '[') && (PREV != '^')) { + NEXTL(len); return; } + NEXTL(len); cur = CUR; if ((cur != '-') || (NXT(1) == ']')) { xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, @@ -4896,7 +5072,7 @@ xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) { static void xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) { do { - if ((CUR == '\\') || (CUR == '.')) { + if (CUR == '\\') { xmlFAParseCharClassEsc(ctxt); } else { xmlFAParseCharRange(ctxt); @@ -4911,7 +5087,7 @@ xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) { * * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub * [15] negCharGroup ::= '^' posCharGroup - * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr + * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr * [12] charClassExpr ::= '[' charGroup ']' */ static void @@ -5085,9 +5261,15 @@ xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) { } else if (CUR == ')') { return(0); } else if (CUR == '(') { - xmlRegStatePtr start, oldend; + xmlRegStatePtr start, oldend, start0; NEXT; + /* + * this extra Epsilon transition is needed if we count with 0 allowed + * unfortunately this can't be known at that point + */ + xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL); + start0 = ctxt->state; xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL); start = ctxt->state; oldend = ctxt->end; @@ -5103,6 +5285,7 @@ xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) { if (ctxt->atom == NULL) return(-1); ctxt->atom->start = start; + ctxt->atom->start0 = start0; ctxt->atom->stop = ctxt->state; ctxt->end = oldend; return(1); @@ -5152,7 +5335,7 @@ xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) { previous = ctxt->state; ret = xmlFAParsePiece(ctxt); if (ret != 0) { - if (xmlFAGenerateTransitions(ctxt, previous, + if (xmlFAGenerateTransitions(ctxt, previous, (CUR=='|' || CUR==')') ? to : NULL, ctxt->atom) < 0) return(-1); previous = ctxt->state; @@ -5161,7 +5344,7 @@ xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) { while ((ret != 0) && (ctxt->error == 0)) { ret = xmlFAParsePiece(ctxt); if (ret != 0) { - if (xmlFAGenerateTransitions(ctxt, previous, + if (xmlFAGenerateTransitions(ctxt, previous, (CUR=='|' || CUR==')') ? to : NULL, ctxt->atom) < 0) return(-1); previous = ctxt->state; @@ -5199,6 +5382,10 @@ xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) { end = ctxt->state; while ((CUR == '|') && (ctxt->error == 0)) { NEXT; + if (CUR == 0) { + ERROR("expecting a branch after |") + return; + } ctxt->state = start; ctxt->end = NULL; xmlFAParseBranch(ctxt, end); @@ -5210,9 +5397,9 @@ xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) { } /************************************************************************ - * * - * The basic API * - * * + * * + * The basic API * + * * ************************************************************************/ /** @@ -5281,6 +5468,10 @@ xmlRegexpCompile(const xmlChar *regexp) { if (CUR != 0) { ERROR("xmlFAParseRegExp: extra characters"); } + if (ctxt->error != 0) { + xmlRegFreeParserCtxt(ctxt); + return(NULL); + } ctxt->end = ctxt->state; ctxt->start->type = XML_REGEXP_START_STATE; ctxt->end->type = XML_REGEXP_FINAL_STATE; @@ -5345,10 +5536,12 @@ xmlRegexpIsDeterminist(xmlRegexpPtr comp) { am->nbStates = comp->nbStates; am->states = comp->states; am->determinist = -1; + am->flags = comp->flags; ret = xmlFAComputesDeterminism(am); am->atoms = NULL; am->states = NULL; xmlFreeAutomata(am); + comp->determinist = ret; return(ret); } @@ -5393,9 +5586,9 @@ xmlRegFreeRegexp(xmlRegexpPtr regexp) { #ifdef LIBXML_AUTOMATA_ENABLED /************************************************************************ - * * - * The Automata interface * - * * + * * + * The Automata interface * + * * ************************************************************************/ /** @@ -5426,6 +5619,7 @@ xmlNewAutomata(void) { xmlFreeAutomata(ctxt); return(NULL); } + ctxt->flags = 0; return(ctxt); } @@ -5444,6 +5638,20 @@ xmlFreeAutomata(xmlAutomataPtr am) { } /** + * xmlAutomataSetFlags: + * @am: an automata + * @flags: a set of internal flags + * + * Set some flags on the automata + */ +void +xmlAutomataSetFlags(xmlAutomataPtr am, int flags) { + if (am == NULL) + return; + am->flags |= flags; +} + +/** * xmlAutomataGetInitState: * @am: an automata * @@ -5501,8 +5709,6 @@ xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from, if (atom == NULL) return(NULL); atom->data = data; - if (atom == NULL) - return(NULL); atom->valuep = xmlStrdup(token); if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { @@ -5651,7 +5857,7 @@ xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, * * If @to is NULL, this creates first a new target state in the automata * and then adds a transition from the @from state to the target state - * activated by a succession of input of value @token and @token2 and + * activated by a succession of input of value @token and @token2 and * whose number is between @min and @max * * Returns the target state or NULL in case of error @@ -5805,8 +6011,8 @@ xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, * * If @to is NULL, this creates first a new target state in the automata * and then adds a transition from the @from state to the target state - * activated by a succession of input of value @token and @token2 and whose - * number is between @min and @max, moreover that transition can only be + * activated by a succession of input of value @token and @token2 and whose + * number is between @min and @max, moreover that transition can only be * crossed once. * * Returns the target state or NULL in case of error @@ -5848,7 +6054,7 @@ xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from, str[lenn + lenp + 1] = 0; atom->valuep = str; - } + } atom->data = data; atom->quant = XML_REGEXP_QUANT_ONCEONLY; atom->min = min; @@ -5871,7 +6077,7 @@ xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from, return(to); } - + /** * xmlAutomataNewOnceTrans: @@ -5940,7 +6146,7 @@ xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, */ xmlAutomataStatePtr xmlAutomataNewState(xmlAutomataPtr am) { - xmlAutomataStatePtr to; + xmlAutomataStatePtr to; if (am == NULL) return(NULL); @@ -6007,7 +6213,7 @@ xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, * * Returns the counter number or -1 in case of error */ -int +int xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) { int ret; @@ -6079,7 +6285,7 @@ xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, * * Returns the compiled regexp or NULL in case of error */ -xmlRegexpPtr +xmlRegexpPtr xmlAutomataCompile(xmlAutomataPtr am) { xmlRegexpPtr ret; @@ -6099,7 +6305,7 @@ xmlAutomataCompile(xmlAutomataPtr am) { * * Returns 1 if true, 0 if not, and -1 in case of error */ -int +int xmlAutomataIsDeterminist(xmlAutomataPtr am) { int ret; @@ -6129,6 +6335,7 @@ struct _xmlExpCtxt { int size; int nbElems; int nb_nodes; + int maxNodes; const char *expr; const char *cur; int nb_cons; @@ -6151,13 +6358,14 @@ xmlExpNewCtxt(int maxNodes, xmlDictPtr dict) { if (maxNodes <= 4096) maxNodes = 4096; - + ret = (xmlExpCtxtPtr) xmlMalloc(sizeof(xmlExpCtxt)); if (ret == NULL) return(NULL); memset(ret, 0, sizeof(xmlExpCtxt)); ret->size = size; ret->nbElems = 0; + ret->maxNodes = maxNodes; ret->table = xmlMalloc(size * sizeof(xmlExpNodePtr)); if (ret->table == NULL) { xmlFree(ret); @@ -6204,7 +6412,7 @@ xmlExpFreeCtxt(xmlExpCtxtPtr ctxt) { /* #define DEBUG_DERIV */ /* - * TODO: + * TODO: * - Wildcards * - public API for creation * @@ -6272,7 +6480,7 @@ static unsigned short xmlExpHashNameComputeKey(const xmlChar *name) { unsigned short value = 0L; char ch; - + if (name != NULL) { value += 30 * (*name); while ((ch = *name++) != 0) { @@ -6291,7 +6499,7 @@ xmlExpHashComputeKey(xmlExpNodeType type, xmlExpNodePtr left, xmlExpNodePtr right) { unsigned long value; unsigned short ret; - + switch (type) { case XML_EXP_SEQ: value = left->key; @@ -6432,7 +6640,7 @@ xmlExpHashGetEntry(xmlExpCtxtPtr ctxt, xmlExpNodeType type, left->exp_left->ref++; tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_left, tmp, NULL, 0, 0); - + xmlExpFree(ctxt, left); return(tmp); } @@ -6489,7 +6697,7 @@ xmlExpHashGetEntry(xmlExpCtxtPtr ctxt, xmlExpNodeType type, return(right); } kbase = xmlExpHashComputeKey(type, left, right); - } else + } else return(NULL); key = kbase % ctxt->size; @@ -6630,7 +6838,7 @@ xmlExpRef(xmlExpNodePtr exp) { * xmlExpNewAtom: * @ctxt: the expression context * @name: the atom name - * @len: the atom name lenght in byte (or -1); + * @len: the atom name length in byte (or -1); * * Get the atom associated to this name from that context * @@ -6730,7 +6938,7 @@ xmlExpNewRange(xmlExpCtxtPtr ctxt, xmlExpNodePtr subset, int min, int max) { ************************************************************************/ static int -xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, +xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar**list, int len, int nb) { int tmp, tmp2; tail: @@ -6743,7 +6951,7 @@ tail: return(0); if (nb >= len) return(-2); - list[nb++] = exp->exp_str; + list[nb] = exp->exp_str; return(1); case XML_EXP_COUNT: exp = exp->exp_left; @@ -6767,7 +6975,7 @@ tail: * @ctxt: the expression context * @exp: the expression * @langList: where to store the tokens - * @len: the allocated lenght of @list + * @len: the allocated length of @list * * Find all the strings used in @exp and store them in @list * @@ -6775,7 +6983,7 @@ tail: * -2 if there is more than @len strings */ int -xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, +xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar**langList, int len) { if ((ctxt == NULL) || (exp == NULL) || (langList == NULL) || (len <= 0)) return(-1); @@ -6783,7 +6991,7 @@ xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, } static int -xmlExpGetStartInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, +xmlExpGetStartInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar**list, int len, int nb) { int tmp, tmp2; tail: @@ -6798,7 +7006,7 @@ tail: return(0); if (nb >= len) return(-2); - list[nb++] = exp->exp_str; + list[nb] = exp->exp_str; return(1); case XML_EXP_COUNT: exp = exp->exp_left; @@ -6833,7 +7041,7 @@ tail: * @ctxt: the expression context * @exp: the expression * @tokList: where to store the tokens - * @len: the allocated lenght of @list + * @len: the allocated length of @list * * Find all the strings that appears at the start of the languages * accepted by @exp and store them in @list. E.g. for (a, b) | c @@ -6843,7 +7051,7 @@ tail: * -2 if there is more than @len strings */ int -xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, +xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar**tokList, int len) { if ((ctxt == NULL) || (exp == NULL) || (tokList == NULL) || (len <= 0)) return(-1); @@ -7540,7 +7748,7 @@ xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { xmlFree((xmlChar **) tab); return(ret); } - + /** * xmlExpExpDerive: * @ctxt: the expressions context @@ -7592,7 +7800,7 @@ xmlExpExpDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { int xmlExpSubsume(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { xmlExpNodePtr tmp; - + if ((exp == NULL) || (ctxt == NULL) || (sub == NULL)) return(-1); @@ -7636,7 +7844,7 @@ xmlExpSubsume(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { /************************************************************************ * * - * Parsing expression * + * Parsing expression * * * ************************************************************************/ @@ -7740,7 +7948,7 @@ parse_quantifier: ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 0, -1); SKIP_BLANKS - } + } return(ret); } @@ -7862,7 +8070,7 @@ xmlExpDumpInt(xmlBufferPtr buf, xmlExpNodePtr expr, int glob) { break; case XML_EXP_COUNT: { char rep[40]; - + c = expr->exp_left; if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) xmlExpDumpInt(buf, c, 1); |