utility/Tokenizer.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75

/*
 * Copyright (c) 2015, Intel Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation and/or
 * other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#pragma once

#include <string>
#include <vector>

/** Tokenizer class
 *
 * Must be initialized with a string to be tokenized and, optionally, a string
 * of delimiters (@see Tokenizer::defaultDelimiters).
 *
 * Multiple consecutive delimiters (even if different) are considered as a
 * single one. As a result, there can't be empty tokens.
 */
class Tokenizer
{
public:
    /** Constructs a Tokenizer
     *
     * @param[in] input The string to be tokenized
     * @param[in] delimiters A string containing all the token delimiters
     *            (hence, each delimiter can only be a single character)
     */
    Tokenizer(const std::string &input, const std::string &delimiters=defaultDelimiters);
    ~Tokenizer() {};

    /** Return the next token or an empty string if no more token
     *
     * Multiple consecutive delimiters are considered as a single one - i.e.
     * "a     bc d   " will be tokenized as ("a", "bc", "d") if the delimiter
     * is ' '.
     */
    std::string next();

    /** Return a vector of all tokens
     */
    std::vector<std::string> split();

    /** Default list of delimiters (" \n\r\t\v\f") */
    static const std::string defaultDelimiters;

private:
    const std::string _input; //< string to be tokenized
    const std::string _delimiters; //< token delimiters

    std::string::size_type _position; //< end of the last returned token
};