From 99b3e852e553103ff75ed38117baa5bce2befbdb Mon Sep 17 00:00:00 2001 From: David Wagner Date: Thu, 19 Mar 2015 16:35:47 +0100 Subject: Re-implement the Tokenizer class from scratch It didn't have a license header. Even though we received explicit authorization to use it, it wasn't a comfortable situation. This is an original implementation that only keeps the APIs we use in the parameter-framework and keep them identical (except for a small exception). The behaviour is also exactly the same. Change-Id: I85a69c76027ee026a693d79cd19edd3b86796f9d Signed-off-by: David Wagner --- utility/Tokenizer.cpp | 200 +++++++++++++++++++------------------------------- utility/Tokenizer.h | 129 ++++++++++++++++++-------------- 2 files changed, 149 insertions(+), 180 deletions(-) (limited to 'utility') diff --git a/utility/Tokenizer.cpp b/utility/Tokenizer.cpp index 9ea4ea4..a4cfcf0 100644 --- a/utility/Tokenizer.cpp +++ b/utility/Tokenizer.cpp @@ -1,125 +1,75 @@ -/////////////////////////////////////////////////////////////////////////////// -// Tokenizer.cpp -// ============= -// General purpose string tokenizer (C++ string version) -// -// The default delimiters are space(" "), tab(\t, \v), newline(\n), -// carriage return(\r), and form feed(\f). -// If you want to use different delimiters, then use setDelimiter() to override -// the delimiters. Note that the delimiter string can hold multiple characters. 
-// -// AUTHOR: Song Ho Ahn (song.ahn@gmail.com) -// CREATED: 2005-05-25 -// UPDATED: 2011-03-08 -/////////////////////////////////////////////////////////////////////////////// - -#include "Tokenizer.h" - - -/////////////////////////////////////////////////////////////////////////////// -// constructor -/////////////////////////////////////////////////////////////////////////////// -Tokenizer::Tokenizer() : buffer(""), token(""), delimiter(DEFAULT_DELIMITER) -{ - currPos = buffer.begin(); -} - -Tokenizer::Tokenizer(const std::string& str, const std::string& delimiter) : buffer(str), token(""), delimiter(delimiter) -{ - currPos = buffer.begin(); -} - - - -/////////////////////////////////////////////////////////////////////////////// -// destructor -/////////////////////////////////////////////////////////////////////////////// -Tokenizer::~Tokenizer() -{ -} - - - -/////////////////////////////////////////////////////////////////////////////// -// reset string buffer, delimiter and the currsor position -/////////////////////////////////////////////////////////////////////////////// -void Tokenizer::set(const std::string& str, const std::string& delimiter) -{ - this->buffer = str; - this->delimiter = delimiter; - this->currPos = buffer.begin(); -} - -void Tokenizer::setString(const std::string& str) -{ - this->buffer = str; - this->currPos = buffer.begin(); -} - -void Tokenizer::setDelimiter(const std::string& delimiter) -{ - this->delimiter = delimiter; - this->currPos = buffer.begin(); -} - - - -/////////////////////////////////////////////////////////////////////////////// -// return the next token -// If cannot find a token anymore, return "". 
-/////////////////////////////////////////////////////////////////////////////// -std::string Tokenizer::next() -{ - if(buffer.size() <= 0) return ""; // skip if buffer is empty - - token.clear(); // reset token string - - this->skipDelimiter(); // skip leading delimiters - - // append each char to token string until it meets delimiter - while(currPos != buffer.end() && !isDelimiter(*currPos)) - { - token += *currPos; - ++currPos; - } - return token; -} - - - -/////////////////////////////////////////////////////////////////////////////// -// skip ang leading delimiters -/////////////////////////////////////////////////////////////////////////////// -void Tokenizer::skipDelimiter() -{ - while(currPos != buffer.end() && isDelimiter(*currPos)) - ++currPos; -} - - - -/////////////////////////////////////////////////////////////////////////////// -// return true if the current character is delimiter -/////////////////////////////////////////////////////////////////////////////// -bool Tokenizer::isDelimiter(char c) -{ - return (delimiter.find(c) != std::string::npos); -} - - - -/////////////////////////////////////////////////////////////////////////////// -// split the input string into multiple tokens -// This function scans tokens from the current cursor position. -/////////////////////////////////////////////////////////////////////////////// -std::vector<std::string> Tokenizer::split() -{ - std::vector<std::string> tokens; - std::string token; - while((token = this->next()) != "") - { - tokens.push_back(token); - } - - return tokens; -} +/* + * Copyright (c) 2015, Intel Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include "Tokenizer.h" + +using std::string; +using std::vector; + +const string Tokenizer::defaultDelimiters = " \n\r\t\v\f"; + +Tokenizer::Tokenizer(const string &input, const string &delimiters) + : _input(input), _delimiters(delimiters), _position(0) +{ +} + +string Tokenizer::next() +{ + string token; + + // Skip all leading delimiters + string::size_type tokenStart = _input.find_first_not_of(_delimiters, _position); + + // Special case if there isn't any token anymore (string::substr + // throws when pos==npos) + if (tokenStart == string::npos) { + return ""; + } + + // Starting from the token's start, find the first delimiter + string::size_type tokenEnd = _input.find_first_of(_delimiters, tokenStart); + + _position = tokenEnd; + + return _input.substr(tokenStart, tokenEnd - tokenStart); +} + +vector<string> Tokenizer::split() +{ + vector<string> result; + string token; + + while (true) { + token = next(); + if (token.empty()) { + return result; + } + result.push_back(token); + } +} diff --git a/utility/Tokenizer.h b/utility/Tokenizer.h index de3f86c..c48747a 100644 --- a/utility/Tokenizer.h +++ b/utility/Tokenizer.h @@ -1,56 +1,75 @@ -/////////////////////////////////////////////////////////////////////////////// -// Tokenizer.h -// =========== -// General purpose string tokenizer (C++ string version) -// -// The default delimiters are space(" "), tab(\t, \v), newline(\n), -// carriage return(\r), and form feed(\f). -// If you want to use different delimiters, then use setDelimiter() to override -// the delimiters. Note that the delimiter string can hold multiple characters. -// -// AUTHOR: Song Ho Ahn (song.ahn@gmail.com) -// CREATED: 2005-05-25 -// UPDATED: 2011-03-08 -/////////////////////////////////////////////////////////////////////////////// - -#ifndef TOKENIZER_H -#define TOKENIZER_H - -#include <string> +/* + * Copyright (c) 2015, Intel Corporation + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#pragma once + +#include <string> #include <vector> - -// default delimiter string (space, tab, newline, carriage return, form feed) -const std::string DEFAULT_DELIMITER = " \t\v\n\r\f"; - -class Tokenizer -{ -public: - // ctor/dtor - Tokenizer(); - Tokenizer(const std::string& str, const std::string& delimiter=DEFAULT_DELIMITER); - ~Tokenizer(); - - // set string and delimiter - void set(const std::string& str, const std::string& delimiter=DEFAULT_DELIMITER); - void setString(const std::string& str); // set source string only - void setDelimiter(const std::string& delimiter); // set delimiter string only - - std::string next(); // return the next token, return "" if it ends - - std::vector<std::string> split(); // return array of tokens from current cursor - -protected: - - -private: - void skipDelimiter(); // ignore leading delimiters - bool isDelimiter(char c); // check if the current char is delimiter - - std::string buffer; // input string - std::string token; // output string - std::string delimiter; // delimiter string - std::string::const_iterator currPos; // string iterator pointing the current position - -}; - -#endif // TOKENIZER_H + +/** Tokenizer class + * + * Must be initialized with a string to be tokenized and, optionally, a string + * of delimiters (@see Tokenizer::defaultDelimiters). + * + * Multiple consecutive delimiters (even if different) are considered as a + * single one. As a result, there can't be empty tokens. + */ +class Tokenizer +{ +public: + /** Constructs a Tokenizer + * + * @param[in] input The string to be tokenized + * @param[in] delimiters A string containing all the token delimiters + * (hence, each delimiter can only be a single character) + */ + Tokenizer(const std::string &input, const std::string &delimiters=defaultDelimiters); + ~Tokenizer() {} + + /** Return the next token or an empty string if there are no more tokens + * + * Multiple consecutive delimiters are considered as a single one - i.e. 
+ * "a  bc d " will be tokenized as ("a", "bc", "d") if the delimiter + * is ' '. + */ + std::string next(); + + /** Return a vector of all remaining tokens (advances past them) + */ + std::vector<std::string> split(); + + /** Default list of delimiters (" \n\r\t\v\f") */ + static const std::string defaultDelimiters; + +private: + const std::string _input; ///< string to be tokenized + const std::string _delimiters; ///< token delimiters + + std::string::size_type _position; ///< end of the last returned token +}; -- cgit v1.1