00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
#include <cassert>
#include <cstddef>
#include <iostream>
#include <string>

#include <boost/regex.hpp>

#include "foreach.hpp"
#include "formula_tokenizer.hpp"
00020
00021 namespace formula_tokenizer
00022 {
00023
00024 namespace {
00025
00026 using boost::regex;
00027
00028 struct token_type {
00029 regex re;
00030 TOKEN_TYPE type;
00031 };
00032
00033
00034 token_type token_types[] = { { regex("^(not\\b|and\\b|or\\b|where\\b|d(?=[^a-zA-Z])|\\*|\\+|-(?=[^>])|\\^|%|/|<=|>=|<|>|!=|=|\\.)"), TOKEN_OPERATOR },
00035 { regex("^functions\\b"), TOKEN_KEYWORD },
00036 { regex("^def\\b"), TOKEN_KEYWORD },
00037 { regex("^'[^']*'"), TOKEN_STRING_LITERAL },
00038 { regex("^[a-zA-Z_]+"), TOKEN_IDENTIFIER },
00039 { regex("^\\d+"), TOKEN_INTEGER },
00040 { regex("^\\("), TOKEN_LPARENS },
00041 { regex("^\\)"), TOKEN_RPARENS },
00042 { regex("^\\["), TOKEN_LSQUARE },
00043 { regex("^\\]"), TOKEN_RSQUARE },
00044 { regex("^\\{"), TOKEN_LBRACKET },
00045 { regex("^\\}"), TOKEN_RBRACKET },
00046 { regex("^#.*?#"), TOKEN_COMMENT },
00047 { regex("^,"), TOKEN_COMMA },
00048 { regex("^;"), TOKEN_SEMICOLON },
00049 { regex("^\\s+"), TOKEN_WHITESPACE },
00050 { regex("^->"), TOKEN_POINTER }
00051 };
00052
00053 }
00054
00055 token get_token(iterator& i1, iterator i2) {
00056 foreach(const token_type& t, token_types) {
00057 boost::smatch match;
00058 if(boost::regex_search(i1, i2, match, t.re, boost::match_single_line)) {
00059 token res;
00060 res.type = t.type;
00061 res.begin = i1;
00062 i1 = res.end = i1 + match.length();
00063
00064 return res;
00065 }
00066 }
00067
00068 std::cerr << "Unrecognized token: '" << std::string(i1,i2) << "'\n";
00069 throw token_error();
00070 }
00071
00072 }
00073
00074 #ifdef UNIT_TEST_TOKENIZER
00075
00076 int main()
00077 {
00078 using namespace formula_tokenizer;
00079 std::string test = "(abc + 4 * (5+3))^2";
00080 std::string::const_iterator i1 = test.begin();
00081 std::string::const_iterator i2 = test.end();
00082 TOKEN_TYPE types[] = {TOKEN_LPARENS, TOKEN_IDENTIFIER,
00083 TOKEN_WHITESPACE, TOKEN_OPERATOR,
00084 TOKEN_WHITESPACE, TOKEN_INTEGER,
00085 TOKEN_WHITESPACE, TOKEN_OPERATOR,
00086 TOKEN_WHITESPACE, TOKEN_LPARENS,
00087 TOKEN_INTEGER, TOKEN_OPERATOR,
00088 TOKEN_INTEGER, TOKEN_RPARENS,
00089 TOKEN_RPARENS, TOKEN_KEYWORD,
00090 TOKEN_OPERATOR, TOKEN_INTEGER};
00091 std::string tokens[] = {"(", "abc", " ", "+", " ", "4", " ",
00092 "*", " ", "(", "5", "+", "3", ")", ")", "functions"};
00093 for(int n = 0; n != sizeof(types)/sizeof(*types); ++n) {
00094 token t = get_token(i1,i2);
00095 assert(std::string(t.begin,t.end) == tokens[n]);
00096 assert(t.type == types[n]);
00097
00098 }
00099 }
00100
00101 #endif