00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014 #include <sstream>
00015
00016 #include "foreach.hpp"
00017 #include "formula_tokenizer.hpp"
00018
00019 namespace formula_tokenizer
00020 {
00021
00022 namespace {
00023
00024 void raise_exception(iterator& i1, iterator i2, std::string str) {
00025 std::ostringstream expr;
00026 while( (i1 != i2) && (*i1 != '\n') ) {
00027 if( (*i1 != '\t') )
00028 expr << *i1;
00029 ++i1;
00030 }
00031
00032 if( str.empty() )
00033 throw token_error("Unrecognized token", expr.str() );
00034 else
00035 throw token_error(str, expr.str() );
00036 }
00037
00038 }
00039
00040 token get_token(iterator& i1, iterator i2) {
00041
00042 iterator it = i1;
00043 if( *i1 >= 'A' ) {
00044
00045
00046
00047 if( *i1 <= 'Z' || ( *i1 >= 'a' && *it <= 'z' ) || *i1 == '_' ) {
00048
00049 while( i1 != i2 && ( ( *i1 >= 'a' && *i1 <= 'z' ) || *i1 == '_' || ( *i1 >= 'A' && *i1 <= 'Z' ) ) )
00050 ++i1;
00051
00052 int diff = i1 - it;
00053 TOKEN_TYPE t = TOKEN_IDENTIFIER;
00054
00055
00056
00057
00058 if( diff == 1 ) {
00059 if( *it == 'd' )
00060 t = TOKEN_OPERATOR;
00061 } else if( diff == 2 ) {
00062 if( *it == 'o' && *(it+1) == 'r' )
00063 t = TOKEN_OPERATOR;
00064 } else if( diff == 3 ) {
00065 if( *it == 'd' ) {
00066 if( *(it+1) == 'e' && *(it+2) == 'f' )
00067 t = TOKEN_KEYWORD;
00068 } else if( *it == 'a' ) {
00069 if( *(it+1) == 'n' && *(it+2) == 'd' )
00070 t = TOKEN_OPERATOR;
00071 } else if( *it == 'n' ) {
00072 if( *(it+1) == 'o' && *(it+2) == 't' )
00073 t = TOKEN_OPERATOR;
00074 } else if( *it == 'f' ) {
00075 if( *(it+1) == 'a' && *(it+2) == 'i' )
00076 t = TOKEN_KEYWORD;
00077 }
00078 } else if( diff == 5 ) {
00079 std::string s(it, i1);
00080 if( s == "where" )
00081 t = TOKEN_OPERATOR;
00082 } else if( diff == 6 ) {
00083 std::string s(it, i1);
00084 if( s == "faiend" )
00085 t = TOKEN_KEYWORD;
00086 } else if( diff == 9 ) {
00087 std::string s(it, i1);
00088 if( s == "functions" )
00089 t = TOKEN_KEYWORD;
00090 }
00091
00092 return token( it, i1, t);
00093 } else {
00094
00095 if( *i1 == '[' )
00096 return token( it, ++i1, TOKEN_LSQUARE );
00097
00098 if( *i1 == ']' )
00099 return token( it, ++i1, TOKEN_RSQUARE );
00100
00101 if( *i1 == '^' )
00102 return token( it, ++i1, TOKEN_OPERATOR );
00103
00104 }
00105 } else {
00106
00107
00108 if( *i1 <= ' ' ) {
00109 if( *i1 == '\n' ) {
00110 return token( it, ++i1, TOKEN_EOL);
00111 } else {
00112
00113 while( i1 != i2 && *i1 <= ' ' && *i1 != '\n' )
00114 ++i1;
00115
00116 return token( it, i1, TOKEN_WHITESPACE );
00117 }
00118
00119 } else if ( *i1 >= '0' ){
00120
00121 if( *i1 <= '9' ) {
00122
00123 ++i1;
00124 bool dot = false;
00125
00126 while( i1 != i2 ) {
00127 if( *i1 >= '0' && *i1 <= '9' ) {
00128
00129 } else {
00130
00131 if( *i1 == '.' ) {
00132
00133 if( !dot )
00134 dot = true;
00135 else
00136 raise_exception(it, i2, "Multiple dots near decimal expression");
00137 } else
00138 break;
00139 }
00140 ++i1;
00141 }
00142
00143 if( dot )
00144 return token( it, i1, TOKEN_DECIMAL );
00145 else
00146 return token( it, i1, TOKEN_INTEGER );
00147
00148 } else {
00149
00150
00151
00152
00153 if( *i1 == ';' ) {
00154 return token( it, ++i1, TOKEN_SEMICOLON);
00155 } else if( *i1 == '=' ) {
00156 return token( it, ++i1, TOKEN_OPERATOR);
00157 } else if( *i1 == '<' ) {
00158 ++i1;
00159 if( i1 != i2 ) {
00160 if( *i1 == '=' )
00161 return token( it, ++i1, TOKEN_OPERATOR);
00162 else
00163 return token( it, i1, TOKEN_OPERATOR);
00164 } else
00165 return token( it, i1, TOKEN_OPERATOR);
00166 } else if( *i1 == '>' ) {
00167 ++i1;
00168 if( i1 != i2 ) {
00169 if( *i1 == '=' )
00170 return token( it, ++i1, TOKEN_OPERATOR);
00171 else
00172 return token( it, i1, TOKEN_OPERATOR);
00173 } else
00174 return token( it, i1, TOKEN_OPERATOR);
00175 }
00176 }
00177
00178 } else if ( *i1 == ',' ) {
00179 return token( it, ++i1, TOKEN_COMMA);
00180
00181 } else if ( *i1 == '.' ) {
00182 ++i1;
00183
00184 if( i1 != i2 ) {
00185 if( *i1 == '+' || *i1 == '-' || *i1 == '*' || *i1 == '/')
00186 return token( it, ++i1, TOKEN_OPERATOR );
00187 else
00188 return token( it, i1, TOKEN_OPERATOR );
00189 } else {
00190 return token( it, i1, TOKEN_OPERATOR);
00191 }
00192
00193 } else if ( *i1 == '(' ) {
00194 return token( it, ++i1, TOKEN_LPARENS);
00195
00196 } else if ( *i1 == ')' ) {
00197 return token( it, ++i1, TOKEN_RPARENS);
00198
00199 } else if ( *i1 == '\'' ) {
00200 ++i1;
00201 while( i1 != i2 && *i1 != '\'' )
00202 ++i1;
00203
00204 if( i1 != i2 ) {
00205 return token( it, ++i1, TOKEN_STRING_LITERAL );
00206 } else {
00207 raise_exception(it, i2, "Missing closing ' for formula string");
00208 }
00209
00210 } else if ( *i1 == '#' ) {
00211 ++i1;
00212 while( i1 != i2 && *i1 != '#' )
00213 ++i1;
00214
00215 if( i1 != i2 ) {
00216 return token( it, ++i1, TOKEN_COMMENT );
00217 } else {
00218 raise_exception(it, i2, "Missing closing # for formula comment");
00219 }
00220
00221 } else if ( *i1 == '+' ) {
00222 return token( it, ++i1, TOKEN_OPERATOR);
00223
00224 } else if ( *i1 == '-' ) {
00225 ++i1;
00226
00227 if( i1 != i2 ) {
00228 if( *i1 == '>' )
00229 return token( it, ++i1, TOKEN_POINTER );
00230 else
00231 return token( it, i1, TOKEN_OPERATOR );
00232 } else {
00233 return token( it, i1, TOKEN_OPERATOR);
00234 }
00235
00236 } else if ( *i1 == '*' ) {
00237 return token( it, ++i1, TOKEN_OPERATOR);
00238
00239 } else if ( *i1 == '/' ) {
00240 return token( it, ++i1, TOKEN_OPERATOR);
00241
00242 } else if ( *i1 == '%' ) {
00243 return token( it, ++i1, TOKEN_OPERATOR);
00244
00245 } else if ( *i1 == '!' ) {
00246 ++i1;
00247 if( *i1 == '=' )
00248 return token( it, ++i1, TOKEN_OPERATOR);
00249 else
00250 raise_exception(it, i2, std::string() );
00251 }
00252 }
00253 raise_exception(it, i2, std::string() );
00254 return token();
00255 }
00256
00257 }
00258
00259 #ifdef UNIT_TEST_TOKENIZER
00260
00261 int main()
00262 {
00263 using namespace formula_tokenizer;
00264 std::string test = "(abc + 4 * (5+3))^2";
00265 std::string::const_iterator i1 = test.begin();
00266 std::string::const_iterator i2 = test.end();
00267 TOKEN_TYPE types[] = {TOKEN_LPARENS, TOKEN_IDENTIFIER,
00268 TOKEN_WHITESPACE, TOKEN_OPERATOR,
00269 TOKEN_WHITESPACE, TOKEN_INTEGER,
00270 TOKEN_WHITESPACE, TOKEN_OPERATOR,
00271 TOKEN_WHITESPACE, TOKEN_LPARENS,
00272 TOKEN_INTEGER, TOKEN_OPERATOR,
00273 TOKEN_INTEGER, TOKEN_RPARENS,
00274 TOKEN_RPARENS, TOKEN_KEYWORD,
00275 TOKEN_OPERATOR, TOKEN_INTEGER};
00276 std::string tokens[] = {"(", "abc", " ", "+", " ", "4", " ",
00277 "*", " ", "(", "5", "+", "3", ")", ")", "functions"};
00278 for(int n = 0; n != sizeof(types)/sizeof(*types); ++n) {
00279 token t = get_token(i1,i2);
00280 assert(std::string(t.begin,t.end) == tokens[n]);
00281 assert(t.type == types[n]);
00282
00283 }
00284 return 0;
00285 }
00286
00287 #endif