formula_tokenizer.cpp

Go to the documentation of this file.
00001 /* $Id: formula_tokenizer.cpp 52533 2012-01-07 02:35:17Z shadowmaster $ */
00002 /*
00003    Copyright (C) 2007 - 2012 by David White <dave.net>
00004    Part of the Silver Tree Project
00005 
00006    This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.
00008    This program is distributed in the hope that it will be useful,
00009    but WITHOUT ANY WARRANTY.
00010 
00011    See the COPYING file for more details.
00012 */
00013 
#include <cassert>
#include <cstddef>
#include <sstream>

#include "foreach.hpp"
#include "formula_tokenizer.hpp"
00018 
00019 namespace formula_tokenizer
00020 {
00021 
00022 namespace {
00023 
00024 void raise_exception(iterator& i1, iterator i2, std::string str) {
00025     std::ostringstream expr;
00026     while( (i1 != i2) && (*i1 != '\n') ) {
00027         if( (*i1 != '\t') )
00028             expr << *i1;
00029         ++i1;
00030     }
00031 
00032     if( str.empty() )
00033         throw token_error("Unrecognized token", expr.str() );
00034     else
00035         throw token_error(str, expr.str() );
00036 }
00037 
00038 }
00039 
00040 token get_token(iterator& i1, iterator i2) {
00041 
00042     iterator it = i1;
00043     if( *i1 >= 'A' ) {
00044         //current character is >= 'A', limit search to the upper-half of the ASCII table
00045 
00046         // check if we parse now TOKEN_IDENTIFIER or TOKEN_OPERATOR/KEYWORD based on string
00047         if( *i1 <= 'Z' || ( *i1 >= 'a' && *it <= 'z' ) || *i1 == '_' ) {
00048 
00049             while( i1 != i2 && ( ( *i1 >= 'a' && *i1 <= 'z' ) || *i1 == '_' || ( *i1 >= 'A' && *i1 <= 'Z' ) ) )
00050                 ++i1;
00051 
00052             int diff = i1 - it;
00053             TOKEN_TYPE t = TOKEN_IDENTIFIER;
00054 
00055             //check if this string matches any keyword or an operator
00056             //possible opearators and keywords:
00057             // d, or, def, and, not, fai, where, faiend, functions
00058             if( diff == 1 ) {
00059                 if( *it == 'd' )
00060                     t = TOKEN_OPERATOR;
00061             } else if( diff == 2 ) {
00062                 if( *it == 'o' && *(it+1) == 'r' )
00063                     t = TOKEN_OPERATOR;
00064             } else if( diff == 3 ) {
00065                 if( *it == 'd' ) { //def
00066                     if( *(it+1) == 'e' && *(it+2) == 'f' )
00067                         t = TOKEN_KEYWORD;
00068                 } else if( *it == 'a' ) { //and
00069                     if( *(it+1) == 'n' && *(it+2) == 'd' )
00070                         t = TOKEN_OPERATOR;
00071                 } else if( *it == 'n' ) { //not
00072                     if( *(it+1) == 'o' && *(it+2) == 't' )
00073                         t = TOKEN_OPERATOR;
00074                 } else if( *it == 'f' ) { //fai
00075                     if( *(it+1) == 'a' && *(it+2) == 'i' )
00076                         t = TOKEN_KEYWORD;
00077                 }
00078             } else if( diff == 5 ) {
00079                 std::string s(it, i1);
00080                 if( s == "where" )
00081                     t = TOKEN_OPERATOR;
00082             } else if( diff == 6 ) {
00083                 std::string s(it, i1);
00084                 if( s == "faiend" )
00085                     t = TOKEN_KEYWORD;
00086             } else if( diff == 9 ) {
00087                 std::string s(it, i1);
00088                 if( s == "functions" )
00089                     t = TOKEN_KEYWORD;
00090             }
00091 
00092             return token( it, i1, t);
00093         } else {
00094             //at this point only 3 chars left to check:
00095             if( *i1 == '[' )
00096                 return token( it, ++i1, TOKEN_LSQUARE );
00097 
00098             if( *i1 == ']' )
00099                 return token( it, ++i1, TOKEN_RSQUARE );
00100 
00101             if( *i1 == '^' )
00102                 return token( it, ++i1, TOKEN_OPERATOR );
00103 
00104         }
00105     } else {
00106         //limit search to the lower-half of the ASCII table
00107         //start by checking for whitespaces/end of line char
00108         if( *i1 <= ' ' ) {
00109             if( *i1 == '\n' ) {
00110                 return token( it, ++i1, TOKEN_EOL);
00111             } else {
00112 
00113                 while( i1 != i2 && *i1 <= ' ' && *i1 != '\n' )
00114                     ++i1;
00115 
00116                 return token( it, i1, TOKEN_WHITESPACE );
00117             }
00118         //try to further limit number of characters that we need to check:
00119         } else if ( *i1 >= '0' ){
00120             //current character is between '0' and '@'
00121             if( *i1 <= '9' ) {
00122                 //we parse integer or decimal number
00123                 ++i1;
00124                 bool dot = false;
00125 
00126                 while( i1 != i2 ) {
00127                     if( *i1 >= '0' && *i1 <= '9' ) {
00128                         //do nothing
00129                     } else {
00130                         //look for '.' in case of decimal numer
00131                         if( *i1 == '.' ) {
00132                             //allow only one dot in such expression
00133                             if( !dot )
00134                                 dot = true;
00135                             else
00136                                 raise_exception(it, i2, "Multiple dots near decimal expression");
00137                         } else
00138                             break;
00139                     }
00140                     ++i1;
00141                 }
00142 
00143                 if( dot )
00144                     return token( it, i1, TOKEN_DECIMAL );
00145                 else
00146                     return token( it, i1, TOKEN_INTEGER );
00147 
00148             } else {
00149                 //current character is between ':' and '@'
00150                 //possible tokens at this point that we are intersted with:
00151                 // ; < = > <= >=
00152 
00153                 if( *i1 == ';' ) {
00154                     return token( it, ++i1, TOKEN_SEMICOLON);
00155                 } else if( *i1 == '=' ) {
00156                     return token( it, ++i1, TOKEN_OPERATOR);
00157                 } else if( *i1 == '<' ) {
00158                     ++i1;
00159                     if( i1 != i2 ) {
00160                         if( *i1 == '=' )
00161                             return token( it, ++i1, TOKEN_OPERATOR);
00162                         else
00163                             return token( it, i1, TOKEN_OPERATOR);
00164                     } else
00165                         return token( it, i1, TOKEN_OPERATOR);
00166                 } else if( *i1 == '>' ) {
00167                     ++i1;
00168                     if( i1 != i2 ) {
00169                         if( *i1 == '=' )
00170                             return token( it, ++i1, TOKEN_OPERATOR);
00171                         else
00172                             return token( it, i1, TOKEN_OPERATOR);
00173                     } else
00174                         return token( it, i1, TOKEN_OPERATOR);
00175                 }
00176             }
00177         //current character is between '!' and '/'
00178         } else if ( *i1 == ',' ) {
00179             return token( it, ++i1, TOKEN_COMMA);
00180 
00181         } else if ( *i1 == '.' ) {
00182             ++i1;
00183 
00184             if( i1 != i2 ) {
00185                 if( *i1 == '+' || *i1 == '-' || *i1 == '*' || *i1 == '/')
00186                     return token( it, ++i1, TOKEN_OPERATOR );
00187                 else
00188                     return token( it, i1, TOKEN_OPERATOR );
00189             } else {
00190                 return token( it, i1, TOKEN_OPERATOR);
00191             }
00192 
00193         } else if ( *i1 == '(' ) {
00194             return token( it, ++i1, TOKEN_LPARENS);
00195 
00196         } else if ( *i1 == ')' ) {
00197             return token( it, ++i1, TOKEN_RPARENS);
00198 
00199         } else if ( *i1 == '\'' ) {
00200             ++i1;
00201             while( i1 != i2 && *i1 != '\'' )
00202                 ++i1;
00203 
00204             if( i1 != i2 ) {
00205                 return token( it, ++i1, TOKEN_STRING_LITERAL );
00206             } else {
00207                 raise_exception(it, i2, "Missing closing ' for formula string");
00208             }
00209 
00210         } else if ( *i1 == '#' ) {
00211             ++i1;
00212             while( i1 != i2 && *i1 != '#' )
00213                 ++i1;
00214 
00215             if( i1 != i2 ) {
00216                 return token( it, ++i1, TOKEN_COMMENT );
00217             } else {
00218                 raise_exception(it, i2, "Missing closing # for formula comment");
00219             }
00220 
00221         } else if ( *i1 == '+' ) {
00222             return token( it, ++i1, TOKEN_OPERATOR);
00223 
00224         } else if ( *i1 == '-' ) {
00225             ++i1;
00226 
00227             if( i1 != i2 ) {
00228                 if( *i1 == '>' )
00229                     return token( it, ++i1, TOKEN_POINTER );
00230                 else
00231                     return token( it, i1, TOKEN_OPERATOR );
00232             } else {
00233                 return token( it, i1, TOKEN_OPERATOR);
00234             }
00235 
00236         } else if ( *i1 == '*' ) {
00237             return token( it, ++i1, TOKEN_OPERATOR);
00238 
00239         } else if ( *i1 == '/' ) {
00240             return token( it, ++i1, TOKEN_OPERATOR);
00241 
00242         } else if ( *i1 == '%' ) {
00243             return token( it, ++i1, TOKEN_OPERATOR);
00244 
00245         } else if ( *i1 == '!' ) {
00246             ++i1;
00247             if( *i1 == '=' )
00248                 return token( it, ++i1, TOKEN_OPERATOR);
00249             else
00250                 raise_exception(it, i2, std::string() );
00251         }
00252     }
00253     raise_exception(it, i2, std::string() );
00254     return token();
00255 }
00256 
00257 }
00258 
00259 #ifdef UNIT_TEST_TOKENIZER
00260 
00261 int main()
00262 {
00263     using namespace formula_tokenizer;
00264     std::string test = "(abc + 4 * (5+3))^2";
00265     std::string::const_iterator i1 = test.begin();
00266     std::string::const_iterator i2 = test.end();
00267     TOKEN_TYPE types[] = {TOKEN_LPARENS, TOKEN_IDENTIFIER,
00268                           TOKEN_WHITESPACE, TOKEN_OPERATOR,
00269                           TOKEN_WHITESPACE, TOKEN_INTEGER,
00270                           TOKEN_WHITESPACE, TOKEN_OPERATOR,
00271                           TOKEN_WHITESPACE, TOKEN_LPARENS,
00272                           TOKEN_INTEGER, TOKEN_OPERATOR,
00273                           TOKEN_INTEGER, TOKEN_RPARENS,
00274                           TOKEN_RPARENS, TOKEN_KEYWORD,
00275                           TOKEN_OPERATOR, TOKEN_INTEGER};
00276     std::string tokens[] = {"(", "abc", " ", "+", " ", "4", " ",
00277                             "*", " ", "(", "5", "+", "3", ")", ")", "functions"};
00278     for(int n = 0; n != sizeof(types)/sizeof(*types); ++n) {
00279         token t = get_token(i1,i2);
00280         assert(std::string(t.begin,t.end) == tokens[n]);
00281         assert(t.type == types[n]);
00282 
00283     }
00284     return 0;
00285 }
00286 
00287 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

Generated by doxygen 1.7.1 on Fri May 25 2012 01:02:51 for The Battle for Wesnoth
Gna! | Forum | Wiki | CIA | devdocs