/* $Id: tokenizer.cpp 23842 2008-02-16 08:47:16Z mordante $ */
/*
   Copyright (C) 2004 - 2008 by Philippe Plantier <ayin@anathas.org>
   Part of the Battle for Wesnoth Project http://www.wesnoth.org

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License version 2
   or at your option any later version.
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY.

   See the COPYING file for more details.
*/

//! @file serialization/tokenizer.cpp
//!

#include "global.hpp"

#include "util.hpp"
#include "serialization/tokenizer.hpp"
#include "serialization/string_utils.hpp"

#include <iostream>
#include <sstream>

//! Construct a tokenizer reading from @a in.
//! Starts at line 1 and primes current_ with the first character of the
//! stream (current_ is initialized to EOF, then next_char_fast() — declared
//! elsewhere — loads the first input character).
tokenizer::tokenizer(std::istream& in) :
	current_(EOF),
	lineno_(1),
	textdomain_(),
	file_(),
	tokenstart_lineno_(),
	token_(),
	in_(in)
{
	next_char_fast();
}

//! Consume a comment, which runs to the end of the current line.
//!
//! Called with current_ on the character introducing the comment body (the
//! first call to next_char_fast() below steps past it). Two comment forms
//! carry metadata and update tokenizer state as a side effect:
//!   - "#textdomain <name>"      -> stores <name> into textdomain_
//!   - "#line <num> <file>"      -> sets lineno_ from <num> and file_ to <file>
//! Any other comment is skipped without effect. In every case the rest of
//! the line is eaten; the trailing '\n' (or EOF) is left in current_.
void tokenizer::skip_comment()
{
	next_char_fast();
	if(current_ != '\n' && current_ != EOF) {
		if(current_ == 't') {
			// When the string 'textdomain[ |\t] is matched the rest of the line is
			// the textdomain to switch to. If we at any point fail to match we break
			// out of the loop and eat the rest of the line without testing.
			// (The leading 't' was already consumed, hence matching "extdomain".)
			size_t i = 0;
			static const std::string match = "extdomain";
			this->next_char_fast();
			while(current_ != '\n' && current_ != EOF) {
				if(i < 9) {
					// Still matching the literal keyword, one character per pass.
					if(current_ != match[i]) {
						break;
					}
					++i;
				} else if(i == 9) {
					// Keyword matched; require one space or tab separator,
					// then reset textdomain_ before accumulating the new name.
					if(current_ != ' ' && current_ != '\t') {
						break;
					}
					++i;
					textdomain_ = "";
				} else {
					// Everything after the separator is the domain name.
					textdomain_ += current_;
				}
				this->next_char_fast();
			}
			// Eat whatever remains of the line (no-op if the match succeeded).
			while(current_ != '\n' && current_ != EOF) {
				this->next_char_fast();
			}

		} else if(current_ == 'l') {
			// Basically the same as textdomain but we match 'line[ |\t]d*[ |\t]s*
			// d* is the line number
			// s* is the file name
			// It inherited the * instead of + from the previous implementation.
			size_t i = 0;
			static const std::string match = "ine";
			this->next_char_fast();
			bool found = false;      // true once the number/filename separator is seen
			std::string lineno;      // digits accumulated before the separator
			while(current_ != '\n' && current_ != EOF) {
				if(i < 3) {
					// Matching the remainder of the keyword "line".
					if(current_ != match[i]) {
						break;
					}
					++i;
				} else if(i == 3) {
					// Separator between "line" and the number.
					if(current_ != ' ' && current_ != '\t') {
						break;
					}
					++i;
				} else {
					if(!found) {
						if(current_ == ' ' || current_ == '\t') {
							// End of the number: commit it and start the filename.
							// NOTE(review): lexical_cast (from util.hpp) presumably
							// throws on a malformed number — confirm callers cope.
							found = true;
							lineno_ = lexical_cast<size_t>(lineno);
							file_ = "";
						} else {
							lineno += current_;
						}
					} else {
						// After the second separator: the rest is the file name.
						file_ += current_;
					}
				}
				this->next_char_fast();
			}
			// Eat whatever remains of the line.
			while(current_ != '\n' && current_ != EOF) {
				this->next_char_fast();
			}
		} else {
			// Neither a textdomain or line comment skip it.
			while(current_ != '\n' && current_ != EOF) {
				this->next_char_fast();
			}
		}
	}
}

//! Scan and return the next token from the stream.
//!
//! Fills token_ (value, leading_spaces, type) and records the line the token
//! started on in tokenstart_lineno_. Returns a reference to the member
//! token_, so the result is invalidated by the next call.
const token& tokenizer::next_token()
{
	token_.value = "";
	token_.leading_spaces = "";

	// Dump spaces and inlined comments
	// NOTE(review): 254 appears to be a sentinel byte marking an inlined
	// comment inserted by an earlier processing stage (it is treated exactly
	// like '#', with --lineno_ compensating for a line counted twice) —
	// confirm against the preprocessor that emits it.
	for(;;) {
		while (is_space(current_)) {
			token_.leading_spaces += current_;
			next_char_fast();
		}
		if (current_ != 254)
			break;
		skip_comment();
		--lineno_;
		next_char();
	}

	// An ordinary comment: consume it; current_ ends on '\n' or EOF,
	// which the switch below then tokenizes.
	if (current_ == '#')
		skip_comment();

	tokenstart_lineno_ = lineno_;

	switch(current_) {
	case EOF:
		token_.type = token::END;
		break;
	case '"':
		// Quoted string. A doubled "" inside the quotes is an escaped quote;
		// a single " terminates the token. EOF inside the quotes yields
		// UNTERMINATED_QSTRING with the partial value kept.
		token_.type = token::QSTRING;
		while (1) {
			next_char();

			if(current_ == EOF) {
				token_.type = token::UNTERMINATED_QSTRING;
				break;
			}
			if(current_ == '"' && peek_char() != '"')
				break;
			if(current_ == '"' && peek_char() == '"')
				next_char_fast();    // skip the second quote of the "" pair
			if (current_ == 254) {
				// Inlined comment inside a quoted string: process and resume.
				skip_comment();
				--lineno_;
				continue;
			}

			token_.value += current_;
		};
		break;
	case '[': case ']': case '/': case '\n': case '=': case ',': case '+':
		// Single-character structural tokens: the token type is the
		// character's own code.
		token_.type = token::token_type(current_);
		token_.value = current_;
		break;
	default:
		if(is_alnum(current_)) {
			// Unquoted word: greedily take [A-Za-z0-9_]+ using peek_char()
			// so the first non-word character is left for the next call.
			token_.type = token::STRING;
			token_.value += current_;
			while(is_alnum(peek_char())) {
				next_char_fast();
				token_.value += current_;
			}
		} else {
			token_.type = token::MISC;
			token_.value += current_;
		}
		// A lone underscore is the translation marker and gets its own type.
		if(token_.value == "_")
			token_.type = token::token_type('_');
	}

	// Advance past the last character of the token (unless at EOF).
	if(current_ != EOF)
		next_char();

	return token_;
}

//! Return the token produced by the most recent next_token() call.
const token& tokenizer::current_token() const
{
	return token_;
}

//! True for the horizontal whitespace characters (space and tab) that are
//! collected into token::leading_spaces. Newline is NOT a space here —
//! it is a structural token in next_token().
bool tokenizer::is_space(const int c) const
{
	return c == ' ' || c == '\t';
}

//! True for characters allowed in an unquoted STRING token:
//! ASCII letters, digits, and underscore.
bool tokenizer::is_alnum(const int c) const
{
	return (c >= 'a' && c <= 'z')
		|| (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_';
}

//! Return "<line> <file>" for the start of the current token, using the
//! line/file last set by a "#line" comment (see skip_comment). Used for
//! error reporting.
std::string tokenizer::get_line() const
{
	std::ostringstream s;
	s << tokenstart_lineno_ << ' ' << file_;
	return s.str();
}

//! Mutable access to the current textdomain (last "#textdomain" seen).
std::string& tokenizer::textdomain()
{
	return textdomain_;
}
// (Stray doxygen HTML footer from the scraped page — "Generated by doxygen
// 1.5.5 on 23 May 2008" — neutralized as a comment; it is not part of the
// original source file.)