tokenizer.cpp

/* $Id: tokenizer.cpp 23842 2008-02-16 08:47:16Z mordante $ */
/*
   Copyright (C) 2004 - 2008 by Philippe Plantier <ayin@anathas.org>
   Part of the Battle for Wesnoth Project http://www.wesnoth.org

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License version 2
   or at your option any later version.
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY.

   See the COPYING file for more details.
*/

//! @file serialization/tokenizer.cpp
//!

#include "global.hpp"

#include "util.hpp"
#include "serialization/tokenizer.hpp"
#include "serialization/string_utils.hpp"

#include <iostream>
#include <sstream>
tokenizer::tokenizer(std::istream& in) :
    current_(EOF),
    lineno_(1),
    textdomain_(),
    file_(),
    tokenstart_lineno_(),
    token_(),
    in_(in)
{
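    // Prime current_ with the first character of the stream.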
    next_char_fast();
}

void tokenizer::skip_comment()
{
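    // Entered with current_ on the comment marker. Consumes the comment up
    // to (but not including) the terminating newline, recognizing the
    // special #textdomain and #line comments along the way.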
    next_char_fast();
    if(current_ != '\n' && current_ != EOF) {
        if(current_ == 't') {
            // When the string 'textdomain[ |\t]' is matched, the rest of the line
            // is the textdomain to switch to. If the match fails at any point,
            // we break out of the loop and eat the rest of the line without
            // further testing.
            size_t i = 0;
            static const std::string match = "extdomain";
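            // ('t' was already matched above, hence "extdomain" without it.)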
            this->next_char_fast();
            while(current_ != '\n' && current_ != EOF) {
                if(i < 9) {
                    if(current_ != match[i]) {
                        break;
                    }
                    ++i;
                } else if(i == 9) {
                    if(current_ != ' ' && current_ != '\t') {
                        break;
                    }
                    ++i;
                    textdomain_ = "";
                } else {
                    textdomain_ += current_;
                }
                this->next_char_fast();
            }
            while(current_ != '\n' && current_ != EOF) {
                this->next_char_fast();
            }

        } else if(current_ == 'l') {
            // Basically the same as textdomain, but we match 'line[ |\t]d*[ |\t]s*',
            // where d* is the line number and s* is the file name.
            // (It inherited '*' instead of '+' from the previous implementation.)
            size_t i = 0;
            static const std::string match = "ine";
            this->next_char_fast();
            bool found = false;
            std::string lineno;
            while(current_ != '\n' && current_ != EOF) {
                if(i < 3) {
                    if(current_ != match[i]) {
                        break;
                    }
                    ++i;
                } else if(i == 3) {
                    if(current_ != ' ' && current_ != '\t') {
                        break;
                    }
                    ++i;
                } else {
                    if(!found) {
                        if(current_ == ' ' || current_ == '\t') {
                            found = true;
                            lineno_ = lexical_cast<size_t>(lineno);
                            file_ = "";
                        } else {
                            lineno += current_;
                        }
                    } else {
                        file_ += current_;
                    }
                }
                this->next_char_fast();
            }
            while(current_ != '\n' && current_ != EOF) {
                this->next_char_fast();
            }
        } else {
            // Neither a textdomain nor a line comment; skip the rest of the line.
            while(current_ != '\n' && current_ != EOF) {
                this->next_char_fast();
            }
        }
    }
}

const token& tokenizer::next_token()
{
    token_.value = "";
    token_.leading_spaces = "";

    // Dump spaces and inlined comments
    for(;;) {
        while (is_space(current_)) {
            token_.leading_spaces += current_;
            next_char_fast();
        }
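        // 254 is the marker byte with which the preprocessor flags an
        // inlined comment (textdomain or line information).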
        if (current_ != 254)
            break;
        skip_comment();
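        // skip_comment() stops at the newline; that newline belongs to the
        // inlined comment, not the source, so undo the increment that
        // reading it will cause.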
        --lineno_;
        next_char();
    }

    if (current_ == '#')
        skip_comment();

    tokenstart_lineno_ = lineno_;

    switch(current_) {
    case EOF:
        token_.type = token::END;
        break;
    case '"':
        token_.type = token::QSTRING;
        while (1) {
            next_char();

            if(current_ == EOF) {
                token_.type = token::UNTERMINATED_QSTRING;
                break;
            }
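            // A lone quote ends the string; a doubled quote ("") is an
            // escaped literal quote.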
            if(current_ == '"' && peek_char() != '"')
                break;
            if(current_ == '"' && peek_char() == '"')
                next_char_fast();
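            // An inlined preprocessor comment may appear even inside a
            // quoted string; skip it without counting its newline.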
            if (current_ == 254) {
                skip_comment();
                --lineno_;
                continue;
            }

            token_.value += current_;
        }
        break;
    case '[': case ']': case '/': case '\n': case '=': case ',': case '+':
        token_.type = token::token_type(current_);
        token_.value = current_;
        break;
    default:
        if(is_alnum(current_)) {
            token_.type = token::STRING;
            token_.value += current_;
            while(is_alnum(peek_char())) {
                next_char_fast();
                token_.value += current_;
            }
        } else {
            token_.type = token::MISC;
            token_.value += current_;
        }
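        // A lone underscore introduces a translatable string in WML.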
        if(token_.value == "_")
            token_.type = token::token_type('_');
    }

    if(current_ != EOF)
        next_char();

    return token_;
}

const token& tokenizer::current_token() const
{
    return token_;
}


bool tokenizer::is_space(const int c) const
{
    return c == ' ' || c == '\t';
}

bool tokenizer::is_alnum(const int c) const
{
    return (c >= 'a' && c <= 'z')
        || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_';
}

std::string tokenizer::get_line() const
{
    std::ostringstream s;
    s << tokenstart_lineno_ << ' ' << file_;
    return s.str();
}

std::string& tokenizer::textdomain()
{
    return textdomain_;
}
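
To make the interface above concrete, here is a minimal usage sketch. It is
not part of tokenizer.cpp: it assumes only the token and tokenizer
declarations from serialization/tokenizer.hpp as used above, and the WML
fragment fed in is invented for illustration.

// Illustrative only: tokenize a small WML fragment and print each token.
#include <iostream>
#include <sstream>

#include "serialization/tokenizer.hpp"

int main()
{
    std::istringstream in("[unit]\nname = \"Konrad\"\n[/unit]\n");
    tokenizer tok(in);

    for(;;) {
        const token& t = tok.next_token();
        if(t.type == token::END)
            break;
        // get_line() yields "<line> <file>" for the current token.
        std::cout << "at " << tok.get_line()
                  << " -> '" << t.value << "'\n";
    }
    return 0;
}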
