// lexer.hpp
//
// Copyright (c) 2001-2011 Hartmut Kaiser
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  5. #if !defined(BOOST_SPIRIT_LEX_LEXER_MAR_13_2007_0145PM)
  6. #define BOOST_SPIRIT_LEX_LEXER_MAR_13_2007_0145PM
  7. #if defined(_MSC_VER)
  8. #pragma once
  9. #endif
  10. #include <boost/spirit/home/support/info.hpp>
  11. #include <boost/spirit/home/qi/skip_over.hpp>
  12. #include <boost/spirit/home/qi/parser.hpp>
  13. #include <boost/spirit/home/qi/detail/assign_to.hpp>
  14. #include <boost/spirit/home/lex/reference.hpp>
  15. #include <boost/spirit/home/lex/meta_compiler.hpp>
  16. #include <boost/spirit/home/lex/lexer_type.hpp>
  17. #include <boost/spirit/home/lex/lexer/token_def.hpp>
  18. #include <boost/assert.hpp>
  19. #include <boost/noncopyable.hpp>
  20. #include <boost/detail/iterator.hpp>
  21. #include <boost/fusion/include/vector.hpp>
  22. #include <boost/mpl/assert.hpp>
  23. #include <boost/range/iterator_range.hpp>
  24. #include <string>
  25. namespace boost { namespace spirit { namespace lex
  26. {
  27. ///////////////////////////////////////////////////////////////////////////
  28. namespace detail
  29. {
  30. ///////////////////////////////////////////////////////////////////////
  31. template <typename LexerDef>
  32. struct lexer_def_
  33. : proto::extends<
  34. typename proto::terminal<
  35. lex::reference<lexer_def_<LexerDef> const>
  36. >::type
  37. , lexer_def_<LexerDef> >
  38. , qi::parser<lexer_def_<LexerDef> >
  39. , lex::lexer_type<lexer_def_<LexerDef> >
  40. {
  41. private:
  42. // avoid warnings about using 'this' in constructor
  43. lexer_def_& this_() { return *this; }
  44. typedef typename LexerDef::char_type char_type;
  45. typedef typename LexerDef::string_type string_type;
  46. typedef typename LexerDef::id_type id_type;
  47. typedef lex::reference<lexer_def_ const> reference_;
  48. typedef typename proto::terminal<reference_>::type terminal_type;
  49. typedef proto::extends<terminal_type, lexer_def_> proto_base_type;
  50. reference_ alias() const
  51. {
  52. return reference_(*this);
  53. }
  54. public:
  55. // Qi interface: metafunction calculating parser attribute type
  56. template <typename Context, typename Iterator>
  57. struct attribute
  58. {
  59. // the return value of a token set contains the matched token
  60. // id, and the corresponding pair of iterators
  61. typedef typename Iterator::base_iterator_type iterator_type;
  62. typedef
  63. fusion::vector2<id_type, iterator_range<iterator_type> >
  64. type;
  65. };
  66. // Qi interface: parse functionality
  67. template <typename Iterator, typename Context
  68. , typename Skipper, typename Attribute>
  69. bool parse(Iterator& first, Iterator const& last
  70. , Context& /*context*/, Skipper const& skipper
  71. , Attribute& attr) const
  72. {
  73. qi::skip_over(first, last, skipper); // always do a pre-skip
  74. if (first != last) {
  75. typedef typename
  76. boost::detail::iterator_traits<Iterator>::value_type
  77. token_type;
  78. token_type const& t = *first;
  79. if (token_is_valid(t) && t.state() == first.get_state()) {
  80. // any of the token definitions matched
  81. spirit::traits::assign_to(t, attr);
  82. ++first;
  83. return true;
  84. }
  85. }
  86. return false;
  87. }
  88. // Qi interface: 'what' functionality
  89. template <typename Context>
  90. info what(Context& /*context*/) const
  91. {
  92. return info("lexer");
  93. }
  94. private:
  95. // allow to use the lexer.self.add("regex1", id1)("regex2", id2);
  96. // syntax
  97. struct adder
  98. {
  99. adder(lexer_def_& def_)
  100. : def(def_) {}
  101. // Add a token definition based on a single character as given
  102. // by the first parameter, the second parameter allows to
  103. // specify the token id to use for the new token. If no token
  104. // id is given the character code is used.
  105. adder const& operator()(char_type c
  106. , id_type token_id = id_type()) const
  107. {
  108. if (id_type() == token_id)
  109. token_id = static_cast<id_type>(c);
  110. def.def.add_token (def.state.c_str(), c, token_id
  111. , def.targetstate.empty() ? 0 : def.targetstate.c_str());
  112. return *this;
  113. }
  114. // Add a token definition based on a character sequence as
  115. // given by the first parameter, the second parameter allows to
  116. // specify the token id to use for the new token. If no token
  117. // id is given this function will generate a unique id to be
  118. // used as the token's id.
  119. adder const& operator()(string_type const& s
  120. , id_type token_id = id_type()) const
  121. {
  122. if (id_type() == token_id)
  123. token_id = def.def.get_next_id();
  124. def.def.add_token (def.state.c_str(), s, token_id
  125. , def.targetstate.empty() ? 0 : def.targetstate.c_str());
  126. return *this;
  127. }
  128. template <typename Attribute>
  129. adder const& operator()(
  130. token_def<Attribute, char_type, id_type>& tokdef
  131. , id_type token_id = id_type()) const
  132. {
  133. // make sure we have a token id
  134. if (id_type() == token_id) {
  135. if (id_type() == tokdef.id()) {
  136. token_id = def.def.get_next_id();
  137. tokdef.id(token_id);
  138. }
  139. else {
  140. token_id = tokdef.id();
  141. }
  142. }
  143. else {
  144. // the following assertion makes sure that the token_def
  145. // instance has not been assigned a different id earlier
  146. BOOST_ASSERT(id_type() == tokdef.id()
  147. || token_id == tokdef.id());
  148. tokdef.id(token_id);
  149. }
  150. def.define(tokdef);
  151. return *this;
  152. }
  153. // template <typename F>
  154. // adder const& operator()(char_type c, id_type token_id, F act) const
  155. // {
  156. // if (id_type() == token_id)
  157. // token_id = def.def.get_next_id();
  158. // std::size_t unique_id =
  159. // def.def.add_token (def.state.c_str(), s, token_id);
  160. // def.def.add_action(unique_id, def.state.c_str(), act);
  161. // return *this;
  162. // }
  163. lexer_def_& def;
  164. private:
  165. // silence MSVC warning C4512: assignment operator could not be generated
  166. adder& operator= (adder const&);
  167. };
  168. friend struct adder;
  169. // allow to use lexer.self.add_pattern("pattern1", "regex1")(...);
  170. // syntax
  171. struct pattern_adder
  172. {
  173. pattern_adder(lexer_def_& def_)
  174. : def(def_) {}
  175. pattern_adder const& operator()(string_type const& p
  176. , string_type const& s) const
  177. {
  178. def.def.add_pattern (def.state.c_str(), p, s);
  179. return *this;
  180. }
  181. lexer_def_& def;
  182. private:
  183. // silence MSVC warning C4512: assignment operator could not be generated
  184. pattern_adder& operator= (pattern_adder const&);
  185. };
  186. friend struct pattern_adder;
  187. private:
  188. // Helper function to invoke the necessary 2 step compilation
  189. // process on token definition expressions
  190. template <typename TokenExpr>
  191. void compile2pass(TokenExpr const& expr)
  192. {
  193. expr.collect(def, state, targetstate);
  194. expr.add_actions(def);
  195. }
  196. public:
  197. ///////////////////////////////////////////////////////////////////
  198. template <typename Expr>
  199. void define(Expr const& expr)
  200. {
  201. compile2pass(compile<lex::domain>(expr));
  202. }
  203. lexer_def_(LexerDef& def_, string_type const& state_
  204. , string_type const& targetstate_ = string_type())
  205. : proto_base_type(terminal_type::make(alias()))
  206. , add(this_()), add_pattern(this_()), def(def_)
  207. , state(state_), targetstate(targetstate_)
  208. {}
  209. // allow to switch states
  210. lexer_def_ operator()(char_type const* state) const
  211. {
  212. return lexer_def_(def, state);
  213. }
  214. lexer_def_ operator()(char_type const* state
  215. , char_type const* targetstate) const
  216. {
  217. return lexer_def_(def, state, targetstate);
  218. }
  219. lexer_def_ operator()(string_type const& state
  220. , string_type const& targetstate = string_type()) const
  221. {
  222. return lexer_def_(def, state, targetstate);
  223. }
  224. // allow to assign a token definition expression
  225. template <typename Expr>
  226. lexer_def_& operator= (Expr const& xpr)
  227. {
  228. // Report invalid expression error as early as possible.
  229. // If you got an error_invalid_expression error message here,
  230. // then the expression (expr) is not a valid spirit lex
  231. // expression.
  232. BOOST_SPIRIT_ASSERT_MATCH(lex::domain, Expr);
  233. def.clear(state.c_str());
  234. define(xpr);
  235. return *this;
  236. }
  237. // explicitly tell the lexer that the given state will be defined
  238. // (useful in conjunction with "*")
  239. std::size_t add_state(char_type const* state = 0)
  240. {
  241. return def.add_state(state ? state : def.initial_state().c_str());
  242. }
  243. adder add;
  244. pattern_adder add_pattern;
  245. private:
  246. LexerDef& def;
  247. string_type state;
  248. string_type targetstate;
  249. private:
  250. // silence MSVC warning C4512: assignment operator could not be generated
  251. lexer_def_& operator= (lexer_def_ const&);
  252. };
  253. #if defined(BOOST_NO_CXX11_RVALUE_REFERENCES)
  254. // allow to assign a token definition expression
  255. template <typename LexerDef, typename Expr>
  256. inline lexer_def_<LexerDef>&
  257. operator+= (lexer_def_<LexerDef>& lexdef, Expr& xpr)
  258. {
  259. // Report invalid expression error as early as possible.
  260. // If you got an error_invalid_expression error message here,
  261. // then the expression (expr) is not a valid spirit lex
  262. // expression.
  263. BOOST_SPIRIT_ASSERT_MATCH(lex::domain, Expr);
  264. lexdef.define(xpr);
  265. return lexdef;
  266. }
  267. #else
  268. // allow to assign a token definition expression
  269. template <typename LexerDef, typename Expr>
  270. inline lexer_def_<LexerDef>&
  271. operator+= (lexer_def_<LexerDef>& lexdef, Expr&& xpr)
  272. {
  273. // Report invalid expression error as early as possible.
  274. // If you got an error_invalid_expression error message here,
  275. // then the expression (expr) is not a valid spirit lex
  276. // expression.
  277. BOOST_SPIRIT_ASSERT_MATCH(lex::domain, Expr);
  278. lexdef.define(xpr);
  279. return lexdef;
  280. }
  281. #endif
  282. template <typename LexerDef, typename Expr>
  283. inline lexer_def_<LexerDef>&
  284. operator+= (lexer_def_<LexerDef>& lexdef, Expr const& xpr)
  285. {
  286. // Report invalid expression error as early as possible.
  287. // If you got an error_invalid_expression error message here,
  288. // then the expression (expr) is not a valid spirit lex
  289. // expression.
  290. BOOST_SPIRIT_ASSERT_MATCH(lex::domain, Expr);
  291. lexdef.define(xpr);
  292. return lexdef;
  293. }
  294. }
  ///////////////////////////////////////////////////////////////////////////
  // The match_flags flags are used to influence different matching
  // modes of the lexer. The enumerators are wrapped in a struct so that
  // they are referred to as match_flags::match_xxx.
  struct match_flags
  {
      enum enum_type
      {
          match_default = 0,          // no flags
          match_not_dot_newline = 1,  // the regex '.' doesn't match newlines
          match_icase = 2             // all matching operations are case insensitive
      };
  };
  307. ///////////////////////////////////////////////////////////////////////////
  308. // This represents a lexer object
  309. ///////////////////////////////////////////////////////////////////////////
  ///////////////////////////////////////////////////////////////////////////
  // This is the first token id automatically assigned by the library
  // if needed
  enum tokenids
  {
      min_token_id = 0x10000
  };
  317. template <typename Lexer>
  318. class lexer : public Lexer
  319. {
  320. private:
  321. // avoid warnings about using 'this' in constructor
  322. lexer& this_() { return *this; }
  323. std::size_t next_token_id; // has to be an integral type
  324. public:
  325. typedef Lexer lexer_type;
  326. typedef typename Lexer::id_type id_type;
  327. typedef typename Lexer::char_type char_type;
  328. typedef typename Lexer::iterator_type iterator_type;
  329. typedef lexer base_type;
  330. typedef detail::lexer_def_<lexer> lexer_def;
  331. typedef std::basic_string<char_type> string_type;
  332. lexer(unsigned int flags = match_flags::match_default
  333. , id_type first_id = id_type(min_token_id))
  334. : lexer_type(flags)
  335. , next_token_id(first_id)
  336. , self(this_(), lexer_type::initial_state())
  337. {}
  338. // access iterator interface
  339. template <typename Iterator>
  340. iterator_type begin(Iterator& first, Iterator const& last
  341. , char_type const* initial_state = 0) const
  342. { return this->lexer_type::begin(first, last, initial_state); }
  343. iterator_type end() const
  344. { return this->lexer_type::end(); }
  345. std::size_t map_state(char_type const* state)
  346. { return this->lexer_type::add_state(state); }
  347. // create a unique token id
  348. id_type get_next_id() { return id_type(next_token_id++); }
  349. lexer_def self; // allow for easy token definition
  350. };
  351. }}}
  352. #endif