perl_matcher.hpp 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587
  1. /*
  2. *
  3. * Copyright (c) 2002
  4. * John Maddock
  5. *
  6. * Use, modification and distribution are subject to the
  7. * Boost Software License, Version 1.0. (See accompanying file
  8. * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  9. *
  10. */
  11. #ifndef BOOST_REGEX_MATCHER_HPP
  12. #define BOOST_REGEX_MATCHER_HPP
  13. #include <boost/regex/v4/iterator_category.hpp>
  14. #ifdef BOOST_MSVC
  15. #pragma warning(push)
  16. #pragma warning(disable: 4103)
  17. #endif
  18. #ifdef BOOST_HAS_ABI_HEADERS
  19. # include BOOST_ABI_PREFIX
  20. #endif
  21. #ifdef BOOST_MSVC
  22. #pragma warning(pop)
  23. #endif
  24. #ifdef BOOST_MSVC
  25. # pragma warning(push)
  26. # pragma warning(disable: 4800)
  27. #endif
  28. namespace boost{
  29. namespace re_detail{
  30. //
  31. // error checking API:
  32. //
  // Declaration only: no definition of verify_options appears in this file.
  // ef carries the expression's syntax option flags, mf the match flags.
  // NOTE(review): presumably reports an invalid combination of the two;
  // confirm against the definition.
  BOOST_REGEX_DECL void BOOST_REGEX_CALL verify_options(boost::regex_constants::syntax_option_type ef, match_flag_type mf);
  34. //
  35. // function can_start:
  36. //
  //
  // Generic overload: tests bit(s) "mask" of map[c].
  // map is only indexed for 0 <= c < (1 << CHAR_BIT); any character
  // outside that range is reported as a possible start without
  // consulting the table.
  //
  template <class charT>
  inline bool can_start(charT c, const unsigned char* map, unsigned char mask)
  {
     // negative values have no slot in the table:
     if(c < static_cast<charT>(0))
        return true;
     // neither do values beyond the last byte-sized index:
     if(c >= static_cast<charT>(1 << CHAR_BIT))
        return true;
     return (map[c] & mask) != 0;
  }
  // Plain char overload: the character is reinterpreted as unsigned so
  // that it always yields a valid, non-negative index into map.
  inline bool can_start(char c, const unsigned char* map, unsigned char mask)
  {
     const unsigned char index = static_cast<unsigned char>(c);
     return (map[index] & mask) != 0;
  }
  // signed char overload: converted to unsigned char before indexing so
  // that negative values select the upper half of map.
  inline bool can_start(signed char c, const unsigned char* map, unsigned char mask)
  {
     const unsigned char index = static_cast<unsigned char>(c);
     return (map[index] & mask) != 0;
  }
  // unsigned char overload: the character is used directly as the index.
  inline bool can_start(unsigned char c, const unsigned char* map, unsigned char mask)
  {
     const unsigned char entry = map[c];
     return (entry & mask) != 0;
  }
  // unsigned short overload: values of (1 << CHAR_BIT) or more lie
  // beyond the table and are always reported as possible starts.
  inline bool can_start(unsigned short c, const unsigned char* map, unsigned char mask)
  {
     if(c >= (1 << CHAR_BIT))
        return true;
     return (map[c] & mask) != 0;
  }
  58. #if !defined(__hpux) && !defined(__WINSCW__)// WCHAR_MIN not usable in pp-directives.
  59. #if defined(WCHAR_MIN) && (WCHAR_MIN == 0) && !defined(BOOST_NO_INTRINSIC_WCHAR_T)
  // wchar_t overload (only compiled when the surrounding directives find
  // WCHAR_MIN == 0): values of (1u << CHAR_BIT) or more lie beyond the
  // table and are always reported as possible starts.
  inline bool can_start(wchar_t c, const unsigned char* map, unsigned char mask)
  {
     if(c >= static_cast<wchar_t>(1u << CHAR_BIT))
        return true;
     return (map[c] & mask) != 0;
  }
  64. #endif
  65. #endif
  66. #if !defined(BOOST_NO_INTRINSIC_WCHAR_T)
  // unsigned int overload: values of (1u << CHAR_BIT) or more lie
  // beyond the table and are always reported as possible starts.
  inline bool can_start(unsigned int c, const unsigned char* map, unsigned char mask)
  {
     if(c >= static_cast<unsigned int>(1u << CHAR_BIT))
        return true;
     return (map[c] & mask) != 0;
  }
  71. #endif
  72. //
  // Unfortunately Rogue Wave's standard library appears to have a bug
  // in std::basic_string::compare that results in erroneous answers
  // in some cases (tested with Borland C++ 5.1, Rogue Wave lib version
  // 0x020101); the test case was:
  // {39135,0} < {0xff,0}
  // which succeeds when it should not.
  79. //
  80. #ifndef _RWSTD_VER
  81. #if !BOOST_WORKAROUND(BOOST_MSVC, < 1310)
  // Compares s with the null-terminated string p, returning the result
  // of s.compare(p) except for one special case: when p is empty, a
  // string that is either empty or consists of a single null character
  // is reported as equal (0).
  template <class C, class T, class A>
  inline int string_compare(const std::basic_string<C,T,A>& s, const C* p)
  {
     const bool pattern_is_empty = (0 == *p);
     if(pattern_is_empty && (s.empty() || ((s.size() == 1) && (s[0] == 0))))
        return 0;
     return s.compare(p);
  }
  92. #endif
  93. #else
  94. #if !BOOST_WORKAROUND(BOOST_MSVC, < 1310)
  // Compares s with the null-terminated string p, returning the result
  // of s.compare(p) except for one special case: when p is empty, a
  // string that is either empty or consists of a single null character
  // is reported as equal (0).
  template <class C, class T, class A>
  inline int string_compare(const std::basic_string<C,T,A>& s, const C* p)
  {
     if(0 == *p)
     {
        const bool holds_nothing = s.empty();
        const bool holds_single_null = (s.size() == 1) && (s[0] == 0);
        if(holds_nothing || holds_single_null)
           return 0;
     }
     return s.compare(p);
  }
  105. #endif
  // Narrow-string overload for the Rogue Wave branch: compares through
  // std::strcmp on the C string rather than basic_string::compare.
  inline int string_compare(const std::string& s, const char* p)
  {
     const char* const lhs = s.c_str();
     return std::strcmp(lhs, p);
  }
  108. # ifndef BOOST_NO_WREGEX
  // Wide-string overload for the Rogue Wave branch: compares through
  // std::wcscmp on the C string rather than basic_string::compare.
  inline int string_compare(const std::wstring& s, const wchar_t* p)
  {
     const wchar_t* const lhs = s.c_str();
     return std::wcscmp(lhs, p);
  }
  111. #endif
  112. #endif
  // Fallback for any sequence offering size() and operator[]:
  // element-wise comparison of s against the null-terminated string p.
  // Returns 0 when p ends exactly where s does; otherwise the difference
  // at the first mismatch (or minus the next element of p when s is a
  // proper prefix of p).
  template <class Seq, class C>
  inline int string_compare(const Seq& s, const C* p)
  {
     std::size_t pos = 0;
     // advance over the common prefix:
     for(; pos < s.size(); ++pos)
     {
        if(!(p[pos] == s[pos]))
           break;
     }
     return (pos == s.size()) ? -p[pos] : s[pos] - p[pos];
  }
  // STR_COMP(s,p): shorthand that forwards to string_compare(s,p), so
  // overload resolution selects the comparison matching the argument types.
  # define STR_COMP(s,p) string_compare(s,p)
  // Returns a pointer to the element immediately after the first null
  // terminator found at or after p.
  template<class charT>
  inline const charT* re_skip_past_null(const charT* p)
  {
     const charT terminator = static_cast<charT>(0);
     for(; *p != terminator; ++p)
     {
        // scanning only
     }
     return p + 1;
  }
  130. template <class iterator, class charT, class traits_type, class char_classT>
  131. iterator BOOST_REGEX_CALL re_is_set_member(iterator next,
  132. iterator last,
  133. const re_set_long<char_classT>* set_,
  134. const regex_data<charT, traits_type>& e, bool icase)
  135. {
  136. const charT* p = reinterpret_cast<const charT*>(set_+1);
  137. iterator ptr;
  138. unsigned int i;
  139. //bool icase = e.m_flags & regex_constants::icase;
  140. if(next == last) return next;
  141. typedef typename traits_type::string_type traits_string_type;
  142. const ::boost::regex_traits_wrapper<traits_type>& traits_inst = *(e.m_ptraits);
  143. // dwa 9/13/00 suppress incorrect MSVC warning - it claims this is never
  144. // referenced
  145. (void)traits_inst;
  146. // try and match a single character, could be a multi-character
  147. // collating element...
  148. for(i = 0; i < set_->csingles; ++i)
  149. {
  150. ptr = next;
  151. if(*p == static_cast<charT>(0))
  152. {
  153. // treat null string as special case:
  154. if(traits_inst.translate(*ptr, icase) != *p)
  155. {
  156. while(*p == static_cast<charT>(0))++p;
  157. continue;
  158. }
  159. return set_->isnot ? next : (ptr == next) ? ++next : ptr;
  160. }
  161. else
  162. {
  163. while(*p && (ptr != last))
  164. {
  165. if(traits_inst.translate(*ptr, icase) != *p)
  166. break;
  167. ++p;
  168. ++ptr;
  169. }
  170. if(*p == static_cast<charT>(0)) // if null we've matched
  171. return set_->isnot ? next : (ptr == next) ? ++next : ptr;
  172. p = re_skip_past_null(p); // skip null
  173. }
  174. }
  175. charT col = traits_inst.translate(*next, icase);
  176. if(set_->cranges || set_->cequivalents)
  177. {
  178. traits_string_type s1;
  179. //
  180. // try and match a range, NB only a single character can match
  181. if(set_->cranges)
  182. {
  183. if((e.m_flags & regex_constants::collate) == 0)
  184. s1.assign(1, col);
  185. else
  186. {
  187. charT a[2] = { col, charT(0), };
  188. s1 = traits_inst.transform(a, a + 1);
  189. }
  190. for(i = 0; i < set_->cranges; ++i)
  191. {
  192. if(STR_COMP(s1, p) >= 0)
  193. {
  194. do{ ++p; }while(*p);
  195. ++p;
  196. if(STR_COMP(s1, p) <= 0)
  197. return set_->isnot ? next : ++next;
  198. }
  199. else
  200. {
  201. // skip first string
  202. do{ ++p; }while(*p);
  203. ++p;
  204. }
  205. // skip second string
  206. do{ ++p; }while(*p);
  207. ++p;
  208. }
  209. }
  210. //
  211. // try and match an equivalence class, NB only a single character can match
  212. if(set_->cequivalents)
  213. {
  214. charT a[2] = { col, charT(0), };
  215. s1 = traits_inst.transform_primary(a, a +1);
  216. for(i = 0; i < set_->cequivalents; ++i)
  217. {
  218. if(STR_COMP(s1, p) == 0)
  219. return set_->isnot ? next : ++next;
  220. // skip string
  221. do{ ++p; }while(*p);
  222. ++p;
  223. }
  224. }
  225. }
  226. if(traits_inst.isctype(col, set_->cclasses) == true)
  227. return set_->isnot ? next : ++next;
  228. if((set_->cnclasses != 0) && (traits_inst.isctype(col, set_->cnclasses) == false))
  229. return set_->isnot ? next : ++next;
  230. return set_->isnot ? ++next : next;
  231. }
  //
  // repeater_count:
  // One entry in a chain of repeat counters.  "stack" points at the slot
  // that holds the entry currently on top of the chain; a counter built
  // with the three-argument constructor installs itself in that slot and
  // its destructor puts the previous top back.
  //
  template <class BidiIterator>
  class repeater_count
  {
     repeater_count** stack;   // slot holding the top of the chain
     repeater_count* next;     // previous top of the chain, 0 for a sentinel
     int state_id;             // id of the owning repeat, -1 for a sentinel
     std::size_t count;        // the number of iterations so far
     BidiIterator start_pos;   // where the last repeat started
  public:
     // Builds a sentinel entry: id -1, zero count, nothing linked after it.
     // The slot *s is not modified.  start_pos is value-initialised so that
     // the object never carries an indeterminate iterator.
     repeater_count(repeater_count** s)
        : stack(s), next(0), state_id(-1), count(0), start_pos()
     {
     }
     // Builds the counter for repeat "i" and makes it the new top of *s.
     // Precondition: *s is non-null (it is dereferenced below), i.e. a
     // sentinel or another counter is already installed in the slot.
     // If an entry with the same id is found further down the chain, its
     // count and start position are carried over; otherwise counting
     // starts from zero at "start".
     repeater_count(int i, repeater_count** s, BidiIterator start)
        : stack(s), next(*s), state_id(i), count(0), start_pos(start)
     {
        *stack = this;
        // an id greater than the previous top's always starts afresh;
        // otherwise look for an earlier entry of the same repeat:
        if(state_id <= next->state_id)
        {
           repeater_count* p = next;
           while(p && (p->state_id != state_id))
              p = p->next;
           if(p)
           {
              count = p->count;
              start_pos = p->start_pos;
           }
        }
     }
     // Restores the previous top of the chain; a sentinel (next == 0)
     // leaves the slot untouched.
     ~repeater_count()
     {
        if(next)
           *stack = next;
     }
     std::size_t get_count() const { return count; }
     int get_id() const { return state_id; }
     std::size_t operator++() { return ++count; }
     // Called when a new iteration of the repeat is about to start.
     // Returns true, and moves the count to max, when at least one
     // iteration has been counted and pos equals the position saved for
     // the last one (the last iteration consumed nothing); otherwise
     // saves pos and returns false.
     bool check_null_repeat(const BidiIterator& pos, std::size_t max)
     {
        const bool result = (count != 0) && (pos == start_pos);
        if(result)
           count = max;
        else
           start_pos = pos;
        return result;
     }
  };
  // defined elsewhere; only pointers to it are used in this header:
  struct saved_state;

  //
  // Tags identifying the kinds of saved state.  The numeric values are
  // fixed explicitly; saved_state_count is the number of tags and must
  // stay last.
  //
  enum saved_state_type
  {
     saved_type_end                     =  0,
     saved_type_paren                   =  1,
     saved_type_recurse                 =  2,
     saved_type_assertion               =  3,
     saved_state_alt                    =  4,
     saved_state_repeater_count         =  5,
     saved_state_extra_block            =  6,
     saved_state_greedy_single_repeat   =  7,
     saved_state_rep_slow_dot           =  8,
     saved_state_rep_fast_dot           =  9,
     saved_state_rep_char               = 10,
     saved_state_rep_short_set          = 11,
     saved_state_rep_long_set           = 12,
     saved_state_non_greedy_long_repeat = 13,
     saved_state_count                  = 14
  };
  // One record of recursion bookkeeping; perl_matcher keeps these in its
  // recursion_stack vector.  Results is the match-results type in use.
  template <class Results>
  struct recursion_info
  {
  // element type of Results and the iterator type that element exposes:
  typedef typename Results::value_type value_type;
  typedef typename value_type::iterator iterator;
  // NOTE(review): presumably identifies the sub-expression being recursed
  // into - confirm against the code that fills this in.
  int idx;
  // NOTE(review): by its name, the state to continue from once the
  // recursion finishes - confirm.
  const re_syntax_base* preturn_address;
  // a Results object stored by value with this record:
  Results results;
  // a repeater_count pointer stored with this record:
  repeater_count<iterator>* repeater_stack;
  };
  321. #ifdef BOOST_MSVC
  322. #pragma warning(push)
  323. #pragma warning(disable : 4251 4231)
  324. # if BOOST_MSVC < 1600
  325. # pragma warning(disable : 4660)
  326. # endif
  327. #endif
  328. template <class BidiIterator, class Allocator, class traits>
  329. class perl_matcher
  330. {
  331. public:
  332. typedef typename traits::char_type char_type;
  333. typedef perl_matcher<BidiIterator, Allocator, traits> self_type;
  334. typedef bool (self_type::*matcher_proc_type)(void);
  335. typedef std::size_t traits_size_type;
  336. typedef typename is_byte<char_type>::width_type width_type;
  337. typedef typename regex_iterator_traits<BidiIterator>::difference_type difference_type;
  338. typedef match_results<BidiIterator, Allocator> results_type;
  339. perl_matcher(BidiIterator first, BidiIterator end,
  340. match_results<BidiIterator, Allocator>& what,
  341. const basic_regex<char_type, traits>& e,
  342. match_flag_type f,
  343. BidiIterator l_base)
  344. : m_result(what), base(first), last(end),
  345. position(first), backstop(l_base), re(e), traits_inst(e.get_traits()),
  346. m_independent(false), next_count(&rep_obj), rep_obj(&next_count)
  347. {
  348. construct_init(e, f);
  349. }
  350. bool match();
  351. bool find();
  352. void setf(match_flag_type f)
  353. { m_match_flags |= f; }
  354. void unsetf(match_flag_type f)
  355. { m_match_flags &= ~f; }
  356. private:
  357. void construct_init(const basic_regex<char_type, traits>& e, match_flag_type f);
  358. bool find_imp();
  359. bool match_imp();
  360. #ifdef BOOST_REGEX_HAS_MS_STACK_GUARD
  361. typedef bool (perl_matcher::*protected_proc_type)();
  362. bool protected_call(protected_proc_type);
  363. #endif
  364. void estimate_max_state_count(std::random_access_iterator_tag*);
  365. void estimate_max_state_count(void*);
  366. bool match_prefix();
  367. bool match_all_states();
  368. // match procs, stored in s_match_vtable:
  369. bool match_startmark();
  370. bool match_endmark();
  371. bool match_literal();
  372. bool match_start_line();
  373. bool match_end_line();
  374. bool match_wild();
  375. bool match_match();
  376. bool match_word_boundary();
  377. bool match_within_word();
  378. bool match_word_start();
  379. bool match_word_end();
  380. bool match_buffer_start();
  381. bool match_buffer_end();
  382. bool match_backref();
  383. bool match_long_set();
  384. bool match_set();
  385. bool match_jump();
  386. bool match_alt();
  387. bool match_rep();
  388. bool match_combining();
  389. bool match_soft_buffer_end();
  390. bool match_restart_continue();
  391. bool match_long_set_repeat();
  392. bool match_set_repeat();
  393. bool match_char_repeat();
  394. bool match_dot_repeat_fast();
  395. bool match_dot_repeat_slow();
  396. bool match_dot_repeat_dispatch()
  397. {
  398. return ::boost::is_random_access_iterator<BidiIterator>::value ? match_dot_repeat_fast() : match_dot_repeat_slow();
  399. }
  400. bool match_backstep();
  401. bool match_assert_backref();
  402. bool match_toggle_case();
  403. #ifdef BOOST_REGEX_RECURSIVE
  404. bool backtrack_till_match(std::size_t count);
  405. #endif
  406. bool match_recursion();
  407. // find procs stored in s_find_vtable:
  408. bool find_restart_any();
  409. bool find_restart_word();
  410. bool find_restart_line();
  411. bool find_restart_buf();
  412. bool find_restart_lit();
  413. private:
  414. // final result structure to be filled in:
  415. match_results<BidiIterator, Allocator>& m_result;
  416. // temporary result for POSIX matches:
  417. scoped_ptr<match_results<BidiIterator, Allocator> > m_temp_match;
  418. // pointer to actual result structure to fill in:
  419. match_results<BidiIterator, Allocator>* m_presult;
  420. // start of sequence being searched:
  421. BidiIterator base;
  422. // end of sequence being searched:
  423. BidiIterator last;
  424. // current character being examined:
  425. BidiIterator position;
  426. // where to restart next search after failed match attempt:
  427. BidiIterator restart;
  428. // where the current search started from, acts as base for $` during grep:
  429. BidiIterator search_base;
  430. // how far we can go back when matching lookbehind:
  431. BidiIterator backstop;
  432. // the expression being examined:
  433. const basic_regex<char_type, traits>& re;
  434. // the expression's traits class:
  435. const ::boost::regex_traits_wrapper<traits>& traits_inst;
  436. // the next state in the machine being matched:
  437. const re_syntax_base* pstate;
  438. // matching flags in use:
  439. match_flag_type m_match_flags;
  440. // how many states we have examined so far:
  441. std::ptrdiff_t state_count;
  442. // max number of states to examine before giving up:
  443. std::ptrdiff_t max_state_count;
  444. // whether we should ignore case or not:
  445. bool icase;
  446. // set to true when (position == last), indicates that we may have a partial match:
  447. bool m_has_partial_match;
  448. // set to true whenever we get a match:
  449. bool m_has_found_match;
  450. // set to true whenever we're inside an independent sub-expression:
  451. bool m_independent;
  452. // the current repeat being examined:
  453. repeater_count<BidiIterator>* next_count;
  454. // the first repeat being examined (top of linked list):
  455. repeater_count<BidiIterator> rep_obj;
  456. // the mask to pass when matching word boundaries:
  457. typename traits::char_class_type m_word_mask;
  458. // the bitmask to use when determining whether a match_any matches a newline or not:
  459. unsigned char match_any_mask;
  460. // recursion information:
  461. std::vector<recursion_info<results_type> > recursion_stack;
  462. #ifdef BOOST_REGEX_NON_RECURSIVE
  463. //
  464. // additional members for non-recursive version:
  465. //
  466. typedef bool (self_type::*unwind_proc_type)(bool);
  467. void extend_stack();
  468. bool unwind(bool);
  469. bool unwind_end(bool);
  470. bool unwind_paren(bool);
  471. bool unwind_recursion_stopper(bool);
  472. bool unwind_assertion(bool);
  473. bool unwind_alt(bool);
  474. bool unwind_repeater_counter(bool);
  475. bool unwind_extra_block(bool);
  476. bool unwind_greedy_single_repeat(bool);
  477. bool unwind_slow_dot_repeat(bool);
  478. bool unwind_fast_dot_repeat(bool);
  479. bool unwind_char_repeat(bool);
  480. bool unwind_short_set_repeat(bool);
  481. bool unwind_long_set_repeat(bool);
  482. bool unwind_non_greedy_repeat(bool);
  483. bool unwind_recursion(bool);
  484. bool unwind_recursion_pop(bool);
  485. void destroy_single_repeat();
  486. void push_matched_paren(int index, const sub_match<BidiIterator>& sub);
  487. void push_recursion_stopper();
  488. void push_assertion(const re_syntax_base* ps, bool positive);
  489. void push_alt(const re_syntax_base* ps);
  490. void push_repeater_count(int i, repeater_count<BidiIterator>** s);
  491. void push_single_repeat(std::size_t c, const re_repeat* r, BidiIterator last_position, int state_id);
  492. void push_non_greedy_repeat(const re_syntax_base* ps);
  493. void push_recursion(int idx, const re_syntax_base* p, results_type* presults);
  494. void push_recursion_pop();
  495. // pointer to base of stack:
  496. saved_state* m_stack_base;
  497. // pointer to current stack position:
  498. saved_state* m_backup_state;
  499. // determines what value to return when unwinding from recursion,
  500. // allows for mixed recursive/non-recursive algorithm:
  501. bool m_recursive_result;
  502. // how many memory blocks have we used up?:
  503. unsigned used_block_count;
  504. #endif
  505. // these operations aren't allowed, so are declared private,
  506. // bodies are provided to keep explicit-instantiation requests happy:
  507. perl_matcher& operator=(const perl_matcher&)
  508. {
  509. return *this;
  510. }
  511. perl_matcher(const perl_matcher& that)
  512. : m_result(that.m_result), re(that.re), traits_inst(that.traits_inst), rep_obj(0) {}
  513. };
  514. #ifdef BOOST_MSVC
  515. #pragma warning(pop)
  516. #endif
  517. } // namespace re_detail
  518. #ifdef BOOST_MSVC
  519. #pragma warning(push)
  520. #pragma warning(disable: 4103)
  521. #endif
  522. #ifdef BOOST_HAS_ABI_HEADERS
  523. # include BOOST_ABI_SUFFIX
  524. #endif
  525. #ifdef BOOST_MSVC
  526. #pragma warning(pop)
  527. #endif
  528. } // namespace boost
  529. #ifdef BOOST_MSVC
  530. # pragma warning(pop)
  531. #endif
  532. //
  533. // include the implementation of perl_matcher:
  534. //
  535. #ifdef BOOST_REGEX_RECURSIVE
  536. #include <boost/regex/v4/perl_matcher_recursive.hpp>
  537. #else
  538. #include <boost/regex/v4/perl_matcher_non_recursive.hpp>
  539. #endif
  540. // this one has to be last:
  541. #include <boost/regex/v4/perl_matcher_common.hpp>
  542. #endif