basic_regex_parser.hpp 99 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874
  1. /*
  2. *
  3. * Copyright (c) 2004
  4. * John Maddock
  5. *
  6. * Use, modification and distribution are subject to the
  7. * Boost Software License, Version 1.0. (See accompanying file
  8. * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  9. *
  10. */
  11. /*
  12. * LOCATION: see http://www.boost.org for most recent version.
  13. * FILE basic_regex_parser.hpp
  14. * VERSION see <boost/version.hpp>
  15. * DESCRIPTION: Declares template class basic_regex_parser.
  16. */
  17. #ifndef BOOST_REGEX_V4_BASIC_REGEX_PARSER_HPP
  18. #define BOOST_REGEX_V4_BASIC_REGEX_PARSER_HPP
  19. #ifdef BOOST_MSVC
  20. #pragma warning(push)
  21. #pragma warning(disable: 4103)
  22. #endif
  23. #ifdef BOOST_HAS_ABI_HEADERS
  24. # include BOOST_ABI_PREFIX
  25. #endif
  26. #ifdef BOOST_MSVC
  27. #pragma warning(pop)
  28. #endif
  29. namespace boost{
  30. namespace re_detail{
  31. #ifdef BOOST_MSVC
  32. #pragma warning(push)
  33. #pragma warning(disable:4244 4800)
  34. #endif
  35. template <class charT, class traits>
  36. class basic_regex_parser : public basic_regex_creator<charT, traits>
  37. {
  38. public:
  39. basic_regex_parser(regex_data<charT, traits>* data);
  40. void parse(const charT* p1, const charT* p2, unsigned flags);
  41. void fail(regex_constants::error_type error_code, std::ptrdiff_t position);
  42. void fail(regex_constants::error_type error_code, std::ptrdiff_t position, std::string message, std::ptrdiff_t start_pos);
  43. void fail(regex_constants::error_type error_code, std::ptrdiff_t position, const std::string& message)
  44. {
  45. fail(error_code, position, message, position);
  46. }
  47. bool parse_all();
  48. bool parse_basic();
  49. bool parse_extended();
  50. bool parse_literal();
  51. bool parse_open_paren();
  52. bool parse_basic_escape();
  53. bool parse_extended_escape();
  54. bool parse_match_any();
  55. bool parse_repeat(std::size_t low = 0, std::size_t high = (std::numeric_limits<std::size_t>::max)());
  56. bool parse_repeat_range(bool isbasic);
  57. bool parse_alt();
  58. bool parse_set();
  59. bool parse_backref();
  60. void parse_set_literal(basic_char_set<charT, traits>& char_set);
  61. bool parse_inner_set(basic_char_set<charT, traits>& char_set);
  62. bool parse_QE();
  63. bool parse_perl_extension();
  64. bool add_emacs_code(bool negate);
  65. bool unwind_alts(std::ptrdiff_t last_paren_start);
  66. digraph<charT> get_next_set_literal(basic_char_set<charT, traits>& char_set);
  67. charT unescape_character();
  68. regex_constants::syntax_option_type parse_options();
  69. private:
  70. typedef bool (basic_regex_parser::*parser_proc_type)();
  71. typedef typename traits::string_type string_type;
  72. typedef typename traits::char_class_type char_class_type;
  73. parser_proc_type m_parser_proc; // the main parser to use
  74. const charT* m_base; // the start of the string being parsed
  75. const charT* m_end; // the end of the string being parsed
  76. const charT* m_position; // our current parser position
  77. unsigned m_mark_count; // how many sub-expressions we have
  78. int m_mark_reset; // used to indicate that we're inside a (?|...) block.
  79. unsigned m_max_mark; // largest mark count seen inside a (?|...) block.
  80. std::ptrdiff_t m_paren_start; // where the last seen ')' began (where repeats are inserted).
  81. std::ptrdiff_t m_alt_insert_point; // where to insert the next alternative
  82. bool m_has_case_change; // true if somewhere in the current block the case has changed
  83. #if defined(BOOST_MSVC) && defined(_M_IX86)
  84. // This is an ugly warning suppression workaround (for warnings *inside* std::vector
  85. // that can not otherwise be suppressed)...
  86. BOOST_STATIC_ASSERT(sizeof(long) >= sizeof(void*));
  87. std::vector<long> m_alt_jumps; // list of alternative in the current scope.
  88. #else
  89. std::vector<std::ptrdiff_t> m_alt_jumps; // list of alternative in the current scope.
  90. #endif
  91. basic_regex_parser& operator=(const basic_regex_parser&);
  92. basic_regex_parser(const basic_regex_parser&);
  93. };
  94. template <class charT, class traits>
  95. basic_regex_parser<charT, traits>::basic_regex_parser(regex_data<charT, traits>* data)
  96. : basic_regex_creator<charT, traits>(data), m_mark_count(0), m_mark_reset(-1), m_max_mark(0), m_paren_start(0), m_alt_insert_point(0), m_has_case_change(false)
  97. {
  98. }
  99. template <class charT, class traits>
  100. void basic_regex_parser<charT, traits>::parse(const charT* p1, const charT* p2, unsigned l_flags)
  101. {
  102. // pass l_flags on to base class:
  103. this->init(l_flags);
  104. // set up pointers:
  105. m_position = m_base = p1;
  106. m_end = p2;
  107. // empty strings are errors:
  108. if((p1 == p2) &&
  109. (
  110. ((l_flags & regbase::main_option_type) != regbase::perl_syntax_group)
  111. || (l_flags & regbase::no_empty_expressions)
  112. )
  113. )
  114. {
  115. fail(regex_constants::error_empty, 0);
  116. return;
  117. }
  118. // select which parser to use:
  119. switch(l_flags & regbase::main_option_type)
  120. {
  121. case regbase::perl_syntax_group:
  122. {
  123. m_parser_proc = &basic_regex_parser<charT, traits>::parse_extended;
  124. //
  125. // Add a leading paren with index zero to give recursions a target:
  126. //
  127. re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
  128. br->index = 0;
  129. br->icase = this->flags() & regbase::icase;
  130. break;
  131. }
  132. case regbase::basic_syntax_group:
  133. m_parser_proc = &basic_regex_parser<charT, traits>::parse_basic;
  134. break;
  135. case regbase::literal:
  136. m_parser_proc = &basic_regex_parser<charT, traits>::parse_literal;
  137. break;
  138. default:
  139. // Ooops, someone has managed to set more than one of the main option flags,
  140. // so this must be an error:
  141. fail(regex_constants::error_unknown, 0, "An invalid combination of regular expression syntax flags was used.");
  142. return;
  143. }
  144. // parse all our characters:
  145. bool result = parse_all();
  146. //
  147. // Unwind our alternatives:
  148. //
  149. unwind_alts(-1);
  150. // reset l_flags as a global scope (?imsx) may have altered them:
  151. this->flags(l_flags);
  152. // if we haven't gobbled up all the characters then we must
  153. // have had an unexpected ')' :
  154. if(!result)
  155. {
  156. fail(regex_constants::error_paren, ::boost::re_detail::distance(m_base, m_position), "Found a closing ) with no corresponding openening parenthesis.");
  157. return;
  158. }
  159. // if an error has been set then give up now:
  160. if(this->m_pdata->m_status)
  161. return;
  162. // fill in our sub-expression count:
  163. this->m_pdata->m_mark_count = 1 + m_mark_count;
  164. this->finalize(p1, p2);
  165. }
  166. template <class charT, class traits>
  167. void basic_regex_parser<charT, traits>::fail(regex_constants::error_type error_code, std::ptrdiff_t position)
  168. {
  169. // get the error message:
  170. std::string message = this->m_pdata->m_ptraits->error_string(error_code);
  171. fail(error_code, position, message);
  172. }
  173. template <class charT, class traits>
  174. void basic_regex_parser<charT, traits>::fail(regex_constants::error_type error_code, std::ptrdiff_t position, std::string message, std::ptrdiff_t start_pos)
  175. {
  176. if(0 == this->m_pdata->m_status) // update the error code if not already set
  177. this->m_pdata->m_status = error_code;
  178. m_position = m_end; // don't bother parsing anything else
  179. #ifndef BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS
  180. //
  181. // Augment error message with the regular expression text:
  182. //
  183. if(start_pos == position)
  184. start_pos = (std::max)(static_cast<std::ptrdiff_t>(0), position - static_cast<std::ptrdiff_t>(10));
  185. std::ptrdiff_t end_pos = (std::min)(position + static_cast<std::ptrdiff_t>(10), static_cast<std::ptrdiff_t>(m_end - m_base));
  186. if(error_code != regex_constants::error_empty)
  187. {
  188. if((start_pos != 0) || (end_pos != (m_end - m_base)))
  189. message += " The error occurred while parsing the regular expression fragment: '";
  190. else
  191. message += " The error occurred while parsing the regular expression: '";
  192. if(start_pos != end_pos)
  193. {
  194. message += std::string(m_base + start_pos, m_base + position);
  195. message += ">>>HERE>>>";
  196. message += std::string(m_base + position, m_base + end_pos);
  197. }
  198. message += "'.";
  199. }
  200. #endif
  201. #ifndef BOOST_NO_EXCEPTIONS
  202. if(0 == (this->flags() & regex_constants::no_except))
  203. {
  204. boost::regex_error e(message, error_code, position);
  205. e.raise();
  206. }
  207. #else
  208. (void)position; // suppress warnings.
  209. #endif
  210. }
  211. template <class charT, class traits>
  212. bool basic_regex_parser<charT, traits>::parse_all()
  213. {
  214. bool result = true;
  215. while(result && (m_position != m_end))
  216. {
  217. result = (this->*m_parser_proc)();
  218. }
  219. return result;
  220. }
  221. #ifdef BOOST_MSVC
  222. #pragma warning(push)
  223. #pragma warning(disable:4702)
  224. #endif
  225. template <class charT, class traits>
  226. bool basic_regex_parser<charT, traits>::parse_basic()
  227. {
  228. switch(this->m_traits.syntax_type(*m_position))
  229. {
  230. case regex_constants::syntax_escape:
  231. return parse_basic_escape();
  232. case regex_constants::syntax_dot:
  233. return parse_match_any();
  234. case regex_constants::syntax_caret:
  235. ++m_position;
  236. this->append_state(syntax_element_start_line);
  237. break;
  238. case regex_constants::syntax_dollar:
  239. ++m_position;
  240. this->append_state(syntax_element_end_line);
  241. break;
  242. case regex_constants::syntax_star:
  243. if(!(this->m_last_state) || (this->m_last_state->type == syntax_element_start_line))
  244. return parse_literal();
  245. else
  246. {
  247. ++m_position;
  248. return parse_repeat();
  249. }
  250. case regex_constants::syntax_plus:
  251. if(!(this->m_last_state) || (this->m_last_state->type == syntax_element_start_line) || !(this->flags() & regbase::emacs_ex))
  252. return parse_literal();
  253. else
  254. {
  255. ++m_position;
  256. return parse_repeat(1);
  257. }
  258. case regex_constants::syntax_question:
  259. if(!(this->m_last_state) || (this->m_last_state->type == syntax_element_start_line) || !(this->flags() & regbase::emacs_ex))
  260. return parse_literal();
  261. else
  262. {
  263. ++m_position;
  264. return parse_repeat(0, 1);
  265. }
  266. case regex_constants::syntax_open_set:
  267. return parse_set();
  268. case regex_constants::syntax_newline:
  269. if(this->flags() & regbase::newline_alt)
  270. return parse_alt();
  271. else
  272. return parse_literal();
  273. default:
  274. return parse_literal();
  275. }
  276. return true;
  277. }
  278. template <class charT, class traits>
  279. bool basic_regex_parser<charT, traits>::parse_extended()
  280. {
  281. bool result = true;
  282. switch(this->m_traits.syntax_type(*m_position))
  283. {
  284. case regex_constants::syntax_open_mark:
  285. return parse_open_paren();
  286. case regex_constants::syntax_close_mark:
  287. return false;
  288. case regex_constants::syntax_escape:
  289. return parse_extended_escape();
  290. case regex_constants::syntax_dot:
  291. return parse_match_any();
  292. case regex_constants::syntax_caret:
  293. ++m_position;
  294. this->append_state(
  295. (this->flags() & regex_constants::no_mod_m ? syntax_element_buffer_start : syntax_element_start_line));
  296. break;
  297. case regex_constants::syntax_dollar:
  298. ++m_position;
  299. this->append_state(
  300. (this->flags() & regex_constants::no_mod_m ? syntax_element_buffer_end : syntax_element_end_line));
  301. break;
  302. case regex_constants::syntax_star:
  303. if(m_position == this->m_base)
  304. {
  305. fail(regex_constants::error_badrepeat, 0, "The repeat operator \"*\" cannot start a regular expression.");
  306. return false;
  307. }
  308. ++m_position;
  309. return parse_repeat();
  310. case regex_constants::syntax_question:
  311. if(m_position == this->m_base)
  312. {
  313. fail(regex_constants::error_badrepeat, 0, "The repeat operator \"?\" cannot start a regular expression.");
  314. return false;
  315. }
  316. ++m_position;
  317. return parse_repeat(0,1);
  318. case regex_constants::syntax_plus:
  319. if(m_position == this->m_base)
  320. {
  321. fail(regex_constants::error_badrepeat, 0, "The repeat operator \"+\" cannot start a regular expression.");
  322. return false;
  323. }
  324. ++m_position;
  325. return parse_repeat(1);
  326. case regex_constants::syntax_open_brace:
  327. ++m_position;
  328. return parse_repeat_range(false);
  329. case regex_constants::syntax_close_brace:
  330. fail(regex_constants::error_brace, this->m_position - this->m_base, "Found a closing repetition operator } with no corresponding {.");
  331. return false;
  332. case regex_constants::syntax_or:
  333. return parse_alt();
  334. case regex_constants::syntax_open_set:
  335. return parse_set();
  336. case regex_constants::syntax_newline:
  337. if(this->flags() & regbase::newline_alt)
  338. return parse_alt();
  339. else
  340. return parse_literal();
  341. case regex_constants::syntax_hash:
  342. //
  343. // If we have a mod_x flag set, then skip until
  344. // we get to a newline character:
  345. //
  346. if((this->flags()
  347. & (regbase::no_perl_ex|regbase::mod_x))
  348. == regbase::mod_x)
  349. {
  350. while((m_position != m_end) && !is_separator(*m_position++)){}
  351. return true;
  352. }
  353. BOOST_FALLTHROUGH;
  354. default:
  355. result = parse_literal();
  356. break;
  357. }
  358. return result;
  359. }
  360. #ifdef BOOST_MSVC
  361. #pragma warning(pop)
  362. #endif
  363. template <class charT, class traits>
  364. bool basic_regex_parser<charT, traits>::parse_literal()
  365. {
  366. // append this as a literal provided it's not a space character
  367. // or the perl option regbase::mod_x is not set:
  368. if(
  369. ((this->flags()
  370. & (regbase::main_option_type|regbase::mod_x|regbase::no_perl_ex))
  371. != regbase::mod_x)
  372. || !this->m_traits.isctype(*m_position, this->m_mask_space))
  373. this->append_literal(*m_position);
  374. ++m_position;
  375. return true;
  376. }
  377. template <class charT, class traits>
  378. bool basic_regex_parser<charT, traits>::parse_open_paren()
  379. {
  380. //
  381. // skip the '(' and error check:
  382. //
  383. if(++m_position == m_end)
  384. {
  385. fail(regex_constants::error_paren, m_position - m_base);
  386. return false;
  387. }
  388. //
  389. // begin by checking for a perl-style (?...) extension:
  390. //
  391. if(
  392. ((this->flags() & (regbase::main_option_type | regbase::no_perl_ex)) == 0)
  393. || ((this->flags() & (regbase::main_option_type | regbase::emacs_ex)) == (regbase::basic_syntax_group|regbase::emacs_ex))
  394. )
  395. {
  396. if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_question)
  397. return parse_perl_extension();
  398. }
  399. //
  400. // update our mark count, and append the required state:
  401. //
  402. unsigned markid = 0;
  403. if(0 == (this->flags() & regbase::nosubs))
  404. {
  405. markid = ++m_mark_count;
  406. #ifndef BOOST_NO_STD_DISTANCE
  407. if(this->flags() & regbase::save_subexpression_location)
  408. this->m_pdata->m_subs.push_back(std::pair<std::size_t, std::size_t>(std::distance(m_base, m_position) - 1, 0));
  409. #else
  410. if(this->flags() & regbase::save_subexpression_location)
  411. this->m_pdata->m_subs.push_back(std::pair<std::size_t, std::size_t>((m_position - m_base) - 1, 0));
  412. #endif
  413. }
  414. re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
  415. pb->index = markid;
  416. pb->icase = this->flags() & regbase::icase;
  417. std::ptrdiff_t last_paren_start = this->getoffset(pb);
  418. // back up insertion point for alternations, and set new point:
  419. std::ptrdiff_t last_alt_point = m_alt_insert_point;
  420. this->m_pdata->m_data.align();
  421. m_alt_insert_point = this->m_pdata->m_data.size();
  422. //
  423. // back up the current flags in case we have a nested (?imsx) group:
  424. //
  425. regex_constants::syntax_option_type opts = this->flags();
  426. bool old_case_change = m_has_case_change;
  427. m_has_case_change = false; // no changes to this scope as yet...
  428. //
  429. // Back up branch reset data in case we have a nested (?|...)
  430. //
  431. int mark_reset = m_mark_reset;
  432. m_mark_reset = -1;
  433. //
  434. // now recursively add more states, this will terminate when we get to a
  435. // matching ')' :
  436. //
  437. parse_all();
  438. //
  439. // Unwind pushed alternatives:
  440. //
  441. if(0 == unwind_alts(last_paren_start))
  442. return false;
  443. //
  444. // restore flags:
  445. //
  446. if(m_has_case_change)
  447. {
  448. // the case has changed in one or more of the alternatives
  449. // within the scoped (...) block: we have to add a state
  450. // to reset the case sensitivity:
  451. static_cast<re_case*>(
  452. this->append_state(syntax_element_toggle_case, sizeof(re_case))
  453. )->icase = opts & regbase::icase;
  454. }
  455. this->flags(opts);
  456. m_has_case_change = old_case_change;
  457. //
  458. // restore branch reset:
  459. //
  460. m_mark_reset = mark_reset;
  461. //
  462. // we either have a ')' or we have run out of characters prematurely:
  463. //
  464. if(m_position == m_end)
  465. {
  466. this->fail(regex_constants::error_paren, ::boost::re_detail::distance(m_base, m_end));
  467. return false;
  468. }
  469. BOOST_ASSERT(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark);
  470. #ifndef BOOST_NO_STD_DISTANCE
  471. if(markid && (this->flags() & regbase::save_subexpression_location))
  472. this->m_pdata->m_subs.at(markid - 1).second = std::distance(m_base, m_position);
  473. #else
  474. if(markid && (this->flags() & regbase::save_subexpression_location))
  475. this->m_pdata->m_subs.at(markid - 1).second = (m_position - m_base);
  476. #endif
  477. ++m_position;
  478. //
  479. // append closing parenthesis state:
  480. //
  481. pb = static_cast<re_brace*>(this->append_state(syntax_element_endmark, sizeof(re_brace)));
  482. pb->index = markid;
  483. pb->icase = this->flags() & regbase::icase;
  484. this->m_paren_start = last_paren_start;
  485. //
  486. // restore the alternate insertion point:
  487. //
  488. this->m_alt_insert_point = last_alt_point;
  489. //
  490. // allow backrefs to this mark:
  491. //
  492. if((markid > 0) && (markid < sizeof(unsigned) * CHAR_BIT))
  493. this->m_backrefs |= 1u << (markid - 1);
  494. return true;
  495. }
  496. template <class charT, class traits>
  497. bool basic_regex_parser<charT, traits>::parse_basic_escape()
  498. {
  499. ++m_position;
  500. bool result = true;
  501. switch(this->m_traits.escape_syntax_type(*m_position))
  502. {
  503. case regex_constants::syntax_open_mark:
  504. return parse_open_paren();
  505. case regex_constants::syntax_close_mark:
  506. return false;
  507. case regex_constants::syntax_plus:
  508. if(this->flags() & regex_constants::bk_plus_qm)
  509. {
  510. ++m_position;
  511. return parse_repeat(1);
  512. }
  513. else
  514. return parse_literal();
  515. case regex_constants::syntax_question:
  516. if(this->flags() & regex_constants::bk_plus_qm)
  517. {
  518. ++m_position;
  519. return parse_repeat(0, 1);
  520. }
  521. else
  522. return parse_literal();
  523. case regex_constants::syntax_open_brace:
  524. if(this->flags() & regbase::no_intervals)
  525. return parse_literal();
  526. ++m_position;
  527. return parse_repeat_range(true);
  528. case regex_constants::syntax_close_brace:
  529. if(this->flags() & regbase::no_intervals)
  530. return parse_literal();
  531. fail(regex_constants::error_brace, this->m_position - this->m_base, "Found a closing repetition operator } with no corresponding {.");
  532. return false;
  533. case regex_constants::syntax_or:
  534. if(this->flags() & regbase::bk_vbar)
  535. return parse_alt();
  536. else
  537. result = parse_literal();
  538. break;
  539. case regex_constants::syntax_digit:
  540. return parse_backref();
  541. case regex_constants::escape_type_start_buffer:
  542. if(this->flags() & regbase::emacs_ex)
  543. {
  544. ++m_position;
  545. this->append_state(syntax_element_buffer_start);
  546. }
  547. else
  548. result = parse_literal();
  549. break;
  550. case regex_constants::escape_type_end_buffer:
  551. if(this->flags() & regbase::emacs_ex)
  552. {
  553. ++m_position;
  554. this->append_state(syntax_element_buffer_end);
  555. }
  556. else
  557. result = parse_literal();
  558. break;
  559. case regex_constants::escape_type_word_assert:
  560. if(this->flags() & regbase::emacs_ex)
  561. {
  562. ++m_position;
  563. this->append_state(syntax_element_word_boundary);
  564. }
  565. else
  566. result = parse_literal();
  567. break;
  568. case regex_constants::escape_type_not_word_assert:
  569. if(this->flags() & regbase::emacs_ex)
  570. {
  571. ++m_position;
  572. this->append_state(syntax_element_within_word);
  573. }
  574. else
  575. result = parse_literal();
  576. break;
  577. case regex_constants::escape_type_left_word:
  578. if(this->flags() & regbase::emacs_ex)
  579. {
  580. ++m_position;
  581. this->append_state(syntax_element_word_start);
  582. }
  583. else
  584. result = parse_literal();
  585. break;
  586. case regex_constants::escape_type_right_word:
  587. if(this->flags() & regbase::emacs_ex)
  588. {
  589. ++m_position;
  590. this->append_state(syntax_element_word_end);
  591. }
  592. else
  593. result = parse_literal();
  594. break;
  595. default:
  596. if(this->flags() & regbase::emacs_ex)
  597. {
  598. bool negate = true;
  599. switch(*m_position)
  600. {
  601. case 'w':
  602. negate = false;
  603. BOOST_FALLTHROUGH;
  604. case 'W':
  605. {
  606. basic_char_set<charT, traits> char_set;
  607. if(negate)
  608. char_set.negate();
  609. char_set.add_class(this->m_word_mask);
  610. if(0 == this->append_set(char_set))
  611. {
  612. fail(regex_constants::error_ctype, m_position - m_base);
  613. return false;
  614. }
  615. ++m_position;
  616. return true;
  617. }
  618. case 's':
  619. negate = false;
  620. BOOST_FALLTHROUGH;
  621. case 'S':
  622. return add_emacs_code(negate);
  623. case 'c':
  624. case 'C':
  625. // not supported yet:
  626. fail(regex_constants::error_escape, m_position - m_base, "The \\c and \\C escape sequences are not supported by POSIX basic regular expressions: try the Perl syntax instead.");
  627. return false;
  628. default:
  629. break;
  630. }
  631. }
  632. result = parse_literal();
  633. break;
  634. }
  635. return result;
  636. }
  637. template <class charT, class traits>
  638. bool basic_regex_parser<charT, traits>::parse_extended_escape()
  639. {
  640. ++m_position;
  641. if(m_position == m_end)
  642. {
  643. fail(regex_constants::error_escape, m_position - m_base, "Incomplete escape sequence found.");
  644. return false;
  645. }
  646. bool negate = false; // in case this is a character class escape: \w \d etc
  647. switch(this->m_traits.escape_syntax_type(*m_position))
  648. {
  649. case regex_constants::escape_type_not_class:
  650. negate = true;
  651. BOOST_FALLTHROUGH;
  652. case regex_constants::escape_type_class:
  653. {
  654. escape_type_class_jump:
  655. typedef typename traits::char_class_type m_type;
  656. m_type m = this->m_traits.lookup_classname(m_position, m_position+1);
  657. if(m != 0)
  658. {
  659. basic_char_set<charT, traits> char_set;
  660. if(negate)
  661. char_set.negate();
  662. char_set.add_class(m);
  663. if(0 == this->append_set(char_set))
  664. {
  665. fail(regex_constants::error_ctype, m_position - m_base);
  666. return false;
  667. }
  668. ++m_position;
  669. return true;
  670. }
  671. //
  672. // not a class, just a regular unknown escape:
  673. //
  674. this->append_literal(unescape_character());
  675. break;
  676. }
  677. case regex_constants::syntax_digit:
  678. return parse_backref();
  679. case regex_constants::escape_type_left_word:
  680. ++m_position;
  681. this->append_state(syntax_element_word_start);
  682. break;
  683. case regex_constants::escape_type_right_word:
  684. ++m_position;
  685. this->append_state(syntax_element_word_end);
  686. break;
  687. case regex_constants::escape_type_start_buffer:
  688. ++m_position;
  689. this->append_state(syntax_element_buffer_start);
  690. break;
  691. case regex_constants::escape_type_end_buffer:
  692. ++m_position;
  693. this->append_state(syntax_element_buffer_end);
  694. break;
  695. case regex_constants::escape_type_word_assert:
  696. ++m_position;
  697. this->append_state(syntax_element_word_boundary);
  698. break;
  699. case regex_constants::escape_type_not_word_assert:
  700. ++m_position;
  701. this->append_state(syntax_element_within_word);
  702. break;
  703. case regex_constants::escape_type_Z:
  704. ++m_position;
  705. this->append_state(syntax_element_soft_buffer_end);
  706. break;
  707. case regex_constants::escape_type_Q:
  708. return parse_QE();
  709. case regex_constants::escape_type_C:
  710. return parse_match_any();
  711. case regex_constants::escape_type_X:
  712. ++m_position;
  713. this->append_state(syntax_element_combining);
  714. break;
  715. case regex_constants::escape_type_G:
  716. ++m_position;
  717. this->append_state(syntax_element_restart_continue);
  718. break;
  719. case regex_constants::escape_type_not_property:
  720. negate = true;
  721. BOOST_FALLTHROUGH;
  722. case regex_constants::escape_type_property:
  723. {
  724. ++m_position;
  725. char_class_type m;
  726. if(m_position == m_end)
  727. {
  728. fail(regex_constants::error_escape, m_position - m_base, "Incomplete property escape found.");
  729. return false;
  730. }
  731. // maybe have \p{ddd}
  732. if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
  733. {
  734. const charT* base = m_position;
  735. // skip forward until we find enclosing brace:
  736. while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
  737. ++m_position;
  738. if(m_position == m_end)
  739. {
  740. fail(regex_constants::error_escape, m_position - m_base, "Closing } missing from property escape sequence.");
  741. return false;
  742. }
  743. m = this->m_traits.lookup_classname(++base, m_position++);
  744. }
  745. else
  746. {
  747. m = this->m_traits.lookup_classname(m_position, m_position+1);
  748. ++m_position;
  749. }
  750. if(m != 0)
  751. {
  752. basic_char_set<charT, traits> char_set;
  753. if(negate)
  754. char_set.negate();
  755. char_set.add_class(m);
  756. if(0 == this->append_set(char_set))
  757. {
  758. fail(regex_constants::error_ctype, m_position - m_base);
  759. return false;
  760. }
  761. return true;
  762. }
  763. fail(regex_constants::error_ctype, m_position - m_base, "Escape sequence was neither a valid property nor a valid character class name.");
  764. return false;
  765. }
  766. case regex_constants::escape_type_reset_start_mark:
  767. if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
  768. {
  769. re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
  770. pb->index = -5;
  771. pb->icase = this->flags() & regbase::icase;
  772. this->m_pdata->m_data.align();
  773. ++m_position;
  774. return true;
  775. }
  776. goto escape_type_class_jump;
  777. case regex_constants::escape_type_line_ending:
  778. if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
  779. {
  780. const charT* e = get_escape_R_string<charT>();
  781. const charT* old_position = m_position;
  782. const charT* old_end = m_end;
  783. const charT* old_base = m_base;
  784. m_position = e;
  785. m_base = e;
  786. m_end = e + traits::length(e);
  787. bool r = parse_all();
  788. m_position = ++old_position;
  789. m_end = old_end;
  790. m_base = old_base;
  791. return r;
  792. }
  793. goto escape_type_class_jump;
  794. case regex_constants::escape_type_extended_backref:
  795. if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
  796. {
  797. bool have_brace = false;
  798. bool negative = false;
  799. static const char* incomplete_message = "Incomplete \\g escape found.";
  800. if(++m_position == m_end)
  801. {
  802. fail(regex_constants::error_escape, m_position - m_base, incomplete_message);
  803. return false;
  804. }
  805. // maybe have \g{ddd}
  806. regex_constants::syntax_type syn = this->m_traits.syntax_type(*m_position);
  807. regex_constants::syntax_type syn_end = 0;
  808. if((syn == regex_constants::syntax_open_brace)
  809. || (syn == regex_constants::escape_type_left_word)
  810. || (syn == regex_constants::escape_type_end_buffer))
  811. {
  812. if(++m_position == m_end)
  813. {
  814. fail(regex_constants::error_escape, m_position - m_base, incomplete_message);
  815. return false;
  816. }
  817. have_brace = true;
  818. switch(syn)
  819. {
  820. case regex_constants::syntax_open_brace:
  821. syn_end = regex_constants::syntax_close_brace;
  822. break;
  823. case regex_constants::escape_type_left_word:
  824. syn_end = regex_constants::escape_type_right_word;
  825. break;
  826. default:
  827. syn_end = regex_constants::escape_type_end_buffer;
  828. break;
  829. }
  830. }
  831. negative = (*m_position == static_cast<charT>('-'));
  832. if((negative) && (++m_position == m_end))
  833. {
  834. fail(regex_constants::error_escape, m_position - m_base, incomplete_message);
  835. return false;
  836. }
  837. const charT* pc = m_position;
  838. int i = this->m_traits.toi(pc, m_end, 10);
  839. if((i < 0) && syn_end)
  840. {
  841. // Check for a named capture, get the leftmost one if there is more than one:
  842. const charT* base = m_position;
  843. while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != syn_end))
  844. {
  845. ++m_position;
  846. }
  847. i = hash_value_from_capture_name(base, m_position);
  848. pc = m_position;
  849. }
  850. if(negative)
  851. i = 1 + m_mark_count - i;
  852. if(((i > 0) && (this->m_backrefs & (1u << (i-1)))) || ((i > 10000) && (this->m_pdata->get_id(i) > 0) && (this->m_backrefs & (1u << (this->m_pdata->get_id(i)-1)))))
  853. {
  854. m_position = pc;
  855. re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_backref, sizeof(re_brace)));
  856. pb->index = i;
  857. pb->icase = this->flags() & regbase::icase;
  858. }
  859. else
  860. {
  861. fail(regex_constants::error_backref, m_position - m_base);
  862. return false;
  863. }
  864. m_position = pc;
  865. if(have_brace)
  866. {
  867. if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != syn_end))
  868. {
  869. fail(regex_constants::error_escape, m_position - m_base, incomplete_message);
  870. return false;
  871. }
  872. ++m_position;
  873. }
  874. return true;
  875. }
  876. goto escape_type_class_jump;
  877. case regex_constants::escape_type_control_v:
  878. if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
  879. goto escape_type_class_jump;
  880. BOOST_FALLTHROUGH;
  881. default:
  882. this->append_literal(unescape_character());
  883. break;
  884. }
  885. return true;
  886. }
  887. template <class charT, class traits>
  888. bool basic_regex_parser<charT, traits>::parse_match_any()
  889. {
  890. //
  891. // we have a '.' that can match any character:
  892. //
  893. ++m_position;
  894. static_cast<re_dot*>(
  895. this->append_state(syntax_element_wild, sizeof(re_dot))
  896. )->mask = static_cast<unsigned char>(this->flags() & regbase::no_mod_s
  897. ? re_detail::force_not_newline
  898. : this->flags() & regbase::mod_s ?
  899. re_detail::force_newline : re_detail::dont_care);
  900. return true;
  901. }
  902. template <class charT, class traits>
  903. bool basic_regex_parser<charT, traits>::parse_repeat(std::size_t low, std::size_t high)
  904. {
  905. bool greedy = true;
  906. bool pocessive = false;
  907. std::size_t insert_point;
  908. //
  909. // when we get to here we may have a non-greedy ? mark still to come:
  910. //
  911. if((m_position != m_end)
  912. && (
  913. (0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
  914. || ((regbase::basic_syntax_group|regbase::emacs_ex) == (this->flags() & (regbase::main_option_type | regbase::emacs_ex)))
  915. )
  916. )
  917. {
  918. // OK we have a perl or emacs regex, check for a '?':
  919. if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_question)
  920. {
  921. greedy = false;
  922. ++m_position;
  923. }
  924. // for perl regexes only check for pocessive ++ repeats.
  925. if((m_position != m_end)
  926. && (0 == (this->flags() & regbase::main_option_type))
  927. && (this->m_traits.syntax_type(*m_position) == regex_constants::syntax_plus))
  928. {
  929. pocessive = true;
  930. ++m_position;
  931. }
  932. }
  933. if(0 == this->m_last_state)
  934. {
  935. fail(regex_constants::error_badrepeat, ::boost::re_detail::distance(m_base, m_position), "Nothing to repeat.");
  936. return false;
  937. }
  938. if(this->m_last_state->type == syntax_element_endmark)
  939. {
  940. // insert a repeat before the '(' matching the last ')':
  941. insert_point = this->m_paren_start;
  942. }
  943. else if((this->m_last_state->type == syntax_element_literal) && (static_cast<re_literal*>(this->m_last_state)->length > 1))
  944. {
  945. // the last state was a literal with more than one character, split it in two:
  946. re_literal* lit = static_cast<re_literal*>(this->m_last_state);
  947. charT c = (static_cast<charT*>(static_cast<void*>(lit+1)))[lit->length - 1];
  948. --(lit->length);
  949. // now append new state:
  950. lit = static_cast<re_literal*>(this->append_state(syntax_element_literal, sizeof(re_literal) + sizeof(charT)));
  951. lit->length = 1;
  952. (static_cast<charT*>(static_cast<void*>(lit+1)))[0] = c;
  953. insert_point = this->getoffset(this->m_last_state);
  954. }
  955. else
  956. {
  957. // repeat the last state whatever it was, need to add some error checking here:
  958. switch(this->m_last_state->type)
  959. {
  960. case syntax_element_start_line:
  961. case syntax_element_end_line:
  962. case syntax_element_word_boundary:
  963. case syntax_element_within_word:
  964. case syntax_element_word_start:
  965. case syntax_element_word_end:
  966. case syntax_element_buffer_start:
  967. case syntax_element_buffer_end:
  968. case syntax_element_alt:
  969. case syntax_element_soft_buffer_end:
  970. case syntax_element_restart_continue:
  971. case syntax_element_jump:
  972. case syntax_element_startmark:
  973. case syntax_element_backstep:
  974. // can't legally repeat any of the above:
  975. fail(regex_constants::error_badrepeat, m_position - m_base);
  976. return false;
  977. default:
  978. // do nothing...
  979. break;
  980. }
  981. insert_point = this->getoffset(this->m_last_state);
  982. }
  983. //
  984. // OK we now know what to repeat, so insert the repeat around it:
  985. //
  986. re_repeat* rep = static_cast<re_repeat*>(this->insert_state(insert_point, syntax_element_rep, re_repeater_size));
  987. rep->min = low;
  988. rep->max = high;
  989. rep->greedy = greedy;
  990. rep->leading = false;
  991. // store our repeater position for later:
  992. std::ptrdiff_t rep_off = this->getoffset(rep);
  993. // and append a back jump to the repeat:
  994. re_jump* jmp = static_cast<re_jump*>(this->append_state(syntax_element_jump, sizeof(re_jump)));
  995. jmp->alt.i = rep_off - this->getoffset(jmp);
  996. this->m_pdata->m_data.align();
  997. // now fill in the alt jump for the repeat:
  998. rep = static_cast<re_repeat*>(this->getaddress(rep_off));
  999. rep->alt.i = this->m_pdata->m_data.size() - rep_off;
  1000. //
  1001. // If the repeat is pocessive then bracket the repeat with a (?>...)
  1002. // independent sub-expression construct:
  1003. //
  1004. if(pocessive)
  1005. {
  1006. if(m_position != m_end)
  1007. {
  1008. //
  1009. // Check for illegal following quantifier, we have to do this here, because
  1010. // the extra states we insert below circumvents our usual error checking :-(
  1011. //
  1012. switch(this->m_traits.syntax_type(*m_position))
  1013. {
  1014. case regex_constants::syntax_star:
  1015. case regex_constants::syntax_plus:
  1016. case regex_constants::syntax_question:
  1017. case regex_constants::syntax_open_brace:
  1018. fail(regex_constants::error_badrepeat, m_position - m_base);
  1019. return false;
  1020. }
  1021. }
  1022. re_brace* pb = static_cast<re_brace*>(this->insert_state(insert_point, syntax_element_startmark, sizeof(re_brace)));
  1023. pb->index = -3;
  1024. pb->icase = this->flags() & regbase::icase;
  1025. jmp = static_cast<re_jump*>(this->insert_state(insert_point + sizeof(re_brace), syntax_element_jump, sizeof(re_jump)));
  1026. this->m_pdata->m_data.align();
  1027. jmp->alt.i = this->m_pdata->m_data.size() - this->getoffset(jmp);
  1028. pb = static_cast<re_brace*>(this->append_state(syntax_element_endmark, sizeof(re_brace)));
  1029. pb->index = -3;
  1030. pb->icase = this->flags() & regbase::icase;
  1031. }
  1032. return true;
  1033. }
  1034. template <class charT, class traits>
  1035. bool basic_regex_parser<charT, traits>::parse_repeat_range(bool isbasic)
  1036. {
  1037. static const char* incomplete_message = "Missing } in quantified repetition.";
  1038. //
  1039. // parse a repeat-range:
  1040. //
  1041. std::size_t min, max;
  1042. int v;
  1043. // skip whitespace:
  1044. while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
  1045. ++m_position;
  1046. if(this->m_position == this->m_end)
  1047. {
  1048. if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
  1049. {
  1050. fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
  1051. return false;
  1052. }
  1053. // Treat the opening '{' as a literal character, rewind to start of error:
  1054. --m_position;
  1055. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
  1056. return parse_literal();
  1057. }
  1058. // get min:
  1059. v = this->m_traits.toi(m_position, m_end, 10);
  1060. // skip whitespace:
  1061. if(v < 0)
  1062. {
  1063. if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
  1064. {
  1065. fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
  1066. return false;
  1067. }
  1068. // Treat the opening '{' as a literal character, rewind to start of error:
  1069. --m_position;
  1070. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
  1071. return parse_literal();
  1072. }
  1073. while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
  1074. ++m_position;
  1075. if(this->m_position == this->m_end)
  1076. {
  1077. if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
  1078. {
  1079. fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
  1080. return false;
  1081. }
  1082. // Treat the opening '{' as a literal character, rewind to start of error:
  1083. --m_position;
  1084. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
  1085. return parse_literal();
  1086. }
  1087. min = v;
  1088. // see if we have a comma:
  1089. if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_comma)
  1090. {
  1091. // move on and error check:
  1092. ++m_position;
  1093. // skip whitespace:
  1094. while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
  1095. ++m_position;
  1096. if(this->m_position == this->m_end)
  1097. {
  1098. if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
  1099. {
  1100. fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
  1101. return false;
  1102. }
  1103. // Treat the opening '{' as a literal character, rewind to start of error:
  1104. --m_position;
  1105. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
  1106. return parse_literal();
  1107. }
  1108. // get the value if any:
  1109. v = this->m_traits.toi(m_position, m_end, 10);
  1110. max = (v >= 0) ? (std::size_t)v : (std::numeric_limits<std::size_t>::max)();
  1111. }
  1112. else
  1113. {
  1114. // no comma, max = min:
  1115. max = min;
  1116. }
  1117. // skip whitespace:
  1118. while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
  1119. ++m_position;
  1120. // OK now check trailing }:
  1121. if(this->m_position == this->m_end)
  1122. {
  1123. if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
  1124. {
  1125. fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
  1126. return false;
  1127. }
  1128. // Treat the opening '{' as a literal character, rewind to start of error:
  1129. --m_position;
  1130. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
  1131. return parse_literal();
  1132. }
  1133. if(isbasic)
  1134. {
  1135. if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_escape)
  1136. {
  1137. ++m_position;
  1138. if(this->m_position == this->m_end)
  1139. {
  1140. fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
  1141. return false;
  1142. }
  1143. }
  1144. else
  1145. {
  1146. fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
  1147. return false;
  1148. }
  1149. }
  1150. if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_brace)
  1151. ++m_position;
  1152. else
  1153. {
  1154. // Treat the opening '{' as a literal character, rewind to start of error:
  1155. --m_position;
  1156. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
  1157. return parse_literal();
  1158. }
  1159. //
  1160. // finally go and add the repeat, unless error:
  1161. //
  1162. if(min > max)
  1163. {
  1164. // Backtrack to error location:
  1165. m_position -= 2;
  1166. while(this->m_traits.isctype(*m_position, this->m_word_mask)) --m_position;
  1167. ++m_position;
  1168. fail(regex_constants::error_badbrace, m_position - m_base);
  1169. return false;
  1170. }
  1171. return parse_repeat(min, max);
  1172. }
  1173. template <class charT, class traits>
  1174. bool basic_regex_parser<charT, traits>::parse_alt()
  1175. {
  1176. //
  1177. // error check: if there have been no previous states,
  1178. // or if the last state was a '(' then error:
  1179. //
  1180. if(
  1181. ((this->m_last_state == 0) || (this->m_last_state->type == syntax_element_startmark))
  1182. &&
  1183. !(
  1184. ((this->flags() & regbase::main_option_type) == regbase::perl_syntax_group)
  1185. &&
  1186. ((this->flags() & regbase::no_empty_expressions) == 0)
  1187. )
  1188. )
  1189. {
  1190. fail(regex_constants::error_empty, this->m_position - this->m_base, "A regular expression can start with the alternation operator |.");
  1191. return false;
  1192. }
  1193. //
  1194. // Reset mark count if required:
  1195. //
  1196. if(m_max_mark < m_mark_count)
  1197. m_max_mark = m_mark_count;
  1198. if(m_mark_reset >= 0)
  1199. m_mark_count = m_mark_reset;
  1200. ++m_position;
  1201. //
  1202. // we need to append a trailing jump:
  1203. //
  1204. re_syntax_base* pj = this->append_state(re_detail::syntax_element_jump, sizeof(re_jump));
  1205. std::ptrdiff_t jump_offset = this->getoffset(pj);
  1206. //
  1207. // now insert the alternative:
  1208. //
  1209. re_alt* palt = static_cast<re_alt*>(this->insert_state(this->m_alt_insert_point, syntax_element_alt, re_alt_size));
  1210. jump_offset += re_alt_size;
  1211. this->m_pdata->m_data.align();
  1212. palt->alt.i = this->m_pdata->m_data.size() - this->getoffset(palt);
  1213. //
  1214. // update m_alt_insert_point so that the next alternate gets
  1215. // inserted at the start of the second of the two we've just created:
  1216. //
  1217. this->m_alt_insert_point = this->m_pdata->m_data.size();
  1218. //
  1219. // the start of this alternative must have a case changes state
  1220. // if the current block has messed around with case changes:
  1221. //
  1222. if(m_has_case_change)
  1223. {
  1224. static_cast<re_case*>(
  1225. this->append_state(syntax_element_toggle_case, sizeof(re_case))
  1226. )->icase = this->m_icase;
  1227. }
  1228. //
  1229. // push the alternative onto our stack, a recursive
  1230. // implementation here is easier to understand (and faster
  1231. // as it happens), but causes all kinds of stack overflow problems
  1232. // on programs with small stacks (COM+).
  1233. //
  1234. m_alt_jumps.push_back(jump_offset);
  1235. return true;
  1236. }
  1237. template <class charT, class traits>
  1238. bool basic_regex_parser<charT, traits>::parse_set()
  1239. {
  1240. static const char* incomplete_message = "Character set declaration starting with [ terminated prematurely - either no ] was found or the set had no content.";
  1241. ++m_position;
  1242. if(m_position == m_end)
  1243. {
  1244. fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
  1245. return false;
  1246. }
  1247. basic_char_set<charT, traits> char_set;
  1248. const charT* base = m_position; // where the '[' was
  1249. const charT* item_base = m_position; // where the '[' or '^' was
  1250. while(m_position != m_end)
  1251. {
  1252. switch(this->m_traits.syntax_type(*m_position))
  1253. {
  1254. case regex_constants::syntax_caret:
  1255. if(m_position == base)
  1256. {
  1257. char_set.negate();
  1258. ++m_position;
  1259. item_base = m_position;
  1260. }
  1261. else
  1262. parse_set_literal(char_set);
  1263. break;
  1264. case regex_constants::syntax_close_set:
  1265. if(m_position == item_base)
  1266. {
  1267. parse_set_literal(char_set);
  1268. break;
  1269. }
  1270. else
  1271. {
  1272. ++m_position;
  1273. if(0 == this->append_set(char_set))
  1274. {
  1275. fail(regex_constants::error_ctype, m_position - m_base);
  1276. return false;
  1277. }
  1278. }
  1279. return true;
  1280. case regex_constants::syntax_open_set:
  1281. if(parse_inner_set(char_set))
  1282. break;
  1283. return true;
  1284. case regex_constants::syntax_escape:
  1285. {
  1286. //
  1287. // look ahead and see if this is a character class shortcut
  1288. // \d \w \s etc...
  1289. //
  1290. ++m_position;
  1291. if(this->m_traits.escape_syntax_type(*m_position)
  1292. == regex_constants::escape_type_class)
  1293. {
  1294. char_class_type m = this->m_traits.lookup_classname(m_position, m_position+1);
  1295. if(m != 0)
  1296. {
  1297. char_set.add_class(m);
  1298. ++m_position;
  1299. break;
  1300. }
  1301. }
  1302. else if(this->m_traits.escape_syntax_type(*m_position)
  1303. == regex_constants::escape_type_not_class)
  1304. {
  1305. // negated character class:
  1306. char_class_type m = this->m_traits.lookup_classname(m_position, m_position+1);
  1307. if(m != 0)
  1308. {
  1309. char_set.add_negated_class(m);
  1310. ++m_position;
  1311. break;
  1312. }
  1313. }
  1314. // not a character class, just a regular escape:
  1315. --m_position;
  1316. parse_set_literal(char_set);
  1317. break;
  1318. }
  1319. default:
  1320. parse_set_literal(char_set);
  1321. break;
  1322. }
  1323. }
  1324. return m_position != m_end;
  1325. }
  1326. template <class charT, class traits>
  1327. bool basic_regex_parser<charT, traits>::parse_inner_set(basic_char_set<charT, traits>& char_set)
  1328. {
  1329. static const char* incomplete_message = "Character class declaration starting with [ terminated prematurely - either no ] was found or the set had no content.";
  1330. //
  1331. // we have either a character class [:name:]
  1332. // a collating element [.name.]
  1333. // or an equivalence class [=name=]
  1334. //
  1335. if(m_end == ++m_position)
  1336. {
  1337. fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
  1338. return false;
  1339. }
  1340. switch(this->m_traits.syntax_type(*m_position))
  1341. {
  1342. case regex_constants::syntax_dot:
  1343. //
  1344. // a collating element is treated as a literal:
  1345. //
  1346. --m_position;
  1347. parse_set_literal(char_set);
  1348. return true;
  1349. case regex_constants::syntax_colon:
  1350. {
  1351. // check that character classes are actually enabled:
  1352. if((this->flags() & (regbase::main_option_type | regbase::no_char_classes))
  1353. == (regbase::basic_syntax_group | regbase::no_char_classes))
  1354. {
  1355. --m_position;
  1356. parse_set_literal(char_set);
  1357. return true;
  1358. }
  1359. // skip the ':'
  1360. if(m_end == ++m_position)
  1361. {
  1362. fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
  1363. return false;
  1364. }
  1365. const charT* name_first = m_position;
  1366. // skip at least one character, then find the matching ':]'
  1367. if(m_end == ++m_position)
  1368. {
  1369. fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
  1370. return false;
  1371. }
  1372. while((m_position != m_end)
  1373. && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_colon))
  1374. ++m_position;
  1375. const charT* name_last = m_position;
  1376. if(m_end == m_position)
  1377. {
  1378. fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
  1379. return false;
  1380. }
  1381. if((m_end == ++m_position)
  1382. || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
  1383. {
  1384. fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
  1385. return false;
  1386. }
  1387. //
  1388. // check for negated class:
  1389. //
  1390. bool negated = false;
  1391. if(this->m_traits.syntax_type(*name_first) == regex_constants::syntax_caret)
  1392. {
  1393. ++name_first;
  1394. negated = true;
  1395. }
  1396. typedef typename traits::char_class_type m_type;
  1397. m_type m = this->m_traits.lookup_classname(name_first, name_last);
  1398. if(m == 0)
  1399. {
  1400. if(char_set.empty() && (name_last - name_first == 1))
  1401. {
  1402. // maybe a special case:
  1403. ++m_position;
  1404. if( (m_position != m_end)
  1405. && (this->m_traits.syntax_type(*m_position)
  1406. == regex_constants::syntax_close_set))
  1407. {
  1408. if(this->m_traits.escape_syntax_type(*name_first)
  1409. == regex_constants::escape_type_left_word)
  1410. {
  1411. ++m_position;
  1412. this->append_state(syntax_element_word_start);
  1413. return false;
  1414. }
  1415. if(this->m_traits.escape_syntax_type(*name_first)
  1416. == regex_constants::escape_type_right_word)
  1417. {
  1418. ++m_position;
  1419. this->append_state(syntax_element_word_end);
  1420. return false;
  1421. }
  1422. }
  1423. }
  1424. fail(regex_constants::error_ctype, name_first - m_base);
  1425. return false;
  1426. }
  1427. if(negated == false)
  1428. char_set.add_class(m);
  1429. else
  1430. char_set.add_negated_class(m);
  1431. ++m_position;
  1432. break;
  1433. }
  1434. case regex_constants::syntax_equal:
  1435. {
  1436. // skip the '='
  1437. if(m_end == ++m_position)
  1438. {
  1439. fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
  1440. return false;
  1441. }
  1442. const charT* name_first = m_position;
  1443. // skip at least one character, then find the matching '=]'
  1444. if(m_end == ++m_position)
  1445. {
  1446. fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
  1447. return false;
  1448. }
  1449. while((m_position != m_end)
  1450. && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal))
  1451. ++m_position;
  1452. const charT* name_last = m_position;
  1453. if(m_end == m_position)
  1454. {
  1455. fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
  1456. return false;
  1457. }
  1458. if((m_end == ++m_position)
  1459. || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
  1460. {
  1461. fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
  1462. return false;
  1463. }
  1464. string_type m = this->m_traits.lookup_collatename(name_first, name_last);
  1465. if((0 == m.size()) || (m.size() > 2))
  1466. {
  1467. fail(regex_constants::error_collate, name_first - m_base);
  1468. return false;
  1469. }
  1470. digraph<charT> d;
  1471. d.first = m[0];
  1472. if(m.size() > 1)
  1473. d.second = m[1];
  1474. else
  1475. d.second = 0;
  1476. char_set.add_equivalent(d);
  1477. ++m_position;
  1478. break;
  1479. }
  1480. default:
  1481. --m_position;
  1482. parse_set_literal(char_set);
  1483. break;
  1484. }
  1485. return true;
  1486. }
  1487. template <class charT, class traits>
  1488. void basic_regex_parser<charT, traits>::parse_set_literal(basic_char_set<charT, traits>& char_set)
  1489. {
  1490. digraph<charT> start_range(get_next_set_literal(char_set));
  1491. if(m_end == m_position)
  1492. {
  1493. fail(regex_constants::error_brack, m_position - m_base);
  1494. return;
  1495. }
  1496. if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_dash)
  1497. {
  1498. // we have a range:
  1499. if(m_end == ++m_position)
  1500. {
  1501. fail(regex_constants::error_brack, m_position - m_base);
  1502. return;
  1503. }
  1504. if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set)
  1505. {
  1506. digraph<charT> end_range = get_next_set_literal(char_set);
  1507. char_set.add_range(start_range, end_range);
  1508. if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_dash)
  1509. {
  1510. if(m_end == ++m_position)
  1511. {
  1512. fail(regex_constants::error_brack, m_position - m_base);
  1513. return;
  1514. }
  1515. if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_set)
  1516. {
  1517. // trailing - :
  1518. --m_position;
  1519. return;
  1520. }
  1521. fail(regex_constants::error_range, m_position - m_base);
  1522. return;
  1523. }
  1524. return;
  1525. }
  1526. --m_position;
  1527. }
  1528. char_set.add_single(start_range);
  1529. }
  1530. template <class charT, class traits>
  1531. digraph<charT> basic_regex_parser<charT, traits>::get_next_set_literal(basic_char_set<charT, traits>& char_set)
  1532. {
  1533. digraph<charT> result;
  1534. switch(this->m_traits.syntax_type(*m_position))
  1535. {
  1536. case regex_constants::syntax_dash:
  1537. if(!char_set.empty())
  1538. {
  1539. // see if we are at the end of the set:
  1540. if((++m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
  1541. {
  1542. fail(regex_constants::error_range, m_position - m_base);
  1543. return result;
  1544. }
  1545. --m_position;
  1546. }
  1547. result.first = *m_position++;
  1548. return result;
  1549. case regex_constants::syntax_escape:
  1550. // check to see if escapes are supported first:
  1551. if(this->flags() & regex_constants::no_escape_in_lists)
  1552. {
  1553. result = *m_position++;
  1554. break;
  1555. }
  1556. ++m_position;
  1557. result = unescape_character();
  1558. break;
  1559. case regex_constants::syntax_open_set:
  1560. {
  1561. if(m_end == ++m_position)
  1562. {
  1563. fail(regex_constants::error_collate, m_position - m_base);
  1564. return result;
  1565. }
  1566. if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_dot)
  1567. {
  1568. --m_position;
  1569. result.first = *m_position;
  1570. ++m_position;
  1571. return result;
  1572. }
  1573. if(m_end == ++m_position)
  1574. {
  1575. fail(regex_constants::error_collate, m_position - m_base);
  1576. return result;
  1577. }
  1578. const charT* name_first = m_position;
  1579. // skip at least one character, then find the matching ':]'
  1580. if(m_end == ++m_position)
  1581. {
  1582. fail(regex_constants::error_collate, name_first - m_base);
  1583. return result;
  1584. }
  1585. while((m_position != m_end)
  1586. && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_dot))
  1587. ++m_position;
  1588. const charT* name_last = m_position;
  1589. if(m_end == m_position)
  1590. {
  1591. fail(regex_constants::error_collate, name_first - m_base);
  1592. return result;
  1593. }
  1594. if((m_end == ++m_position)
  1595. || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
  1596. {
  1597. fail(regex_constants::error_collate, name_first - m_base);
  1598. return result;
  1599. }
  1600. ++m_position;
  1601. string_type s = this->m_traits.lookup_collatename(name_first, name_last);
  1602. if(s.empty() || (s.size() > 2))
  1603. {
  1604. fail(regex_constants::error_collate, name_first - m_base);
  1605. return result;
  1606. }
  1607. result.first = s[0];
  1608. if(s.size() > 1)
  1609. result.second = s[1];
  1610. else
  1611. result.second = 0;
  1612. return result;
  1613. }
  1614. default:
  1615. result = *m_position++;
  1616. }
  1617. return result;
  1618. }
  1619. //
  1620. // does a value fit in the specified charT type?
  1621. //
  1622. template <class charT>
  1623. bool valid_value(charT, int v, const mpl::true_&)
  1624. {
  1625. return (v >> (sizeof(charT) * CHAR_BIT)) == 0;
  1626. }
  1627. template <class charT>
  1628. bool valid_value(charT, int, const mpl::false_&)
  1629. {
  1630. return true; // v will alsways fit in a charT
  1631. }
  1632. template <class charT>
  1633. bool valid_value(charT c, int v)
  1634. {
  1635. return valid_value(c, v, mpl::bool_<(sizeof(charT) < sizeof(int))>());
  1636. }
  1637. template <class charT, class traits>
  1638. charT basic_regex_parser<charT, traits>::unescape_character()
  1639. {
  1640. #ifdef BOOST_MSVC
  1641. #pragma warning(push)
  1642. #pragma warning(disable:4127)
  1643. #endif
  1644. charT result(0);
  1645. if(m_position == m_end)
  1646. {
  1647. fail(regex_constants::error_escape, m_position - m_base, "Escape sequence terminated prematurely.");
  1648. return false;
  1649. }
  1650. switch(this->m_traits.escape_syntax_type(*m_position))
  1651. {
  1652. case regex_constants::escape_type_control_a:
  1653. result = charT('\a');
  1654. break;
  1655. case regex_constants::escape_type_e:
  1656. result = charT(27);
  1657. break;
  1658. case regex_constants::escape_type_control_f:
  1659. result = charT('\f');
  1660. break;
  1661. case regex_constants::escape_type_control_n:
  1662. result = charT('\n');
  1663. break;
  1664. case regex_constants::escape_type_control_r:
  1665. result = charT('\r');
  1666. break;
  1667. case regex_constants::escape_type_control_t:
  1668. result = charT('\t');
  1669. break;
  1670. case regex_constants::escape_type_control_v:
  1671. result = charT('\v');
  1672. break;
  1673. case regex_constants::escape_type_word_assert:
  1674. result = charT('\b');
  1675. break;
  1676. case regex_constants::escape_type_ascii_control:
  1677. ++m_position;
  1678. if(m_position == m_end)
  1679. {
  1680. // Rewind to start of escape:
  1681. --m_position;
  1682. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
  1683. fail(regex_constants::error_escape, m_position - m_base, "ASCII escape sequence terminated prematurely.");
  1684. return result;
  1685. }
  1686. result = static_cast<charT>(*m_position % 32);
  1687. break;
  1688. case regex_constants::escape_type_hex:
  1689. ++m_position;
  1690. if(m_position == m_end)
  1691. {
  1692. // Rewind to start of escape:
  1693. --m_position;
  1694. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
  1695. fail(regex_constants::error_escape, m_position - m_base, "Hexadecimal escape sequence terminated prematurely.");
  1696. return result;
  1697. }
  1698. // maybe have \x{ddd}
  1699. if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
  1700. {
  1701. ++m_position;
  1702. if(m_position == m_end)
  1703. {
  1704. // Rewind to start of escape:
  1705. --m_position;
  1706. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
  1707. fail(regex_constants::error_escape, m_position - m_base, "Missing } in hexadecimal escape sequence.");
  1708. return result;
  1709. }
  1710. int i = this->m_traits.toi(m_position, m_end, 16);
  1711. if((m_position == m_end)
  1712. || (i < 0)
  1713. || ((std::numeric_limits<charT>::is_specialized) && (i > (int)(std::numeric_limits<charT>::max)()))
  1714. || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
  1715. {
  1716. // Rewind to start of escape:
  1717. --m_position;
  1718. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
  1719. fail(regex_constants::error_badbrace, m_position - m_base, "Hexadecimal escape sequence was invalid.");
  1720. return result;
  1721. }
  1722. ++m_position;
  1723. result = charT(i);
  1724. }
  1725. else
  1726. {
  1727. std::ptrdiff_t len = (std::min)(static_cast<std::ptrdiff_t>(2), static_cast<std::ptrdiff_t>(m_end - m_position));
  1728. int i = this->m_traits.toi(m_position, m_position + len, 16);
  1729. if((i < 0)
  1730. || !valid_value(charT(0), i))
  1731. {
  1732. // Rewind to start of escape:
  1733. --m_position;
  1734. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
  1735. fail(regex_constants::error_escape, m_position - m_base, "Escape sequence did not encode a valid character.");
  1736. return result;
  1737. }
  1738. result = charT(i);
  1739. }
  1740. return result;
  1741. case regex_constants::syntax_digit:
  1742. {
  1743. // an octal escape sequence, the first character must be a zero
  1744. // followed by up to 3 octal digits:
  1745. std::ptrdiff_t len = (std::min)(::boost::re_detail::distance(m_position, m_end), static_cast<std::ptrdiff_t>(4));
  1746. const charT* bp = m_position;
  1747. int val = this->m_traits.toi(bp, bp + 1, 8);
  1748. if(val != 0)
  1749. {
  1750. // Rewind to start of escape:
  1751. --m_position;
  1752. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
  1753. // Oops not an octal escape after all:
  1754. fail(regex_constants::error_escape, m_position - m_base, "Invalid octal escape sequence.");
  1755. return result;
  1756. }
  1757. val = this->m_traits.toi(m_position, m_position + len, 8);
  1758. if(val < 0)
  1759. {
  1760. // Rewind to start of escape:
  1761. --m_position;
  1762. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
  1763. fail(regex_constants::error_escape, m_position - m_base, "Octal escape sequence is invalid.");
  1764. return result;
  1765. }
  1766. return static_cast<charT>(val);
  1767. }
  1768. case regex_constants::escape_type_named_char:
  1769. {
  1770. ++m_position;
  1771. if(m_position == m_end)
  1772. {
  1773. // Rewind to start of escape:
  1774. --m_position;
  1775. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
  1776. fail(regex_constants::error_escape, m_position - m_base);
  1777. return false;
  1778. }
  1779. // maybe have \N{name}
  1780. if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
  1781. {
  1782. const charT* base = m_position;
  1783. // skip forward until we find enclosing brace:
  1784. while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
  1785. ++m_position;
  1786. if(m_position == m_end)
  1787. {
  1788. // Rewind to start of escape:
  1789. --m_position;
  1790. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
  1791. fail(regex_constants::error_escape, m_position - m_base);
  1792. return false;
  1793. }
  1794. string_type s = this->m_traits.lookup_collatename(++base, m_position++);
  1795. if(s.empty())
  1796. {
  1797. // Rewind to start of escape:
  1798. --m_position;
  1799. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
  1800. fail(regex_constants::error_collate, m_position - m_base);
  1801. return false;
  1802. }
  1803. if(s.size() == 1)
  1804. {
  1805. return s[0];
  1806. }
  1807. }
  1808. // fall through is a failure:
  1809. // Rewind to start of escape:
  1810. --m_position;
  1811. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
  1812. fail(regex_constants::error_escape, m_position - m_base);
  1813. return false;
  1814. }
  1815. default:
  1816. result = *m_position;
  1817. break;
  1818. }
  1819. ++m_position;
  1820. return result;
  1821. #ifdef BOOST_MSVC
  1822. #pragma warning(pop)
  1823. #endif
  1824. }
  1825. template <class charT, class traits>
  1826. bool basic_regex_parser<charT, traits>::parse_backref()
  1827. {
  1828. BOOST_ASSERT(m_position != m_end);
  1829. const charT* pc = m_position;
  1830. int i = this->m_traits.toi(pc, pc + 1, 10);
  1831. if((i == 0) || (((this->flags() & regbase::main_option_type) == regbase::perl_syntax_group) && (this->flags() & regbase::no_bk_refs)))
  1832. {
  1833. // not a backref at all but an octal escape sequence:
  1834. charT c = unescape_character();
  1835. this->append_literal(c);
  1836. }
  1837. else if((i > 0) && (this->m_backrefs & (1u << (i-1))))
  1838. {
  1839. m_position = pc;
  1840. re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_backref, sizeof(re_brace)));
  1841. pb->index = i;
  1842. pb->icase = this->flags() & regbase::icase;
  1843. }
  1844. else
  1845. {
  1846. // Rewind to start of escape:
  1847. --m_position;
  1848. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
  1849. fail(regex_constants::error_backref, m_position - m_base);
  1850. return false;
  1851. }
  1852. return true;
  1853. }
  1854. template <class charT, class traits>
  1855. bool basic_regex_parser<charT, traits>::parse_QE()
  1856. {
  1857. #ifdef BOOST_MSVC
  1858. #pragma warning(push)
  1859. #pragma warning(disable:4127)
  1860. #endif
  1861. //
  1862. // parse a \Q...\E sequence:
  1863. //
  1864. ++m_position; // skip the Q
  1865. const charT* start = m_position;
  1866. const charT* end;
  1867. do
  1868. {
  1869. while((m_position != m_end)
  1870. && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape))
  1871. ++m_position;
  1872. if(m_position == m_end)
  1873. {
  1874. // a \Q...\E sequence may terminate with the end of the expression:
  1875. end = m_position;
  1876. break;
  1877. }
  1878. if(++m_position == m_end) // skip the escape
  1879. {
  1880. fail(regex_constants::error_escape, m_position - m_base, "Unterminated \\Q...\\E sequence.");
  1881. return false;
  1882. }
  1883. // check to see if it's a \E:
  1884. if(this->m_traits.escape_syntax_type(*m_position) == regex_constants::escape_type_E)
  1885. {
  1886. ++m_position;
  1887. end = m_position - 2;
  1888. break;
  1889. }
  1890. // otherwise go round again:
  1891. }while(true);
  1892. //
  1893. // now add all the character between the two escapes as literals:
  1894. //
  1895. while(start != end)
  1896. {
  1897. this->append_literal(*start);
  1898. ++start;
  1899. }
  1900. return true;
  1901. #ifdef BOOST_MSVC
  1902. #pragma warning(pop)
  1903. #endif
  1904. }
  1905. template <class charT, class traits>
  1906. bool basic_regex_parser<charT, traits>::parse_perl_extension()
  1907. {
  1908. if(++m_position == m_end)
  1909. {
  1910. // Rewind to start of (? sequence:
  1911. --m_position;
  1912. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  1913. fail(regex_constants::error_perl_extension, m_position - m_base);
  1914. return false;
  1915. }
  1916. //
  1917. // treat comments as a special case, as these
  1918. // are the only ones that don't start with a leading
  1919. // startmark state:
  1920. //
  1921. if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_hash)
  1922. {
  1923. while((m_position != m_end)
  1924. && (this->m_traits.syntax_type(*m_position++) != regex_constants::syntax_close_mark))
  1925. {}
  1926. return true;
  1927. }
  1928. //
  1929. // backup some state, and prepare the way:
  1930. //
  1931. int markid = 0;
  1932. std::ptrdiff_t jump_offset = 0;
  1933. re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
  1934. pb->icase = this->flags() & regbase::icase;
  1935. std::ptrdiff_t last_paren_start = this->getoffset(pb);
  1936. // back up insertion point for alternations, and set new point:
  1937. std::ptrdiff_t last_alt_point = m_alt_insert_point;
  1938. this->m_pdata->m_data.align();
  1939. m_alt_insert_point = this->m_pdata->m_data.size();
  1940. std::ptrdiff_t expected_alt_point = m_alt_insert_point;
  1941. bool restore_flags = true;
  1942. regex_constants::syntax_option_type old_flags = this->flags();
  1943. bool old_case_change = m_has_case_change;
  1944. m_has_case_change = false;
  1945. charT name_delim;
  1946. int mark_reset = m_mark_reset;
  1947. int max_mark = m_max_mark;
  1948. m_mark_reset = -1;
  1949. m_max_mark = m_mark_count;
  1950. int v;
  1951. //
  1952. // select the actual extension used:
  1953. //
  1954. switch(this->m_traits.syntax_type(*m_position))
  1955. {
  1956. case regex_constants::syntax_or:
  1957. m_mark_reset = m_mark_count;
  1958. BOOST_FALLTHROUGH;
  1959. case regex_constants::syntax_colon:
  1960. //
  1961. // a non-capturing mark:
  1962. //
  1963. pb->index = markid = 0;
  1964. ++m_position;
  1965. break;
  1966. case regex_constants::syntax_digit:
  1967. {
  1968. //
  1969. // a recursive subexpression:
  1970. //
  1971. v = this->m_traits.toi(m_position, m_end, 10);
  1972. if((v < 0) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
  1973. {
  1974. // Rewind to start of (? sequence:
  1975. --m_position;
  1976. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  1977. fail(regex_constants::error_perl_extension, m_position - m_base, "The recursive sub-expression refers to an invalid marking group, or is unterminated.");
  1978. return false;
  1979. }
  1980. insert_recursion:
  1981. pb->index = markid = 0;
  1982. re_recurse* pr = static_cast<re_recurse*>(this->append_state(syntax_element_recurse, sizeof(re_recurse)));
  1983. pr->alt.i = v;
  1984. pr->state_id = 0;
  1985. static_cast<re_case*>(
  1986. this->append_state(syntax_element_toggle_case, sizeof(re_case))
  1987. )->icase = this->flags() & regbase::icase;
  1988. break;
  1989. }
  1990. case regex_constants::syntax_plus:
  1991. //
  1992. // A forward-relative recursive subexpression:
  1993. //
  1994. ++m_position;
  1995. v = this->m_traits.toi(m_position, m_end, 10);
  1996. if((v <= 0) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
  1997. {
  1998. // Rewind to start of (? sequence:
  1999. --m_position;
  2000. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2001. fail(regex_constants::error_perl_extension, m_position - m_base, "An invalid or unterminated recursive sub-expression.");
  2002. return false;
  2003. }
  2004. v += m_mark_count;
  2005. goto insert_recursion;
  2006. case regex_constants::syntax_dash:
  2007. //
  2008. // Possibly a backward-relative recursive subexpression:
  2009. //
  2010. ++m_position;
  2011. v = this->m_traits.toi(m_position, m_end, 10);
  2012. if(v <= 0)
  2013. {
  2014. --m_position;
  2015. // Oops not a relative recursion at all, but a (?-imsx) group:
  2016. goto option_group_jump;
  2017. }
  2018. v = m_mark_count + 1 - v;
  2019. if(v <= 0)
  2020. {
  2021. // Rewind to start of (? sequence:
  2022. --m_position;
  2023. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2024. fail(regex_constants::error_perl_extension, m_position - m_base, "An invalid or unterminated recursive sub-expression.");
  2025. return false;
  2026. }
  2027. goto insert_recursion;
  2028. case regex_constants::syntax_equal:
  2029. pb->index = markid = -1;
  2030. ++m_position;
  2031. jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
  2032. this->m_pdata->m_data.align();
  2033. m_alt_insert_point = this->m_pdata->m_data.size();
  2034. break;
  2035. case regex_constants::syntax_not:
  2036. pb->index = markid = -2;
  2037. ++m_position;
  2038. jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
  2039. this->m_pdata->m_data.align();
  2040. m_alt_insert_point = this->m_pdata->m_data.size();
  2041. break;
  2042. case regex_constants::escape_type_left_word:
  2043. {
  2044. // a lookbehind assertion:
  2045. if(++m_position == m_end)
  2046. {
  2047. // Rewind to start of (? sequence:
  2048. --m_position;
  2049. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2050. fail(regex_constants::error_perl_extension, m_position - m_base);
  2051. return false;
  2052. }
  2053. regex_constants::syntax_type t = this->m_traits.syntax_type(*m_position);
  2054. if(t == regex_constants::syntax_not)
  2055. pb->index = markid = -2;
  2056. else if(t == regex_constants::syntax_equal)
  2057. pb->index = markid = -1;
  2058. else
  2059. {
  2060. // Probably a named capture which also starts (?< :
  2061. name_delim = '>';
  2062. --m_position;
  2063. goto named_capture_jump;
  2064. }
  2065. ++m_position;
  2066. jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
  2067. this->append_state(syntax_element_backstep, sizeof(re_brace));
  2068. this->m_pdata->m_data.align();
  2069. m_alt_insert_point = this->m_pdata->m_data.size();
  2070. break;
  2071. }
  2072. case regex_constants::escape_type_right_word:
  2073. //
  2074. // an independent sub-expression:
  2075. //
  2076. pb->index = markid = -3;
  2077. ++m_position;
  2078. jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
  2079. this->m_pdata->m_data.align();
  2080. m_alt_insert_point = this->m_pdata->m_data.size();
  2081. break;
  2082. case regex_constants::syntax_open_mark:
  2083. {
  2084. // a conditional expression:
  2085. pb->index = markid = -4;
  2086. if(++m_position == m_end)
  2087. {
  2088. // Rewind to start of (? sequence:
  2089. --m_position;
  2090. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2091. fail(regex_constants::error_perl_extension, m_position - m_base);
  2092. return false;
  2093. }
  2094. v = this->m_traits.toi(m_position, m_end, 10);
  2095. if(m_position == m_end)
  2096. {
  2097. // Rewind to start of (? sequence:
  2098. --m_position;
  2099. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2100. fail(regex_constants::error_perl_extension, m_position - m_base);
  2101. return false;
  2102. }
  2103. if(*m_position == charT('R'))
  2104. {
  2105. if(++m_position == m_end)
  2106. {
  2107. // Rewind to start of (? sequence:
  2108. --m_position;
  2109. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2110. fail(regex_constants::error_perl_extension, m_position - m_base);
  2111. return false;
  2112. }
  2113. if(*m_position == charT('&'))
  2114. {
  2115. const charT* base = ++m_position;
  2116. while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
  2117. ++m_position;
  2118. if(m_position == m_end)
  2119. {
  2120. // Rewind to start of (? sequence:
  2121. --m_position;
  2122. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2123. fail(regex_constants::error_perl_extension, m_position - m_base);
  2124. return false;
  2125. }
  2126. v = -static_cast<int>(hash_value_from_capture_name(base, m_position));
  2127. }
  2128. else
  2129. {
  2130. v = -this->m_traits.toi(m_position, m_end, 10);
  2131. }
  2132. re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace)));
  2133. br->index = v < 0 ? (v - 1) : 0;
  2134. if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
  2135. {
  2136. // Rewind to start of (? sequence:
  2137. --m_position;
  2138. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2139. fail(regex_constants::error_perl_extension, m_position - m_base);
  2140. return false;
  2141. }
  2142. if(++m_position == m_end)
  2143. {
  2144. // Rewind to start of (? sequence:
  2145. --m_position;
  2146. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2147. fail(regex_constants::error_perl_extension, m_position - m_base);
  2148. return false;
  2149. }
  2150. }
  2151. else if((*m_position == charT('\'')) || (*m_position == charT('<')))
  2152. {
  2153. const charT* base = ++m_position;
  2154. while((m_position != m_end) && (*m_position != charT('>')) && (*m_position != charT('\'')))
  2155. ++m_position;
  2156. if(m_position == m_end)
  2157. {
  2158. // Rewind to start of (? sequence:
  2159. --m_position;
  2160. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2161. fail(regex_constants::error_perl_extension, m_position - m_base);
  2162. return false;
  2163. }
  2164. v = static_cast<int>(hash_value_from_capture_name(base, m_position));
  2165. re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace)));
  2166. br->index = v;
  2167. if(((*m_position != charT('>')) && (*m_position != charT('\''))) || (++m_position == m_end))
  2168. {
  2169. // Rewind to start of (? sequence:
  2170. --m_position;
  2171. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2172. fail(regex_constants::error_perl_extension, m_position - m_base, "Unterminated named capture.");
  2173. return false;
  2174. }
  2175. if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
  2176. {
  2177. // Rewind to start of (? sequence:
  2178. --m_position;
  2179. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2180. fail(regex_constants::error_perl_extension, m_position - m_base);
  2181. return false;
  2182. }
  2183. if(++m_position == m_end)
  2184. {
  2185. // Rewind to start of (? sequence:
  2186. --m_position;
  2187. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2188. fail(regex_constants::error_perl_extension, m_position - m_base);
  2189. return false;
  2190. }
  2191. }
  2192. else if(*m_position == charT('D'))
  2193. {
  2194. const char* def = "DEFINE";
  2195. while(*def && (m_position != m_end) && (*m_position == charT(*def)))
  2196. ++m_position, ++def;
  2197. if((m_position == m_end) || *def)
  2198. {
  2199. // Rewind to start of (? sequence:
  2200. --m_position;
  2201. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2202. fail(regex_constants::error_perl_extension, m_position - m_base);
  2203. return false;
  2204. }
  2205. re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace)));
  2206. br->index = 9999; // special magic value!
  2207. if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
  2208. {
  2209. // Rewind to start of (? sequence:
  2210. --m_position;
  2211. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2212. fail(regex_constants::error_perl_extension, m_position - m_base);
  2213. return false;
  2214. }
  2215. if(++m_position == m_end)
  2216. {
  2217. // Rewind to start of (? sequence:
  2218. --m_position;
  2219. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2220. fail(regex_constants::error_perl_extension, m_position - m_base);
  2221. return false;
  2222. }
  2223. }
  2224. else if(v > 0)
  2225. {
  2226. re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace)));
  2227. br->index = v;
  2228. if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
  2229. {
  2230. // Rewind to start of (? sequence:
  2231. --m_position;
  2232. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2233. fail(regex_constants::error_perl_extension, m_position - m_base);
  2234. return false;
  2235. }
  2236. if(++m_position == m_end)
  2237. {
  2238. // Rewind to start of (? sequence:
  2239. --m_position;
  2240. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2241. fail(regex_constants::error_perl_extension, m_position - m_base);
  2242. return false;
  2243. }
  2244. }
  2245. else
  2246. {
  2247. // verify that we have a lookahead or lookbehind assert:
  2248. if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_question)
  2249. {
  2250. // Rewind to start of (? sequence:
  2251. --m_position;
  2252. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2253. fail(regex_constants::error_perl_extension, m_position - m_base);
  2254. return false;
  2255. }
  2256. if(++m_position == m_end)
  2257. {
  2258. // Rewind to start of (? sequence:
  2259. --m_position;
  2260. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2261. fail(regex_constants::error_perl_extension, m_position - m_base);
  2262. return false;
  2263. }
  2264. if(this->m_traits.syntax_type(*m_position) == regex_constants::escape_type_left_word)
  2265. {
  2266. if(++m_position == m_end)
  2267. {
  2268. // Rewind to start of (? sequence:
  2269. --m_position;
  2270. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2271. fail(regex_constants::error_perl_extension, m_position - m_base);
  2272. return false;
  2273. }
  2274. if((this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal)
  2275. && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_not))
  2276. {
  2277. // Rewind to start of (? sequence:
  2278. --m_position;
  2279. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2280. fail(regex_constants::error_perl_extension, m_position - m_base);
  2281. return false;
  2282. }
  2283. m_position -= 3;
  2284. }
  2285. else
  2286. {
  2287. if((this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal)
  2288. && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_not))
  2289. {
  2290. // Rewind to start of (? sequence:
  2291. --m_position;
  2292. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2293. fail(regex_constants::error_perl_extension, m_position - m_base);
  2294. return false;
  2295. }
  2296. m_position -= 2;
  2297. }
  2298. }
  2299. break;
  2300. }
  2301. case regex_constants::syntax_close_mark:
  2302. // Rewind to start of (? sequence:
  2303. --m_position;
  2304. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2305. fail(regex_constants::error_perl_extension, m_position - m_base);
  2306. return false;
  2307. case regex_constants::escape_type_end_buffer:
  2308. {
  2309. name_delim = *m_position;
  2310. named_capture_jump:
  2311. markid = 0;
  2312. if(0 == (this->flags() & regbase::nosubs))
  2313. {
  2314. markid = ++m_mark_count;
  2315. #ifndef BOOST_NO_STD_DISTANCE
  2316. if(this->flags() & regbase::save_subexpression_location)
  2317. this->m_pdata->m_subs.push_back(std::pair<std::size_t, std::size_t>(std::distance(m_base, m_position) - 2, 0));
  2318. #else
  2319. if(this->flags() & regbase::save_subexpression_location)
  2320. this->m_pdata->m_subs.push_back(std::pair<std::size_t, std::size_t>((m_position - m_base) - 2, 0));
  2321. #endif
  2322. }
  2323. pb->index = markid;
  2324. const charT* base = ++m_position;
  2325. if(m_position == m_end)
  2326. {
  2327. // Rewind to start of (? sequence:
  2328. --m_position;
  2329. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2330. fail(regex_constants::error_perl_extension, m_position - m_base);
  2331. return false;
  2332. }
  2333. while((m_position != m_end) && (*m_position != name_delim))
  2334. ++m_position;
  2335. if(m_position == m_end)
  2336. {
  2337. // Rewind to start of (? sequence:
  2338. --m_position;
  2339. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2340. fail(regex_constants::error_perl_extension, m_position - m_base);
  2341. return false;
  2342. }
  2343. this->m_pdata->set_name(base, m_position, markid);
  2344. ++m_position;
  2345. break;
  2346. }
  2347. default:
  2348. if(*m_position == charT('R'))
  2349. {
  2350. ++m_position;
  2351. v = 0;
  2352. if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
  2353. {
  2354. // Rewind to start of (? sequence:
  2355. --m_position;
  2356. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2357. fail(regex_constants::error_perl_extension, m_position - m_base);
  2358. return false;
  2359. }
  2360. goto insert_recursion;
  2361. }
  2362. if(*m_position == charT('&'))
  2363. {
  2364. ++m_position;
  2365. const charT* base = m_position;
  2366. while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
  2367. ++m_position;
  2368. if(m_position == m_end)
  2369. {
  2370. // Rewind to start of (? sequence:
  2371. --m_position;
  2372. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2373. fail(regex_constants::error_perl_extension, m_position - m_base);
  2374. return false;
  2375. }
  2376. v = static_cast<int>(hash_value_from_capture_name(base, m_position));
  2377. goto insert_recursion;
  2378. }
  2379. if(*m_position == charT('P'))
  2380. {
  2381. ++m_position;
  2382. if(m_position == m_end)
  2383. {
  2384. // Rewind to start of (? sequence:
  2385. --m_position;
  2386. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2387. fail(regex_constants::error_perl_extension, m_position - m_base);
  2388. return false;
  2389. }
  2390. if(*m_position == charT('>'))
  2391. {
  2392. ++m_position;
  2393. const charT* base = m_position;
  2394. while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
  2395. ++m_position;
  2396. if(m_position == m_end)
  2397. {
  2398. // Rewind to start of (? sequence:
  2399. --m_position;
  2400. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2401. fail(regex_constants::error_perl_extension, m_position - m_base);
  2402. return false;
  2403. }
  2404. v = static_cast<int>(hash_value_from_capture_name(base, m_position));
  2405. goto insert_recursion;
  2406. }
  2407. }
  2408. //
  2409. // lets assume that we have a (?imsx) group and try and parse it:
  2410. //
  2411. option_group_jump:
  2412. regex_constants::syntax_option_type opts = parse_options();
  2413. if(m_position == m_end)
  2414. {
  2415. // Rewind to start of (? sequence:
  2416. --m_position;
  2417. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2418. fail(regex_constants::error_perl_extension, m_position - m_base);
  2419. return false;
  2420. }
  2421. // make a note of whether we have a case change:
  2422. m_has_case_change = ((opts & regbase::icase) != (this->flags() & regbase::icase));
  2423. pb->index = markid = 0;
  2424. if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark)
  2425. {
  2426. // update flags and carry on as normal:
  2427. this->flags(opts);
  2428. restore_flags = false;
  2429. old_case_change |= m_has_case_change; // defer end of scope by one ')'
  2430. }
  2431. else if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_colon)
  2432. {
  2433. // update flags and carry on until the matching ')' is found:
  2434. this->flags(opts);
  2435. ++m_position;
  2436. }
  2437. else
  2438. {
  2439. // Rewind to start of (? sequence:
  2440. --m_position;
  2441. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2442. fail(regex_constants::error_perl_extension, m_position - m_base);
  2443. return false;
  2444. }
  2445. // finally append a case change state if we need it:
  2446. if(m_has_case_change)
  2447. {
  2448. static_cast<re_case*>(
  2449. this->append_state(syntax_element_toggle_case, sizeof(re_case))
  2450. )->icase = opts & regbase::icase;
  2451. }
  2452. }
  2453. //
  2454. // now recursively add more states, this will terminate when we get to a
  2455. // matching ')' :
  2456. //
  2457. parse_all();
  2458. //
  2459. // Unwind alternatives:
  2460. //
  2461. if(0 == unwind_alts(last_paren_start))
  2462. {
  2463. // Rewind to start of (? sequence:
  2464. --m_position;
  2465. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2466. fail(regex_constants::error_perl_extension, m_position - m_base, "Invalid alternation operators within (?...) block.");
  2467. return false;
  2468. }
  2469. //
  2470. // we either have a ')' or we have run out of characters prematurely:
  2471. //
  2472. if(m_position == m_end)
  2473. {
  2474. // Rewind to start of (? sequence:
  2475. --m_position;
  2476. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2477. this->fail(regex_constants::error_paren, ::boost::re_detail::distance(m_base, m_end));
  2478. return false;
  2479. }
  2480. BOOST_ASSERT(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark);
  2481. ++m_position;
  2482. //
  2483. // restore the flags:
  2484. //
  2485. if(restore_flags)
  2486. {
  2487. // append a case change state if we need it:
  2488. if(m_has_case_change)
  2489. {
  2490. static_cast<re_case*>(
  2491. this->append_state(syntax_element_toggle_case, sizeof(re_case))
  2492. )->icase = old_flags & regbase::icase;
  2493. }
  2494. this->flags(old_flags);
  2495. }
  2496. //
  2497. // set up the jump pointer if we have one:
  2498. //
  2499. if(jump_offset)
  2500. {
  2501. this->m_pdata->m_data.align();
  2502. re_jump* jmp = static_cast<re_jump*>(this->getaddress(jump_offset));
  2503. jmp->alt.i = this->m_pdata->m_data.size() - this->getoffset(jmp);
  2504. if((this->m_last_state == jmp) && (markid != -2))
  2505. {
  2506. // Oops... we didn't have anything inside the assertion.
  2507. // Note we don't get here for negated forward lookahead as (?!)
  2508. // does have some uses.
  2509. // Rewind to start of (? sequence:
  2510. --m_position;
  2511. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2512. fail(regex_constants::error_perl_extension, m_position - m_base, "Invalid or empty zero width assertion.");
  2513. return false;
  2514. }
  2515. }
  2516. //
  2517. // verify that if this is conditional expression, that we do have
  2518. // an alternative, if not add one:
  2519. //
  2520. if(markid == -4)
  2521. {
  2522. re_syntax_base* b = this->getaddress(expected_alt_point);
  2523. // Make sure we have exactly one alternative following this state:
  2524. if(b->type != syntax_element_alt)
  2525. {
  2526. re_alt* alt = static_cast<re_alt*>(this->insert_state(expected_alt_point, syntax_element_alt, sizeof(re_alt)));
  2527. alt->alt.i = this->m_pdata->m_data.size() - this->getoffset(alt);
  2528. }
  2529. else if(this->getaddress(static_cast<re_alt*>(b)->alt.i, b)->type == syntax_element_alt)
  2530. {
  2531. // Can't have seen more than one alternative:
  2532. // Rewind to start of (? sequence:
  2533. --m_position;
  2534. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2535. fail(regex_constants::error_bad_pattern, m_position - m_base, "More than one alternation operator | was encountered inside a conditional expression.");
  2536. return false;
  2537. }
  2538. else
  2539. {
  2540. // We must *not* have seen an alternative inside a (DEFINE) block:
  2541. b = this->getaddress(b->next.i, b);
  2542. if((b->type == syntax_element_assert_backref) && (static_cast<re_brace*>(b)->index == 9999))
  2543. {
  2544. // Rewind to start of (? sequence:
  2545. --m_position;
  2546. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2547. fail(regex_constants::error_bad_pattern, m_position - m_base, "Alternation operators are not allowed inside a DEFINE block.");
  2548. return false;
  2549. }
  2550. }
  2551. // check for invalid repetition of next state:
  2552. b = this->getaddress(expected_alt_point);
  2553. b = this->getaddress(static_cast<re_alt*>(b)->next.i, b);
  2554. if((b->type != syntax_element_assert_backref)
  2555. && (b->type != syntax_element_startmark))
  2556. {
  2557. // Rewind to start of (? sequence:
  2558. --m_position;
  2559. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2560. fail(regex_constants::error_badrepeat, m_position - m_base, "A repetition operator cannot be applied to a zero-width assertion.");
  2561. return false;
  2562. }
  2563. }
  2564. //
  2565. // append closing parenthesis state:
  2566. //
  2567. pb = static_cast<re_brace*>(this->append_state(syntax_element_endmark, sizeof(re_brace)));
  2568. pb->index = markid;
  2569. pb->icase = this->flags() & regbase::icase;
  2570. this->m_paren_start = last_paren_start;
  2571. //
  2572. // restore the alternate insertion point:
  2573. //
  2574. this->m_alt_insert_point = last_alt_point;
  2575. //
  2576. // and the case change data:
  2577. //
  2578. m_has_case_change = old_case_change;
  2579. //
  2580. // And the mark_reset data:
  2581. //
  2582. if(m_max_mark > m_mark_count)
  2583. {
  2584. m_mark_count = m_max_mark;
  2585. }
  2586. m_mark_reset = mark_reset;
  2587. m_max_mark = max_mark;
  2588. if(markid > 0)
  2589. {
  2590. #ifndef BOOST_NO_STD_DISTANCE
  2591. if(this->flags() & regbase::save_subexpression_location)
  2592. this->m_pdata->m_subs.at(markid - 1).second = std::distance(m_base, m_position) - 1;
  2593. #else
  2594. if(this->flags() & regbase::save_subexpression_location)
  2595. this->m_pdata->m_subs.at(markid - 1).second = (m_position - m_base) - 1;
  2596. #endif
  2597. //
  2598. // allow backrefs to this mark:
  2599. //
  2600. if((markid > 0) && (markid < (int)(sizeof(unsigned) * CHAR_BIT)))
  2601. this->m_backrefs |= 1u << (markid - 1);
  2602. }
  2603. return true;
  2604. }
  2605. template <class charT, class traits>
  2606. bool basic_regex_parser<charT, traits>::add_emacs_code(bool negate)
  2607. {
  2608. //
  2609. // parses an emacs style \sx or \Sx construct.
  2610. //
  2611. if(++m_position == m_end)
  2612. {
  2613. // Rewind to start of sequence:
  2614. --m_position;
  2615. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
  2616. fail(regex_constants::error_escape, m_position - m_base);
  2617. return false;
  2618. }
  2619. basic_char_set<charT, traits> char_set;
  2620. if(negate)
  2621. char_set.negate();
  2622. static const charT s_punct[5] = { 'p', 'u', 'n', 'c', 't', };
  2623. switch(*m_position)
  2624. {
  2625. case 's':
  2626. case ' ':
  2627. char_set.add_class(this->m_mask_space);
  2628. break;
  2629. case 'w':
  2630. char_set.add_class(this->m_word_mask);
  2631. break;
  2632. case '_':
  2633. char_set.add_single(digraph<charT>(charT('$')));
  2634. char_set.add_single(digraph<charT>(charT('&')));
  2635. char_set.add_single(digraph<charT>(charT('*')));
  2636. char_set.add_single(digraph<charT>(charT('+')));
  2637. char_set.add_single(digraph<charT>(charT('-')));
  2638. char_set.add_single(digraph<charT>(charT('_')));
  2639. char_set.add_single(digraph<charT>(charT('<')));
  2640. char_set.add_single(digraph<charT>(charT('>')));
  2641. break;
  2642. case '.':
  2643. char_set.add_class(this->m_traits.lookup_classname(s_punct, s_punct+5));
  2644. break;
  2645. case '(':
  2646. char_set.add_single(digraph<charT>(charT('(')));
  2647. char_set.add_single(digraph<charT>(charT('[')));
  2648. char_set.add_single(digraph<charT>(charT('{')));
  2649. break;
  2650. case ')':
  2651. char_set.add_single(digraph<charT>(charT(')')));
  2652. char_set.add_single(digraph<charT>(charT(']')));
  2653. char_set.add_single(digraph<charT>(charT('}')));
  2654. break;
  2655. case '"':
  2656. char_set.add_single(digraph<charT>(charT('"')));
  2657. char_set.add_single(digraph<charT>(charT('\'')));
  2658. char_set.add_single(digraph<charT>(charT('`')));
  2659. break;
  2660. case '\'':
  2661. char_set.add_single(digraph<charT>(charT('\'')));
  2662. char_set.add_single(digraph<charT>(charT(',')));
  2663. char_set.add_single(digraph<charT>(charT('#')));
  2664. break;
  2665. case '<':
  2666. char_set.add_single(digraph<charT>(charT(';')));
  2667. break;
  2668. case '>':
  2669. char_set.add_single(digraph<charT>(charT('\n')));
  2670. char_set.add_single(digraph<charT>(charT('\f')));
  2671. break;
  2672. default:
  2673. fail(regex_constants::error_ctype, m_position - m_base);
  2674. return false;
  2675. }
  2676. if(0 == this->append_set(char_set))
  2677. {
  2678. fail(regex_constants::error_ctype, m_position - m_base);
  2679. return false;
  2680. }
  2681. ++m_position;
  2682. return true;
  2683. }
  2684. template <class charT, class traits>
  2685. regex_constants::syntax_option_type basic_regex_parser<charT, traits>::parse_options()
  2686. {
  2687. // we have a (?imsx-imsx) group, convert it into a set of flags:
  2688. regex_constants::syntax_option_type f = this->flags();
  2689. bool breakout = false;
  2690. do
  2691. {
  2692. switch(*m_position)
  2693. {
  2694. case 's':
  2695. f |= regex_constants::mod_s;
  2696. f &= ~regex_constants::no_mod_s;
  2697. break;
  2698. case 'm':
  2699. f &= ~regex_constants::no_mod_m;
  2700. break;
  2701. case 'i':
  2702. f |= regex_constants::icase;
  2703. break;
  2704. case 'x':
  2705. f |= regex_constants::mod_x;
  2706. break;
  2707. default:
  2708. breakout = true;
  2709. continue;
  2710. }
  2711. if(++m_position == m_end)
  2712. {
  2713. // Rewind to start of (? sequence:
  2714. --m_position;
  2715. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2716. fail(regex_constants::error_paren, m_position - m_base);
  2717. return false;
  2718. }
  2719. }
  2720. while(!breakout);
  2721. breakout = false;
  2722. if(*m_position == static_cast<charT>('-'))
  2723. {
  2724. if(++m_position == m_end)
  2725. {
  2726. // Rewind to start of (? sequence:
  2727. --m_position;
  2728. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2729. fail(regex_constants::error_paren, m_position - m_base);
  2730. return false;
  2731. }
  2732. do
  2733. {
  2734. switch(*m_position)
  2735. {
  2736. case 's':
  2737. f &= ~regex_constants::mod_s;
  2738. f |= regex_constants::no_mod_s;
  2739. break;
  2740. case 'm':
  2741. f |= regex_constants::no_mod_m;
  2742. break;
  2743. case 'i':
  2744. f &= ~regex_constants::icase;
  2745. break;
  2746. case 'x':
  2747. f &= ~regex_constants::mod_x;
  2748. break;
  2749. default:
  2750. breakout = true;
  2751. continue;
  2752. }
  2753. if(++m_position == m_end)
  2754. {
  2755. // Rewind to start of (? sequence:
  2756. --m_position;
  2757. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2758. fail(regex_constants::error_paren, m_position - m_base);
  2759. return false;
  2760. }
  2761. }
  2762. while(!breakout);
  2763. }
  2764. return f;
  2765. }
  2766. template <class charT, class traits>
  2767. bool basic_regex_parser<charT, traits>::unwind_alts(std::ptrdiff_t last_paren_start)
  2768. {
  2769. //
  2770. // If we didn't actually add any states after the last
  2771. // alternative then that's an error:
  2772. //
  2773. if((this->m_alt_insert_point == static_cast<std::ptrdiff_t>(this->m_pdata->m_data.size()))
  2774. && m_alt_jumps.size() && (m_alt_jumps.back() > last_paren_start)
  2775. &&
  2776. !(
  2777. ((this->flags() & regbase::main_option_type) == regbase::perl_syntax_group)
  2778. &&
  2779. ((this->flags() & regbase::no_empty_expressions) == 0)
  2780. )
  2781. )
  2782. {
  2783. fail(regex_constants::error_empty, this->m_position - this->m_base, "Can't terminate a sub-expression with an alternation operator |.");
  2784. return false;
  2785. }
  2786. //
  2787. // Fix up our alternatives:
  2788. //
  2789. while(m_alt_jumps.size() && (m_alt_jumps.back() > last_paren_start))
  2790. {
  2791. //
  2792. // fix up the jump to point to the end of the states
  2793. // that we've just added:
  2794. //
  2795. std::ptrdiff_t jump_offset = m_alt_jumps.back();
  2796. m_alt_jumps.pop_back();
  2797. this->m_pdata->m_data.align();
  2798. re_jump* jmp = static_cast<re_jump*>(this->getaddress(jump_offset));
  2799. BOOST_ASSERT(jmp->type == syntax_element_jump);
  2800. jmp->alt.i = this->m_pdata->m_data.size() - jump_offset;
  2801. }
  2802. return true;
  2803. }
  2804. #ifdef BOOST_MSVC
  2805. #pragma warning(pop)
  2806. #endif
  2807. } // namespace re_detail
  2808. } // namespace boost
  2809. #ifdef BOOST_MSVC
  2810. #pragma warning(push)
  2811. #pragma warning(disable: 4103)
  2812. #endif
  2813. #ifdef BOOST_HAS_ABI_HEADERS
  2814. # include BOOST_ABI_SUFFIX
  2815. #endif
  2816. #ifdef BOOST_MSVC
  2817. #pragma warning(pop)
  2818. #endif
  2819. #endif