boyer_moore.hpp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268
  1. /*
  2. Copyright (c) Marshall Clow 2010-2012.
  3. Distributed under the Boost Software License, Version 1.0. (See accompanying
  4. file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  5. For more information, see http://www.boost.org
  6. */
  7. #ifndef BOOST_ALGORITHM_BOYER_MOORE_SEARCH_HPP
  8. #define BOOST_ALGORITHM_BOYER_MOORE_SEARCH_HPP
  9. #include <iterator> // for std::iterator_traits
  10. #include <boost/assert.hpp>
  11. #include <boost/static_assert.hpp>
  12. #include <boost/range/begin.hpp>
  13. #include <boost/range/end.hpp>
  14. #include <boost/utility/enable_if.hpp>
  15. #include <boost/type_traits/is_same.hpp>
  16. #include <boost/algorithm/searching/detail/bm_traits.hpp>
  17. #include <boost/algorithm/searching/detail/debugging.hpp>
  18. namespace boost { namespace algorithm {
  19. /*
  20. A templated version of the boyer-moore searching algorithm.
  21. References:
  22. http://www.cs.utexas.edu/users/moore/best-ideas/string-searching/
  23. http://www.cs.utexas.edu/~moore/publications/fstrpos.pdf
  24. Explanations:
  25. http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  26. http://www.movsd.com/bm.htm
  27. http://www.cs.ucdavis.edu/~gusfield/cs224f09/bnotes.pdf
  28. The Boyer-Moore search algorithm uses two tables, a "bad character" table
  29. to tell how far to skip ahead when it hits a character that is not in the pattern,
  30. and a "good character" table to tell how far to skip ahead when it hits a
  31. mismatch on a character that _is_ in the pattern.
  32. Requirements:
  33. * Random access iterators
  34. * The two iterator types (patIter and corpusIter) must
  35. "point to" the same underlying type and be comparable.
  36. * Additional requirements may be imposed by the skip table, such as:
  37. ** Numeric type (array-based skip table)
  38. ** Hashable type (map-based skip table)
  39. */
  40. template <typename patIter, typename traits = detail::BM_traits<patIter> >
  41. class boyer_moore {
  42. typedef typename std::iterator_traits<patIter>::difference_type difference_type;
  43. public:
  44. boyer_moore ( patIter first, patIter last )
  45. : pat_first ( first ), pat_last ( last ),
  46. k_pattern_length ( std::distance ( pat_first, pat_last )),
  47. skip_ ( k_pattern_length, -1 ),
  48. suffix_ ( k_pattern_length + 1 )
  49. {
  50. this->build_skip_table ( first, last );
  51. this->build_suffix_table ( first, last );
  52. }
  53. ~boyer_moore () {}
  54. /// \fn operator ( corpusIter corpus_first, corpusIter corpus_last )
  55. /// \brief Searches the corpus for the pattern that was passed into the constructor
  56. ///
  57. /// \param corpus_first The start of the data to search (Random Access Iterator)
  58. /// \param corpus_last One past the end of the data to search
  59. ///
  60. template <typename corpusIter>
  61. corpusIter operator () ( corpusIter corpus_first, corpusIter corpus_last ) const {
  62. BOOST_STATIC_ASSERT (( boost::is_same<
  63. typename std::iterator_traits<patIter>::value_type,
  64. typename std::iterator_traits<corpusIter>::value_type>::value ));
  65. if ( corpus_first == corpus_last ) return corpus_last; // if nothing to search, we didn't find it!
  66. if ( pat_first == pat_last ) return corpus_first; // empty pattern matches at start
  67. const difference_type k_corpus_length = std::distance ( corpus_first, corpus_last );
  68. // If the pattern is larger than the corpus, we can't find it!
  69. if ( k_corpus_length < k_pattern_length )
  70. return corpus_last;
  71. // Do the search
  72. return this->do_search ( corpus_first, corpus_last );
  73. }
  74. template <typename Range>
  75. typename boost::range_iterator<Range>::type operator () ( Range &r ) const {
  76. return (*this) (boost::begin(r), boost::end(r));
  77. }
  78. private:
  79. /// \cond DOXYGEN_HIDE
  80. patIter pat_first, pat_last;
  81. const difference_type k_pattern_length;
  82. typename traits::skip_table_t skip_;
  83. std::vector <difference_type> suffix_;
  84. /// \fn operator ( corpusIter corpus_first, corpusIter corpus_last, Pred p )
  85. /// \brief Searches the corpus for the pattern that was passed into the constructor
  86. ///
  87. /// \param corpus_first The start of the data to search (Random Access Iterator)
  88. /// \param corpus_last One past the end of the data to search
  89. /// \param p A predicate used for the search comparisons.
  90. ///
  91. template <typename corpusIter>
  92. corpusIter do_search ( corpusIter corpus_first, corpusIter corpus_last ) const {
  93. /* ---- Do the matching ---- */
  94. corpusIter curPos = corpus_first;
  95. const corpusIter lastPos = corpus_last - k_pattern_length;
  96. difference_type j, k, m;
  97. while ( curPos <= lastPos ) {
  98. /* while ( std::distance ( curPos, corpus_last ) >= k_pattern_length ) { */
  99. // Do we match right where we are?
  100. j = k_pattern_length;
  101. while ( pat_first [j-1] == curPos [j-1] ) {
  102. j--;
  103. // We matched - we're done!
  104. if ( j == 0 )
  105. return curPos;
  106. }
  107. // Since we didn't match, figure out how far to skip forward
  108. k = skip_ [ curPos [ j - 1 ]];
  109. m = j - k - 1;
  110. if ( k < j && m > suffix_ [ j ] )
  111. curPos += m;
  112. else
  113. curPos += suffix_ [ j ];
  114. }
  115. return corpus_last; // We didn't find anything
  116. }
  117. void build_skip_table ( patIter first, patIter last ) {
  118. for ( std::size_t i = 0; first != last; ++first, ++i )
  119. skip_.insert ( *first, i );
  120. }
  121. template<typename Iter, typename Container>
  122. void compute_bm_prefix ( Iter pat_first, Iter pat_last, Container &prefix ) {
  123. const std::size_t count = std::distance ( pat_first, pat_last );
  124. BOOST_ASSERT ( count > 0 );
  125. BOOST_ASSERT ( prefix.size () == count );
  126. prefix[0] = 0;
  127. std::size_t k = 0;
  128. for ( std::size_t i = 1; i < count; ++i ) {
  129. BOOST_ASSERT ( k < count );
  130. while ( k > 0 && ( pat_first[k] != pat_first[i] )) {
  131. BOOST_ASSERT ( k < count );
  132. k = prefix [ k - 1 ];
  133. }
  134. if ( pat_first[k] == pat_first[i] )
  135. k++;
  136. prefix [ i ] = k;
  137. }
  138. }
  139. void build_suffix_table ( patIter pat_first, patIter pat_last ) {
  140. const std::size_t count = (std::size_t) std::distance ( pat_first, pat_last );
  141. if ( count > 0 ) { // empty pattern
  142. std::vector<typename std::iterator_traits<patIter>::value_type> reversed(count);
  143. (void) std::reverse_copy ( pat_first, pat_last, reversed.begin ());
  144. std::vector<difference_type> prefix (count);
  145. compute_bm_prefix ( pat_first, pat_last, prefix );
  146. std::vector<difference_type> prefix_reversed (count);
  147. compute_bm_prefix ( reversed.begin (), reversed.end (), prefix_reversed );
  148. for ( std::size_t i = 0; i <= count; i++ )
  149. suffix_[i] = count - prefix [count-1];
  150. for ( std::size_t i = 0; i < count; i++ ) {
  151. const std::size_t j = count - prefix_reversed[i];
  152. const difference_type k = i - prefix_reversed[i] + 1;
  153. if (suffix_[j] > k)
  154. suffix_[j] = k;
  155. }
  156. }
  157. }
  158. /// \endcond
  159. };
  160. /* Two ranges as inputs gives us four possibilities; with 2,3,3,4 parameters
  161. Use a bit of TMP to disambiguate the 3-argument templates */
  162. /// \fn boyer_moore_search ( corpusIter corpus_first, corpusIter corpus_last,
  163. /// patIter pat_first, patIter pat_last )
  164. /// \brief Searches the corpus for the pattern.
  165. ///
  166. /// \param corpus_first The start of the data to search (Random Access Iterator)
  167. /// \param corpus_last One past the end of the data to search
  168. /// \param pat_first The start of the pattern to search for (Random Access Iterator)
  169. /// \param pat_last One past the end of the data to search for
  170. ///
  171. template <typename patIter, typename corpusIter>
  172. corpusIter boyer_moore_search (
  173. corpusIter corpus_first, corpusIter corpus_last,
  174. patIter pat_first, patIter pat_last )
  175. {
  176. boyer_moore<patIter> bm ( pat_first, pat_last );
  177. return bm ( corpus_first, corpus_last );
  178. }
  179. template <typename PatternRange, typename corpusIter>
  180. corpusIter boyer_moore_search (
  181. corpusIter corpus_first, corpusIter corpus_last, const PatternRange &pattern )
  182. {
  183. typedef typename boost::range_iterator<const PatternRange>::type pattern_iterator;
  184. boyer_moore<pattern_iterator> bm ( boost::begin(pattern), boost::end (pattern));
  185. return bm ( corpus_first, corpus_last );
  186. }
  187. template <typename patIter, typename CorpusRange>
  188. typename boost::lazy_disable_if_c<
  189. boost::is_same<CorpusRange, patIter>::value, typename boost::range_iterator<CorpusRange> >
  190. ::type
  191. boyer_moore_search ( CorpusRange &corpus, patIter pat_first, patIter pat_last )
  192. {
  193. boyer_moore<patIter> bm ( pat_first, pat_last );
  194. return bm (boost::begin (corpus), boost::end (corpus));
  195. }
  196. template <typename PatternRange, typename CorpusRange>
  197. typename boost::range_iterator<CorpusRange>::type
  198. boyer_moore_search ( CorpusRange &corpus, const PatternRange &pattern )
  199. {
  200. typedef typename boost::range_iterator<const PatternRange>::type pattern_iterator;
  201. boyer_moore<pattern_iterator> bm ( boost::begin(pattern), boost::end (pattern));
  202. return bm (boost::begin (corpus), boost::end (corpus));
  203. }
  204. // Creator functions -- take a pattern range, return an object
  205. template <typename Range>
  206. boost::algorithm::boyer_moore<typename boost::range_iterator<const Range>::type>
  207. make_boyer_moore ( const Range &r ) {
  208. return boost::algorithm::boyer_moore
  209. <typename boost::range_iterator<const Range>::type> (boost::begin(r), boost::end(r));
  210. }
  211. template <typename Range>
  212. boost::algorithm::boyer_moore<typename boost::range_iterator<Range>::type>
  213. make_boyer_moore ( Range &r ) {
  214. return boost::algorithm::boyer_moore
  215. <typename boost::range_iterator<Range>::type> (boost::begin(r), boost::end(r));
  216. }
  217. }}
  218. #endif // BOOST_ALGORITHM_BOYER_MOORE_SEARCH_HPP