1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136
// Copyright 2014-2016 Johannes Köster. // Licensed under the MIT license (http://opensource.org/licenses/MIT) // This file may not be copied, modified, or distributed // except according to those terms. //! Algorithm of Horspool. //! Window-based, similar to but faster than Boyer-Moore. //! //! # Idea //! Look at a search window m, match pattern backwards. //! In case of a mismatch, you can jump behind that. //! Best case time complexity: O(n / m) //! Worst case time complexity: O(n * m) //! With a large alphabet, you are likely //! around the best case, and faster than the rather //! complicated Boyer-Moore. //! //! The algorithm has two phases (let a be the last symbol in the window): //! //! 1. test phase: compare the last symbol of the window. //! If it matches, compare the whole pattern. //! If it does not match, continue with the shift phase. //! 2. shift phase: let l[a] be the rightmost position of a in //! the pattern without the last symbol. If it does not occur //! let l[a] be -1. Shift the window by m - 1 - l[a]. I.e. //! we shift the window such that the rightmost a matches //! the a at the end of the last window. //! If a does not occur in the pattern, we shift by the whole length. //! //! # Example //! //! ``` //! use bio::pattern_matching::horspool::Horspool; //! let text = b"ACGGCTAGGAAAAAGACTGAGGACTGAAAA"; //! let pattern = b"GAAAA"; //! let horspool = Horspool::new(pattern); //! let occ: Vec<usize> = horspool.find_all(text).collect(); //! assert_eq!(occ, [8, 25]); //! ``` use utils::TextSlice; /// Algorithm of Horspool. pub struct Horspool<'a> { shift: Vec<usize>, m: usize, pattern: TextSlice<'a>, } impl<'a> Horspool<'a> { /// Create a new instance for a given pattern. pub fn new(pattern: TextSlice<'a>) -> Self { let m = pattern.len(); let mut shift = vec![m; 256]; // shift is m for all not occurring characters // and m - 1 - j for all others for (j, &a) in pattern[..m - 1].iter().enumerate() { shift[a as usize] = m - 1 - j; } Horspool { m: m, shift: shift, pattern: pattern, } } /// Find all matches with a given text. Matches are returned as an iterator over start positions. pub fn find_all<'b>(&'b self, text: TextSlice<'b>) -> Matches { Matches { horspool: self, text: text, n: text.len(), last: self.m - 1, pattern_last: self.pattern[self.m - 1], } } } /// Iterator over start positions of matches. pub struct Matches<'a> { horspool: &'a Horspool<'a>, text: TextSlice<'a>, n: usize, last: usize, pattern_last: u8, } impl<'a> Iterator for Matches<'a> { type Item = usize; fn next(&mut self) -> Option<usize> { loop { // shift until the last symbol matches while self.last < self.n && self.text[self.last] != self.pattern_last { self.last += self.horspool.shift[self.text[self.last] as usize]; } // stop if end of text is reached if self.last >= self.n { return None; } // putative start position let i = self.last + 1 - self.horspool.m; let j = self.last; // shift again (after both match and mismatch, this makes sense) self.last += self.horspool.shift[self.pattern_last as usize]; if self.text[i..j] == self.horspool.pattern[..self.horspool.m - 1] { return Some(i); } } } } #[cfg(test)] mod tests { use super::Horspool; #[test] fn test_shift() { let pattern = b"AACB"; let horspool = Horspool::new(pattern); assert_eq!(horspool.shift[b'A' as usize], 2); assert_eq!(horspool.shift[b'C' as usize], 1); assert_eq!(horspool.shift[b'B' as usize], 4); assert_eq!(horspool.shift[b'X' as usize], 4); } }