Class: Flashtext::KeywordProcessor
- Inherits:
-
Object
- Object
- Flashtext::KeywordProcessor
- Defined in:
- lib/flashtext/keyword_processor.rb
Instance Attribute Summary collapse
-
#_keyword ⇒ Object
Returns the value of attribute _keyword.
-
#_white_space_chars ⇒ Object
Returns the value of attribute _white_space_chars.
-
#case_sensitive ⇒ Object
Returns the value of attribute case_sensitive.
-
#keyword_trie_hash ⇒ Object
Returns the value of attribute keyword_trie_hash.
-
#word_boundaries ⇒ Object
Returns the value of attribute word_boundaries.
Instance Method Summary collapse
- #add_keyword(keyword, clean_name = nil) ⇒ Object
- #add_keywords_from_hash(keyword_hash) ⇒ Object
- #extract_keywords(sentence) ⇒ Object
-
#initialize(case_sensitive = false) ⇒ KeywordProcessor
constructor
A new instance of KeywordProcessor.
- #replace_keywords(sentence) ⇒ Object
Constructor Details
#initialize(case_sensitive = false) ⇒ KeywordProcessor
Returns a new instance of KeywordProcessor.
7 8 9 10 11 12 13 |
# File 'lib/flashtext/keyword_processor.rb', line 7
#
# Builds a processor with an empty keyword trie.
#
# @param case_sensitive [Boolean] stored as-is; the other methods downcase
#   keywords and sentences whenever this is falsy (the default)
# @return [KeywordProcessor] a new instance of KeywordProcessor
def initialize(case_sensitive = false)
  # Key stored inside a trie node to mark "a keyword ends here". It is longer
  # than one character, so it cannot clash with the per-character keys.
  self._keyword = '_keyword_'
  # Double-quoted literals are required here: in single quotes '\t', '\n' and
  # '\a' are two-character backslash sequences, not tab / newline / bell.
  self._white_space_chars = Set.new([".", "\t", "\n", "\a", " ", ","])
  self.keyword_trie_hash = {}
  self.case_sensitive = case_sensitive
  # Despite the name, this holds the characters that may appear INSIDE a word
  # (digits, ASCII letters, underscore); anything else ends a word.
  self.word_boundaries = Set.new([*'0'..'9', *'A'..'Z', *'a'..'z', '_'])
end
Instance Attribute Details
#_keyword ⇒ Object
Returns the value of attribute _keyword.
5 6 7 |
# File 'lib/flashtext/keyword_processor.rb', line 5
#
# Reader for the +_keyword+ attribute.
#
# @return [Object] whatever is currently stored in @_keyword
def _keyword
  @_keyword
end
#_white_space_chars ⇒ Object
Returns the value of attribute _white_space_chars.
5 6 7 |
# File 'lib/flashtext/keyword_processor.rb', line 5
#
# Reader for the +_white_space_chars+ attribute.
#
# @return [Object] whatever is currently stored in @_white_space_chars
def _white_space_chars
  @_white_space_chars
end
#case_sensitive ⇒ Object
Returns the value of attribute case_sensitive.
5 6 7 |
# File 'lib/flashtext/keyword_processor.rb', line 5
#
# Reader for the +case_sensitive+ attribute.
#
# @return [Object] whatever is currently stored in @case_sensitive
def case_sensitive
  @case_sensitive
end
#keyword_trie_hash ⇒ Object
Returns the value of attribute keyword_trie_hash.
5 6 7 |
# File 'lib/flashtext/keyword_processor.rb', line 5
#
# Reader for the +keyword_trie_hash+ attribute.
#
# @return [Object] whatever is currently stored in @keyword_trie_hash
def keyword_trie_hash
  @keyword_trie_hash
end
#word_boundaries ⇒ Object
Returns the value of attribute word_boundaries.
5 6 7 |
# File 'lib/flashtext/keyword_processor.rb', line 5
#
# Reader for the +word_boundaries+ attribute.
#
# @return [Object] whatever is currently stored in @word_boundaries
def word_boundaries
  @word_boundaries
end
Instance Method Details
#add_keyword(keyword, clean_name = nil) ⇒ Object
15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 |
# File 'lib/flashtext/keyword_processor.rb', line 15
#
# Inserts +keyword+ into the trie, one nested hash per character, and stores
# +clean_name+ under the terminal marker key at the end of that path.
#
# @param keyword [String, nil] text to register; downcased first unless the
#   processor is case sensitive
# @param clean_name [Object, nil] value recorded for the keyword; falls back
#   to +keyword+ (as passed in, before downcasing) when nil or false
# @return [Object, nil] the stored clean name, or nil when nothing was added
def add_keyword(keyword, clean_name = nil)
  # "" is truthy in Ruby. Without this guard an empty keyword would write the
  # terminal marker onto the trie root, so every later scan would report a
  # match at each boundary character.
  return if !keyword || keyword.empty?

  clean_name ||= keyword
  keyword = keyword.downcase unless case_sensitive

  # Walk (creating nodes as needed) down to the node for the last character.
  terminal = keyword.each_char.reduce(keyword_trie_hash) { |node, char| node[char] ||= {} }
  terminal[_keyword] = clean_name
end
#add_keywords_from_hash(keyword_hash) ⇒ Object
35 36 37 38 39 40 41 42 43 |
# File 'lib/flashtext/keyword_processor.rb', line 35
#
# Registers many keywords at once from a { clean_name => [keyword, ...] } hash.
# Every key and keyword is converted with +to_s+ before being handed to
# #add_keyword.
#
# Entries are processed in iteration order, so keywords from entries that
# precede an invalid one have already been added when the error is raised.
#
# @param keyword_hash [Hash{Object => Array}] clean names mapped to the
#   keywords that should resolve to them
# @return [Hash] +keyword_hash+ itself
# @raise [ArgumentError] if +keyword_hash+ is not a Hash, or one of its
#   values is not an Array
def add_keywords_from_hash(keyword_hash)
  unless keyword_hash.is_a?(Hash)
    raise ArgumentError, "#{keyword_hash} is not hash. argument expected: Hash"
  end

  keyword_hash.each do |clean_name, keywords|
    # Report the offending value itself; looking up the literal key
    # 'clean_name' in the hash almost always produced an empty message.
    unless keywords.is_a?(Array)
      raise ArgumentError, "#{keywords.inspect} is not array. expected: Array"
    end

    keywords.each { |keyword| add_keyword(keyword.to_s, clean_name.to_s) }
  end
end
#extract_keywords(sentence) ⇒ Object
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
# File 'lib/flashtext/keyword_processor.rb', line 45
#
# Scans +sentence+ once and collects the clean name of every registered
# keyword found in it, in order of appearance. When several keywords start at
# the same place, the longest one that ends on a word boundary wins.
#
# The sentence is downcased first unless the processor is case sensitive.
# Note that +word_boundaries+ holds the characters that belong INSIDE a word;
# a character that is not a member is what ends a word here.
#
# @param sentence [String, nil] text to scan
# @return [Array] clean names in match order; empty for a nil or empty sentence
def extract_keywords(sentence)
  keywords_extracted = []
  # Nothing to scan. (Previously this guard had no `return`, so nil fell
  # through and raised NoMethodError on #downcase.)
  return keywords_extracted if sentence.nil? || sentence.empty?

  sentence = sentence.downcase unless case_sensitive
  current_hash = keyword_trie_hash
  sequence_end_pos = 0
  idx = 0
  sentence_len = sentence.length

  while idx < sentence_len
    char = sentence[idx]

    if !word_boundaries.member?(char)
      # A word just ended. Either a keyword ends exactly here, or the
      # separator itself continues a multi-word keyword.
      if current_hash.key?(_keyword) || current_hash.key?(char)
        longest_sequence_found = nil
        is_longer_seq_found = false

        if current_hash.key?(_keyword)
          longest_sequence_found = current_hash[_keyword]
          sequence_end_pos = idx
        end

        # Look ahead from this position for a longer keyword.
        if current_hash.key?(char)
          current_hash_continued = current_hash[char]
          idy = idx + 1
          while idy < sentence_len
            inner_char = sentence[idy]
            # Each keyword ending on a boundary replaces the previous
            # candidate, so the last one recorded is the longest.
            if !word_boundaries.member?(inner_char) && current_hash_continued.key?(_keyword)
              longest_sequence_found = current_hash_continued[_keyword]
              sequence_end_pos = idy
              is_longer_seq_found = true
            end
            break unless current_hash_continued.key?(inner_char)

            current_hash_continued = current_hash_continued[inner_char]
            idy += 1
          end

          # The look-ahead ran off the end of the sentence on a keyword.
          if idy == sentence_len && current_hash_continued.key?(_keyword)
            longest_sequence_found = current_hash_continued[_keyword]
            sequence_end_pos = idy
            is_longer_seq_found = true
          end

          # Resume the outer scan where the longer keyword ended.
          idx = sequence_end_pos if is_longer_seq_found
        end

        current_hash = keyword_trie_hash
        keywords_extracted << longest_sequence_found if longest_sequence_found
      else
        current_hash = keyword_trie_hash
      end
    elsif current_hash.key?(char)
      # Still on a path through the trie.
      current_hash = current_hash[char]
    else
      # This word cannot be a keyword: restart and skip to the end of it.
      current_hash = keyword_trie_hash
      idx += 1 while idx < sentence_len && word_boundaries.member?(sentence[idx])
    end

    # The sentence ended while sitting on a complete keyword.
    if idx + 1 >= sentence_len && current_hash.key?(_keyword)
      keywords_extracted << current_hash[_keyword]
    end

    idx += 1
  end

  keywords_extracted
end
#replace_keywords(sentence) ⇒ Object
134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 |
# File 'lib/flashtext/keyword_processor.rb', line 134
#
# Returns a copy of +sentence+ in which every registered keyword is replaced
# by its clean name. Text that matches no keyword is copied through with its
# original casing. When several keywords start at the same place, the longest
# one that ends on a word boundary wins.
#
# Matching runs against a downcased copy unless the processor is case
# sensitive. Note that +word_boundaries+ holds the characters that belong
# INSIDE a word; a character that is not a member is what ends a word here.
#
# NOTE(review): the original and the downcased string are indexed with the
# same position, which assumes downcasing never changes the string length -
# confirm for non-ASCII input.
#
# @param sentence [String, nil] text to rewrite
# @return [String, nil] the rewritten text; +sentence+ itself when it is nil
#   or empty
def replace_keywords(sentence)
  return sentence if sentence.nil? || sentence.empty?

  new_sentence = ''
  original_sentence = sentence
  sentence = sentence.downcase unless case_sensitive
  current_word = ''          # original text consumed since the last flush
  current_hash = keyword_trie_hash
  current_white_space = ''   # separator to re-emit after a replacement
  sequence_end_pos = 0
  idx = 0
  sentence_len = sentence.length

  while idx < sentence_len
    char = sentence[idx]
    current_word += original_sentence[idx]

    if !word_boundaries.member?(char)
      current_white_space = char
      # A word just ended. Either a keyword ends exactly here, or the
      # separator itself continues a multi-word keyword.
      if current_hash.key?(_keyword) || current_hash.key?(char)
        longest_sequence_found = nil
        is_longer_seq_found = false

        if current_hash.key?(_keyword)
          longest_sequence_found = current_hash[_keyword]
          sequence_end_pos = idx
        end

        # Look ahead from this position for a longer keyword.
        if current_hash.key?(char)
          current_hash_continued = current_hash[char]
          current_word_continued = current_word
          idy = idx + 1
          while idy < sentence_len
            inner_char = sentence[idy]
            current_word_continued += original_sentence[idy]
            # Each keyword ending on a boundary replaces the previous
            # candidate, so the last one recorded is the longest.
            if !word_boundaries.member?(inner_char) && current_hash_continued.key?(_keyword)
              current_white_space = inner_char
              longest_sequence_found = current_hash_continued[_keyword]
              sequence_end_pos = idy
              is_longer_seq_found = true
            end
            break unless current_hash_continued.key?(inner_char)

            current_hash_continued = current_hash_continued[inner_char]
            idy += 1
          end

          # The look-ahead ran off the end of the sentence on a keyword, so
          # there is no separator to re-emit.
          if idy == sentence_len && current_hash_continued.key?(_keyword)
            current_white_space = ''
            longest_sequence_found = current_hash_continued[_keyword]
            sequence_end_pos = idy
            is_longer_seq_found = true
          end

          # Resume the outer scan where the longer keyword ended.
          if is_longer_seq_found
            idx = sequence_end_pos
            current_word = current_word_continued
          end
        end

        current_hash = keyword_trie_hash
        new_sentence +=
          if longest_sequence_found
            longest_sequence_found + current_white_space
          else
            current_word
          end
      else
        current_hash = keyword_trie_hash
        new_sentence += current_word
      end
      current_word = ''
      current_white_space = ''
    elsif current_hash.key?(char)
      # Still on a path through the trie; keep buffering in current_word.
      current_hash = current_hash[char]
    else
      # This word cannot be a keyword: copy the rest of it, up to and
      # including the separator that ends it, straight through.
      current_hash = keyword_trie_hash
      idy = idx + 1
      while idy < sentence_len
        current_word += original_sentence[idy]
        break unless word_boundaries.member?(sentence[idy])

        idy += 1
      end
      idx = idy
      new_sentence += current_word
      current_word = ''
      current_white_space = ''
    end

    # End of sentence: emit the keyword being sat on, otherwise flush the
    # buffered text. Without the flush, a sentence ending partway through a
    # keyword prefix (e.g. "bay" when only "bay area" is registered) lost its
    # trailing word.
    if idx + 1 >= sentence_len
      new_sentence += current_hash.key?(_keyword) ? current_hash[_keyword] : current_word
    end

    idx += 1
  end

  new_sentence
end