Class: Picky::Tokenizer

Inherits:
Object show all
Extended by:
Helpers::Identification
Includes:
API::Tokenizer::CharacterSubstituter, API::Tokenizer::Stemmer
Defined in:
lib/picky/tokenizer.rb,
lib/picky/tokenizer/regexp_wrapper.rb

Overview

Defines tokenizing processes used both in indexing and querying.

Defined Under Namespace

Classes: RegexpWrapper

Constant Summary collapse

@@non_single_stopword_regexp =
/^\b[\w:]+?\b[\.\*\~]?\s?$/

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Helpers::Identification

identifier_for

Methods included from API::Tokenizer::Stemmer

#extract_stemmer

Methods included from API::Tokenizer::CharacterSubstituter

#extract_character_substituter

Constructor Details

#initialize(options = {}) ⇒ Tokenizer

Returns a new instance of Tokenizer.



205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
# File 'lib/picky/tokenizer.rb', line 205

# Creates a new tokenizer configured by the given options hash.
#
# Each option key is dispatched as a method call on the tokenizer
# (e.g. stopwords, splits_text_on), so an unknown key surfaces as a
# NoMethodError, which is rescued deliberately to present a helpful
# configuration error (listing valid options) instead of a raw crash.
#
def initialize options = {}
  options = default_options.merge options
  options.each do |method_name, value|
    # nil means "not configured" — skip so defaults stay untouched.
    send method_name, value unless value.nil?
  end
rescue NoMethodError => e
  # Fixed: the interpolation of the offending option name was escaped
  # (\#{e.name}), so the message printed the literal text instead of
  # the actual option key.
  raise "The option \"#{e.name}\" is not a valid option for a Picky tokenizer.\nPlease see https://github.com/floere/picky/wiki/Indexing-configuration for valid options.\nA short overview:\n  removes_characters          /regexp/\n  stopwords                   /regexp/\n  splits_text_on              /regexp/ or \"String\", default /\\s/\n  normalizes_words            [[/replace (this)/, 'with this \\\\1'], ...]\n  rejects_token_if            Proc/lambda, default :empty?.to_proc\n  substitutes_characters_with Picky::CharacterSubstituter or responds to #substitute(String)\n  stems_with                  Instance responds to #stem(String)\n  case_sensitive              true/false\n\n"
end

Instance Attribute Details

#stemmerObject (readonly) Also known as: stemmer?

Returns the value of attribute stemmer.



201
202
203
# File 'lib/picky/tokenizer.rb', line 201

# The configured stemmer object (set via #stems_with), or nil when
# no stemming is configured. Also aliased as #stemmer? so it can be
# used as a truthiness predicate.
def stemmer
  @stemmer
end

#substituterObject (readonly) Also known as: substituter?

Returns the value of attribute substituter.



201
202
203
# File 'lib/picky/tokenizer.rb', line 201

# The configured character substituter (set via
# #substitutes_characters_with), or nil when none is configured.
# Also aliased as #substituter? for use as a truthiness predicate.
def substituter
  @substituter
end

Class Method Details

.default_indexing_with(options = {}) ⇒ Object



13
14
15
# File 'lib/picky/tokenizer.rb', line 13

# Configures the tokenizer used for indexing by default.
#
# Accepts the same things as Tokenizer.from (an options Hash or an
# object responding to #tokenize) and stores the resulting tokenizer.
def self.default_indexing_with options = {}
  @indexing = from(options)
end

.default_searching_with(options = {}) ⇒ Object



20
21
22
# File 'lib/picky/tokenizer.rb', line 20

# Configures the tokenizer used for searching by default.
#
# Accepts the same things as Tokenizer.from (an options Hash or an
# object responding to #tokenize) and stores the resulting tokenizer.
def self.default_searching_with options = {}
  @searching = from(options)
end

.from(thing, index_name = nil, category_name = nil) ⇒ Object



27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/picky/tokenizer.rb', line 27

# Builds a tokenizer from +thing+:
#
# * nil / false        => nil (no tokenizer configured)
# * responds to #tokenize => used as-is (a custom tokenizer)
# * responds to #[]    => treated as an options Hash for Tokenizer.new
#
# Anything else raises with an explanatory message; index_name and
# category_name only serve to identify the offending index in it.
#
def self.from thing, index_name = nil, category_name = nil
  return unless thing

  # Duck typing: anything that can tokenize is accepted directly.
  return thing if thing.respond_to? :tokenize

  # Hash-like arguments become tokenizer options.
  return Picky::Tokenizer.new(thing) if thing.respond_to? :[]

  # Fixed: the interpolation was escaped (\#{...}), so the identifier
  # never appeared in the error message.
  raise "indexing options #{identifier_for(index_name, category_name)}should be either\n* a Hash\nor\n* an object that responds to #tokenize(text) => [[token1, token2, ...], [original1, original2, ...]]\n"
end

.indexingObject



16
17
18
# File 'lib/picky/tokenizer.rb', line 16

# The default indexing tokenizer, lazily created.
#
# Either the one configured via default_indexing_with, or a
# tokenizer with default options.
def self.indexing
  @indexing ||= new
end

.searchingObject



23
24
25
# File 'lib/picky/tokenizer.rb', line 23

# The default searching tokenizer, lazily created.
#
# Either the one configured via default_searching_with, or a
# tokenizer with default options.
def self.searching
  @searching ||= new
end

Instance Method Details

#cap(words) ⇒ Object



185
186
187
# File 'lib/picky/tokenizer.rb', line 185

# Truncates the token list in place to at most @max_words entries.
# No-op unless a maximum is configured and actually exceeded.
def cap tokens
  return unless cap?(tokens)
  tokens.slice!(@max_words..-1)
end

#cap?(words) ⇒ Boolean

Returns:

  • (Boolean)


188
189
190
# File 'lib/picky/tokenizer.rb', line 188

# Truthy when a word cap is configured and the given token list
# exceeds it (falsy — nil/false — otherwise, mirroring @max_words).
def cap? tokens
  @max_words && tokens.size > @max_words
end

#case_sensitive(case_sensitive) ⇒ Object

Case sensitivity.

Note: If false, simply downcases the data/query.



172
173
174
# File 'lib/picky/tokenizer.rb', line 172

# Sets case sensitivity.
#
# Note: if false, the data/query is simply downcased.
def case_sensitive enabled
  @case_sensitive = enabled
end

#check_argument_in(method, types, argument, &condition) ⇒ Object

Checks if the right argument type has been given.



194
195
196
197
198
199
# File 'lib/picky/tokenizer.rb', line 194

# Validates that +argument+ matches one of the given +types+
# (compared via ===, so classes work as type checks).
#
# method   - the configuring method name, used in the error message.
# types    - a single type or an array of types.
# argument - the value to validate.
#
# Raises ArgumentError when no type matches; returns nil otherwise.
#
# Note: the original signature declared a &condition block parameter
# that was never used; it has been removed (callers passing a block
# are unaffected, since un-captured blocks are simply ignored).
def check_argument_in method, types, argument
  types = [*types] # accept a bare type or a list of types
  unless types.any? { |type| type === argument }
    raise ArgumentError, "Application##{method} takes any of #{types.join(', ')} as argument, but not a #{argument.class}."
  end
end

#default_optionsObject



226
227
228
229
230
231
# File 'lib/picky/tokenizer.rb', line 226

# The baseline tokenizer configuration, merged under any explicit
# options: split on whitespace and drop empty tokens.
def default_options
  {
    # Whitespace splitting unless configured otherwise.
    splits_text_on:   /\s/,
    # Empty strings never make useful tokens.
    rejects_token_if: :empty?.to_proc
  }
end

#downcase?Boolean

Returns:

  • (Boolean)


175
176
177
# File 'lib/picky/tokenizer.rb', line 175

# Whether tokens should be downcased — the inverse of the
# configured case sensitivity (defaults to true when unset).
#
# Returns:
#
#   • (Boolean)
def downcase?
  @case_sensitive ? false : true
end

#empty_tokensObject

Returns empty tokens.



291
292
293
# File 'lib/picky/tokenizer.rb', line 291

# The canonical "no tokens" result: an empty token list paired with
# an empty originals list. Fresh arrays on every call, so callers
# may mutate the result safely.
def empty_tokens
  tokens    = []
  originals = []
  [tokens, originals]
end

#max_words(amount) ⇒ Object

The maximum amount of words to pass into the search engine.



182
183
184
# File 'lib/picky/tokenizer.rb', line 182

# Caps how many words are passed into the search engine.
def max_words amount
  @max_words = amount
end

#normalize_with_patterns(text) ⇒ Object



124
125
126
127
128
129
130
131
132
133
134
# File 'lib/picky/tokenizer.rb', line 124

# Applies the configured normalization patterns to the text.
#
# Patterns are tried in order; the first one that actually replaces
# something wins (String#gsub! returns nil when nothing matched, so
# we only break after a successful replacement). The text is mutated
# in place and returned.
def normalize_with_patterns text
  replaces = @normalizes_words_regexp_replaces
  return text unless replaces # TODO Remove.

  replaces.each do |pattern, replacement|
    break if text.gsub!(pattern, replacement)
  end

  text
end

#normalize_with_patterns?Boolean

Returns:

  • (Boolean)


135
136
137
# File 'lib/picky/tokenizer.rb', line 135

# Truthy if normalization patterns have been configured.
# Note: returns the pattern list itself (or nil), not a strict
# boolean — callers use it purely for truthiness.
def normalize_with_patterns?
  @normalizes_words_regexp_replaces
end

#normalizes_words(regexp_replaces) ⇒ Object

Normalizing.

We only allow arrays.

TODO 5.0: Rename the configuration option to normalizes(config) or normalizes_words.
TODO 5.0: Rename the processing method to normalize(text) or normalize_words.



120
121
122
123
# File 'lib/picky/tokenizer.rb', line 120

# Configures word normalization with an ordered list of
# [pattern, replacement] pairs (applied by #normalize_with_patterns).
#
# Only array-like arguments (or objects implementing
# #normalize_with_patterns themselves) are accepted.
def normalizes_words regexp_replaces
  array_like = regexp_replaces.respond_to?(:to_ary) ||
               regexp_replaces.respond_to?(:normalize_with_patterns)
  raise ArgumentError.new "#{__method__} takes an Array of replaces as argument, not a #{regexp_replaces.class}." unless array_like
  @normalizes_words_regexp_replaces = regexp_replaces
end

#preprocess(text) ⇒ Object

Default preprocessing hook.

Does:

  1. Character substitution.

  2. Remove illegal expressions.

  3. Remove non-single stopwords. (Stopwords that occur with other words)



255
256
257
258
259
260
261
262
263
264
265
# File 'lib/picky/tokenizer.rb', line 255

# Default preprocessing hook: character substitution, then removal
# of illegal characters, then removal of non-single stopwords.
# Mutates the (substituted) text in place and returns it.
def preprocess text
  text = substitute_characters(text)
  remove_illegals(text)
  # Single stopwords are kept on purpose, both here and in the query
  # tokenizer (for different reasons): an indexed thing whose whole
  # name is a stopword, e.g. "UND", must not lose its name.
  remove_non_single_stopwords(text)
  text
end

#pretokenize(text) ⇒ Object

Pretokenizing.

Does:

* Split the text into words.
* Cap the amount of tokens if max_words is set.


273
274
275
276
277
278
279
# File 'lib/picky/tokenizer.rb', line 273

# Pretokenizing: splits the text into words, optionally normalizes
# each word with the configured patterns, drops rejected words, and
# caps the list when max_words is configured. Returns the word list.
def pretokenize text
  words = split(text)
  if normalize_with_patterns?
    words.collect! { |word| normalize_with_patterns word }
  end
  reject words
  cap(words) if cap?(words)
  words
end

#reject(tokens) ⇒ Object



164
165
166
# File 'lib/picky/tokenizer.rb', line 164

# Destructively removes every token matching the configured reject
# condition (see #rejects_token_if) from the given list.
def reject tokens
  tokens.reject!(&@reject_condition)
end

#rejects_token_if(condition) ⇒ Object

Reject tokens after tokenizing based on the given criteria.



161
162
163
# File 'lib/picky/tokenizer.rb', line 161

# Configures the predicate used to drop tokens after tokenizing
# (the default rejects empty tokens).
def rejects_token_if condition
  @reject_condition = condition
end

#remove_illegals(text) ⇒ Object



88
89
90
91
# File 'lib/picky/tokenizer.rb', line 88

# Strips every character matching the configured removes_characters
# regexp from the text (in place). Returns the text either way.
def remove_illegals text
  return text unless @removes_characters_regexp
  text.gsub! @removes_characters_regexp, EMPTY_STRING
  text
end

#remove_non_single_stopwords(text) ⇒ Object



73
74
75
76
77
# File 'lib/picky/tokenizer.rb', line 73

# Removes stopwords from the text unless the text consists of just a
# single (stop)word — a lone stopword is kept so that records or
# queries made up of only a stopword still work.
def remove_non_single_stopwords text
  if !@remove_stopwords_regexp || text.match(@@non_single_stopword_regexp)
    text
  else
    remove_stopwords text
  end
end

#remove_stopwords(text) ⇒ Object



68
69
70
71
# File 'lib/picky/tokenizer.rb', line 68

# Deletes all stopword matches from the text (in place) when a
# stopwords regexp is configured. Returns the text either way.
def remove_stopwords text
  return text unless @remove_stopwords_regexp
  text.gsub! @remove_stopwords_regexp, EMPTY_STRING
  text
end

#removes_characters(regexp) ⇒ Object

Illegals.

We only allow regexps (even if string would be okay too for gsub! - it’s too hard to understand)



84
85
86
87
# File 'lib/picky/tokenizer.rb', line 84

# Configures which characters get stripped from the text.
# Only a Regexp (or false to disable) is accepted — a String would
# technically work for gsub!, but is too easy to misread.
def removes_characters regexp
  check_argument_in(__method__, [Regexp, FalseClass], regexp)
  @removes_characters_regexp = regexp
end

#split(text) ⇒ Object



107
108
109
110
111
# File 'lib/picky/tokenizer.rb', line 107

# Splits the text into words using the configured splitter
# (a String, a wrapped Regexp, or anything responding to #split —
# which may avoid allocating a new string when nothing is split).
def split text
  @splits_text_on.split(text)
end

#splits_text_on(thing) ⇒ Object

Splitting.

We allow Strings, Regexps, and things that respond to #split.

Note: We do not test against to_str since symbols do not work with String#split.



99
100
101
102
103
104
105
106
# File 'lib/picky/tokenizer.rb', line 99

# Configures what the text is split on: a Regexp, or anything that
# responds to #split (Strings qualify; we deliberately do not test
# for #to_str because Symbols would not work with String#split).
#
# Regexps are wrapped so they expose a #split(text) interface.
def splits_text_on thing
  splittable = Regexp === thing || thing.respond_to?(:split)
  raise ArgumentError.new "#{__method__} takes a Regexp or a thing that responds to #split as argument, not a #{thing.class}." unless splittable
  @splits_text_on = thing.respond_to?(:split) ? thing : RegexpWrapper.new(thing)
end

#stem(text) ⇒ Object



155
156
157
# File 'lib/picky/tokenizer.rb', line 155

# Stems the given text when a stemmer is configured; otherwise
# returns the text untouched.
def stem text
  if stemmer?
    stemmer.stem text
  else
    text
  end
end

#stems_with(stemmer) ⇒ Object

Stems tokens with this stemmer.



152
153
154
# File 'lib/picky/tokenizer.rb', line 152

# Configures the stemmer: anything that responds to #stem(String).
# extract_stemmer (from API::Tokenizer::Stemmer) normalizes and
# validates the given object.
def stems_with stemmer
  @stemmer = extract_stemmer stemmer
end

#stopwords(regexp) ⇒ Object

Stopwords.

We also allow Strings, even though they are harder to understand than Regexps.



64
65
66
67
# File 'lib/picky/tokenizer.rb', line 64

# Configures the stopwords regexp. Strings are allowed as well
# (even though they are harder to understand), and false disables
# stopword removal.
def stopwords regexp
  check_argument_in(__method__, [Regexp, String, FalseClass], regexp)
  @remove_stopwords_regexp = regexp
end

#substitute_characters(text) ⇒ Object



146
147
148
# File 'lib/picky/tokenizer.rb', line 146

# Runs the configured character substituter over the text, or
# returns the text unchanged when no substituter is configured.
def substitute_characters text
  if substituter?
    substituter.substitute text
  else
    text
  end
end

#substitutes_characters_with(substituter = CharacterSubstituters::WestEuropean.new) ⇒ Object

Substitute Characters with this substituter.

Default is European Character substitution.



143
144
145
# File 'lib/picky/tokenizer.rb', line 143

# Configures the character substituter — anything responding to
# #substitute(String), normalized via extract_character_substituter.
# Default is West European diacritics substitution.
def substitutes_characters_with substituter = CharacterSubstituters::WestEuropean.new
  @substituter = extract_character_substituter substituter
end

#to_sObject



46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/picky/tokenizer.rb', line 46

def to_s
  reject_condition_location = @reject_condition.to_s[/:(\d+) \(lambda\)/, 1]
  "Removes characters: \#{@removes_characters_regexp ? \"/\#{@removes_characters_regexp.source}/\" : '-'}\nStopwords:          \#{@remove_stopwords_regexp ? \"/\#{@remove_stopwords_regexp.source}/\" : '-'}\nSplits text on:     \#{@splits_text_on.respond_to?(:source) ? \"/\#{@splits_text_on.source}/\" : (@splits_text_on ? @splits_text_on : '-')}\nNormalizes words:   \#{@normalizes_words_regexp_replaces ? @normalizes_words_regexp_replaces : '-'}\nRejects tokens?     \#{reject_condition_location ? \"Yes, see line \#{reject_condition_location} in app/application.rb\" : '-'}\nSubstitutes chars?  \#{@substituter ? \"Yes, using \#{@substituter}.\" : '-' }\nStems?              \#{@stemmer ? \"Yes, using \#{@stemmer}.\" : '-' }\nCase sensitive?     \#{@case_sensitive ? \"Yes.\" : \"-\"}\n  TOKENIZER\nend\n"

#tokenize(text) ⇒ Object

Returns a number of tokens, generated from the given text, based on the parameters given.

Returns:

[[:token1, :token2], ["Original1", "Original2"]]


239
240
241
242
243
244
245
246
# File 'lib/picky/tokenizer.rb', line 239

# Tokenizes the given text based on the configured parameters.
#
# Returns a pair of token list and originals list, e.g.
# [[:token1, :token2], ["Original1", "Original2"]] — or two empty
# lists when nothing remains after preprocessing/pretokenizing.
def tokenize text
  prepared = preprocess text.to_s
  return empty_tokens if prepared.empty? # TODO blank?
  words = pretokenize prepared
  return empty_tokens if words.empty?
  [tokens_for(words), words]
end

#tokens_for(words) ⇒ Object

Downcases.



283
284
285
286
287
# File 'lib/picky/tokenizer.rb', line 283

# Turns the prepared words into final tokens, in place:
# downcasing (unless case sensitive), then stemming when a stemmer
# is configured (usually only during the indexing step).
def tokens_for words
  if downcase?
    # downcase! mutates in place but returns nil when nothing
    # changed, so we explicitly hand back the word itself.
    words.collect! do |word|
      word.downcase!
      word
    end
  end
  words.collect! { |word| stem word } if stemmer?
  words
end