Class: MagickColumns::Tokenizer

Inherits:
Object
Defined in:
lib/magick_columns/tokenizer.rb

Instance Method Summary

  #clean_query ⇒ Object
  #extract_terms ⇒ Object
  #split_term_in_terms(term) ⇒ Object

Constructor Details

#initialize(query = '') ⇒ Tokenizer

Returns a new instance of Tokenizer.



# File 'lib/magick_columns/tokenizer.rb', line 3

def initialize(query = '')
  @query = query
end
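
A minimal usage sketch (the query string is a made-up example; with the gem loaded, the constructor simply stores the raw query for later processing):

tokenizer = MagickColumns::Tokenizer.new('john doe or jane')
tokenizer.extract_terms # parse the stored query; see #extract_terms below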

Instance Method Details

#clean_query ⇒ Object



# File 'lib/magick_columns/tokenizer.rb', line 27

def clean_query
  @query.strip
    .gsub(%r{\A(\s*(#{MagickColumns.and_operators})\s+)+}, '')
    .gsub(%r{(\s+(#{MagickColumns.and_operators})\s*)+\z}, '')
    .gsub(%r{\A(\s*(#{MagickColumns.or_operators})\s+)+}, '')
    .gsub(%r{(\s+(#{MagickColumns.or_operators})\s*)+\z}, '')
end
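
For illustration, assuming the plain words "and" and "or" are among the configured MagickColumns.and_operators and MagickColumns.or_operators, leading and trailing operators are stripped while interior ones are preserved (the query below is hypothetical):

MagickColumns::Tokenizer.new('and  john or jane or ').clean_query
# => "john or jane"
# The leading "and" and trailing "or" are removed; the interior "or" stays.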

#extract_terms ⇒ Object



# File 'lib/magick_columns/tokenizer.rb', line 7

def extract_terms
  terms = []

  clean_query.split(%r{\s+(#{MagickColumns.or_operators})\s+}).each do |o_t|
    unless o_t =~ %r{\A(#{MagickColumns.or_operators})\z}
      and_terms = []
      
      o_t.split(%r{\s+(#{MagickColumns.and_operators})\s+}).each do |t|
        unless t =~ %r{\A(#{MagickColumns.and_operators})\z}
          and_terms.concat split_term_in_terms(t)
        end
      end
      
      terms << and_terms unless and_terms.empty?
    end
  end

  terms.reject(&:empty?)
end
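
A sketch of the resulting structure, again assuming "or" is among the configured OR operators and that no replacement or tokenize rule matches these plain words: the outer array holds the OR groups, and each inner array holds the AND-ed term hashes.

MagickColumns::Tokenizer.new('john doe or jane').extract_terms
# => [[{ term: 'john' }, { term: 'doe' }], [{ term: 'jane' }]]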

#split_term_in_terms(term) ⇒ Object



# File 'lib/magick_columns/tokenizer.rb', line 35

def split_term_in_terms(term)
  term_copy = term.dup
  terms = []
  
  MagickColumns.replacement_rules.each do |rule, options|
    pattern = options[:pattern].respond_to?(:call) ?
      options[:pattern].call : options[:pattern]
    
    while(match = term_copy.match(pattern))
      term_copy.sub!(pattern, options[:replacement].call(match))
    end
  end
  
  MagickColumns.tokenize_rules.each do |rule, options|
    pattern = options[:pattern].respond_to?(:call) ?
      options[:pattern].call : options[:pattern]
    
    while(match = term_copy.match(pattern))
      terms << options[:tokenizer].call(match)
      
      term_copy.sub!(pattern, '')
    end
  end
  
  terms + term_copy.strip.split(/\s+/).map { |t| { term: t } }
end
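
Rules that match emit whatever their :tokenizer lambdas build and remove the matched text; anything left over falls through the last line, which splits on whitespace and wraps each word in a { term: ... } hash. A hedged sketch of that fallback path, assuming the configured rules leave these plain words untouched:

MagickColumns::Tokenizer.new.split_term_in_terms('john doe')
# => [{ term: 'john' }, { term: 'doe' }]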