Class: RSemantic::Parser

Inherits:
Object
  • Object
show all
Defined in:
lib/rsemantic/parser.rb

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ Parser

Returns a new instance of Parser.



6
7
8
9
10
11
12
13
14
15
16
17
18
# File 'lib/rsemantic/parser.rb', line 6

# Configures the parser from an options hash.
#
# Keys read from +options+:
#   :filter_stop_words - stored in @filter_stop_words; when truthy the
#                        stop-word list is loaded immediately
#   :stem_words        - stored in @stem_words
#   :locale            - picks the stop file "<locale>.stop"; 'en' when absent
#
# The English stop list comes from
# ftp://ftp.cs.cornell.edu/pub/smart/english.stop
# TODO: nicer way to reference stop file location?
def initialize(options = {})
  @filter_stop_words = options[:filter_stop_words]
  @stem_words        = options[:stem_words]

  # @stopwords is only assigned when filtering was requested.
  return unless @filter_stop_words

  locale = options[:locale] || 'en'
  stop_file = "#{File.dirname(__FILE__)}/../../resources/#{locale}.stop"

  # The file is split on whitespace, one set entry per token.
  @stopwords = Set.new(File.read(stop_file).split)
end

Instance Method Details

#clean(string) ⇒ Object

Normalises a string for tokenising: removes every period, collapses each run of whitespace into a single space, and lowercases the result.



26
27
28
29
30
31
# File 'lib/rsemantic/parser.rb', line 26

# Normalises +string+ for tokenising and returns the result as a new string:
# every "." is removed, each run of whitespace becomes one space, and the
# text is lowercased. The argument itself is not modified.
def clean(string)
  without_periods = string.gsub(".", "")
  without_periods.gsub(/\s+/, " ").downcase
end

#remove_stop_words(list) ⇒ Object

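Stop words are common words that have no search value. When stop-word filtering is enabled, this returns the list without them; otherwise the list is returned unchanged.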



34
35
36
37
38
39
40
# File 'lib/rsemantic/parser.rb', line 34

# Drops stop words from +list+.
#
# When @filter_stop_words is falsy, +list+ itself is returned untouched.
# Otherwise the result contains only the words that @stopwords does not
# include.
def remove_stop_words(list)
  return list unless @filter_stop_words

  list.reject { |candidate| @stopwords.include?(candidate) }
end

#tokenise_and_filter(string) ⇒ Object



20
21
22
23
# File 'lib/rsemantic/parser.rb', line 20

# Tokenises +string+ with tokenise_and_stem, then hands the resulting word
# list to remove_stop_words and returns whatever that produces.
def tokenise_and_filter(string)
  remove_stop_words(tokenise_and_stem(string))
end

#tokenise_and_stem(string) ⇒ Object



42
43
44
45
46
47
48
49
50
51
# File 'lib/rsemantic/parser.rb', line 42

# Splits +string+ into words after passing it through clean.
#
# When @stem_words is truthy, each word is replaced by the result of calling
# #stem on it (String#stem is not defined here; presumably supplied by a
# stemmer library - confirm against the file's requires). Otherwise the
# words are returned as split.
def tokenise_and_stem(string)
  tokens = clean(string).split(" ")
  return tokens unless @stem_words

  tokens.map { |token| token.stem }
end