Class: XWS

Inherits:
Object
  • Object
show all
Defined in:
lib/xws.rb

Instance Method Summary collapse

Constructor Details

#initialize(ignore_elements: %i(pre code time)) ⇒ XWS

Returns a new instance of XWS.



10
11
12
13
14
# File 'lib/xws.rb', line 10

# Sets up the element names and stop words used when scanning.
#
# @param ignore_elements [Array<Symbol>] element names whose text #scan
#   leaves out of the word count (defaults to pre, code and time)
def initialize(ignore_elements: %i(pre code time))
  @ignore_elements = ignore_elements
  # Words that #words strips out of the text before it is counted.
  @ignorewords = %i(the and or)
end

Instance Method Details

#scan(node) ⇒ Object



16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# File 'lib/xws.rb', line 16

# Counts how often each word occurs in the text found beneath +node+.
#
# Every element yielded by node.each_recursive contributes its stripped,
# non-empty texts, unless its name is one of @ignore_elements. The collected
# text is normalised by #words, split on whitespace and tallied.
#
# @param node [#each_recursive] a document node whose yielded elements
#   respond to #name and #texts (texts must respond to #strip)
# @return [Hash{String => Integer}] UTF-8 encoded word => number of
#   occurrences, in order of first appearance
def scan(node)
  # Anchored to the whole name: an unanchored pattern would also skip
  # elements that merely contain an ignored name (e.g. "timeline" for "time"),
  # and an empty ignore list would produce // and skip every element.
  ignored = /\A(?:#{@ignore_elements.join('|')})\z/
  fragments = []

  node.each_recursive do |element|
    next if element.name[ignored]

    fragments.concat(element.texts.map(&:strip).reject(&:empty?))
  end

  grouped = words(fragments.join(' ')).split.group_by(&:to_s)

  grouped.each_with_object({}) do |(word, hits), counts|
    # Copy before re-tagging so the source string's encoding is left alone.
    counts[String.new(word).force_encoding('utf-8')] = hits.length
  end
end

#words(s) ⇒ Object



33
34
35
36
37
38
39
40
41
42
# File 'lib/xws.rb', line 33

# Normalises raw text into a lower-case, space-separated run of words.
#
# The result still contains whatever whitespace the removals leave behind;
# callers are expected to split it.
#
# @param s [String] the text to clean up
# @return [String] the cleaned text
def words(s)
  lowered = s.downcase

  # Drop whole words that contain an apostrophe (e.g. "don't").
  without_contractions = lowered.gsub(/\w+'\w+/, '')

  # Strip any remaining single or double quotation marks.
  unquoted = without_contractions.gsub(/["']/, '')

  # Where a word character touches a run of characters other than a-z/space
  # (the leading form also leaves '#' alone), keep only the word character.
  trimmed = unquoted.gsub(/(\w)[^a-z ]+\B|\B[^a-z #]+(\w)/, '\1\2')

  # Collapse any lone character sitting between two whitespace characters.
  without_singles = trimmed.gsub(/\s.\s/, ' ')

  # Finally remove the configured stop words.
  without_singles.gsub(/\b(?:#{@ignorewords.join('|')})\b/, '')
end