Module: PristineText

Defined in:
lib/pristine_text.rb,
lib/pristine_text/version.rb

Constant Summary

VERSION =
"0.0.3"

Class Method Summary

Class Method Details

.clean(text, locale = :en, stem = true) ⇒ Object



26
27
28
29
30
31
32
33
34
35
# File 'lib/pristine_text.rb', line 26

# Normalizes +text+ into lowercase, letters-only words, optionally stemmed.
#
# HTML entities are decoded, the result is downcased for +locale+, every
# character that is neither a Unicode letter nor whitespace is removed,
# surrounding whitespace is stripped, and runs of the same whitespace
# character are collapsed to a single one.
#
# @param text [String] raw input, possibly containing HTML entities
# @param locale [Symbol, String] passed to the downcasing step and to the stemmer
# @param stem [Boolean] when truthy, the words are stemmed and re-joined
#   with single spaces; otherwise the cleaned text is returned as is
# @return [String] the cleaned (and optionally stemmed) text
def self.clean(text, locale = :en, stem = true)
  decoded = HTMLEntities.new.decode(text)
  letters_only = UnicodeUtils.downcase(decoded, locale).gsub(/[^\p{Letter}\s]+/, "")

  # Squeeze whitespace only. A bare String#squeeze collapses *every* run of
  # identical characters and would corrupt words ("hello" -> "helo").
  # The set matches the characters kept by \s in the regexp above.
  text = letters_only.strip.squeeze(" \t\r\n\f\v")

  return text unless stem

  # The call with parentheses and arguments resolves to the +stem+ method,
  # not to the boolean parameter of the same name.
  stem(text.split, locale).join(" ")
end

.pipe(text, locale) ⇒ Object



7
8
9
10
11
12
13
14
15
16
# File 'lib/pristine_text.rb', line 7

# Runs +text+ through the external +stemwords+ program and returns its output.
#
# @param text [String] data written to the program's standard input
# @param locale [Symbol, String] value given to the program's +-l+ option
# @return [String] the program's standard output, stripped of surrounding
#   whitespace (the exit status and standard error are ignored)
# @raise [LoadError] if the +stemwords+ executable cannot be found
def self.pipe(text, locale)
  # Argument-vector form: no shell is started, so +locale+ cannot smuggle in
  # extra commands. capture3 feeds stdin while draining stdout and stderr,
  # so large inputs cannot stall on a full pipe buffer.
  output, _errors, _status =
    Open3.capture3("stemwords", "-l", locale.to_s, stdin_data: text.to_s)
  output.strip
rescue Errno::ENOENT
  # Spawning failed because the executable is not on the PATH.
  raise LoadError, "cannot find stemwords, install libstemmer-tools"
end

.stem(text, locale) ⇒ Object



18
19
20
21
22
23
24
# File 'lib/pristine_text.rb', line 18

# Stems +text+ by sending it through +pipe+.
#
# @param text [Array<String>, String] words to stem; an Array is sent as one
#   newline-separated string and the result is split on newlines again
# @param locale [Symbol, String] forwarded unchanged to +pipe+
# @return [Array<String>, String, nil] an Array for Array input, a String
#   for String input, and nil for any other kind of input
def self.stem(text, locale)
  case text
  when Array
    joined = text.join("\n")
    pipe(joined, locale).split("\n")
  when String
    pipe(text, locale)
  end
end