Module: BookClean::Publisher

Defined in:
lib/bookclean.rb

Class Method Summary collapse

Class Method Details

.clean(str, lang = :pt) ⇒ Object



6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# File 'lib/bookclean.rb', line 6

# Normalizes a publisher name for display/comparison.
#
# The name is trimmed, downcased, stripped of company-form suffixes
# ("ltda", "s.a.", ...), of a leading or trailing standalone "editora" and
# of " ed." / " ed" / " -" markers, then title-cased with the prepositions
# "da", "de" and "do" kept in lower case.
#
# @param str [String, nil] raw publisher name
# @param lang [Symbol] language code; accepted but not consulted by this method
# @return [String, nil] the cleaned name, or nil when +str+ is nil
def self.clean(str, lang=:pt)
  return str if str.nil?

  str = UnicodeUtils.downcase(str.strip)
  str.gsub!(/\s+/, ' ')

  # Trailing company-form markers, removed in this order. Whitespace was
  # collapsed above, so the string holds no newline and `$` can only match
  # at the very end.
  [
    /\s*ltda.?$/,
    /\slv$/,
    /\ss\.a\.$/,
    /\ss\.a$/,
    /\ssa\.$/,
    /\ss\.\sa\.$/,
    /\ss\.\sa$/
  ].each { |suffix| str.sub!(suffix, '') }

  # Drop a leading "editora" only when it is a whole word, so names such as
  # "editorama" are left intact. "editora da mente" (editora followed by a
  # two-character word) keeps its "editora".
  str.sub!(/^editora(?=\s|$)/, '') unless str =~ /^editora\s+..\s+/
  # Drop a trailing whole-word "editora" together with the space before it,
  # so the end-anchored rules below still see the real end of the name.
  str.sub!(/(?:^|\s)editora$/, '')

  str.gsub!('&', ' & ')
  str.gsub!(/\s+/, ' ')
  # Repair a mis-encoded "á". Only the lower-case form can occur here because
  # the string has already been downcased.
  str.gsub!('ã?', 'á')
  str.gsub!(' ed.', '')
  str.sub!(/\sed$/, '')
  str.sub!(/\s-$/, '')

  # TODO: split the name into tokens, match each token against a dictionary
  # of accented words, and join the tokens back with single spaces.
  str = UnicodeUtils.titlecase(str)

  # Title-casing capitalizes every word; put these prepositions back in
  # lower case. TODO: cover further prepositions and conjunctions.
  str.gsub!(" Da ", " da ")
  str.gsub!(" De ", " de ")
  str.gsub!(" Do ", " do ")
  # Restore the tilde missing from common "-çao" / "-çoes" endings.
  str.gsub!("çao", "ção")
  str.gsub!("çoes", "ções")

  str.strip
end