Module: Rstt

Includes:
Celluloid
Defined in:
lib/rstt.rb,
lib/rstt/version.rb,
lib/rstt/preprocess.rb,
lib/rstt/tt_settings.rb

Defined Under Namespace

Modules: Preprocess

Constant Summary

VERSION =
"0.9.4"

Class Method Summary

Class Method Details

.build_tagging_command ⇒ Object



36
37
38
39
40
41
42
43
44
45
# File 'lib/rstt.rb', line 36

# Builds the name of the TreeTagger command script for the current language.
#
# The name is "tree-tagger-<language>", with "-utf8" appended when the
# LANGUAGES entry for that language has a truthy :utf8 value.
#
# @return [String] the command script name
# @raise [RuntimeError] propagated from get_command_language
def self.build_tagging_command
  language = get_command_language
  suffix = LANGUAGES[language][:utf8] ? "-utf8" : ""

  "tree-tagger-#{language}#{suffix}"
end

.get_command_language ⇒ Object



47
48
49
50
51
52
53
54
55
56
57
# File 'lib/rstt.rb', line 47

# Resolves the configured language code to the language name used in
# LANGUAGES and in the tagger command names.
#
# @return [String] the language name from language_codes
# @raise [RuntimeError] "language not supported" when the code is not a key of
#   language_codes
# @raise [RuntimeError] "language supported, but not installed" when the name
#   has no entry in LANGUAGES
def self.get_command_language
  language_name = language_codes[self.lang.to_sym]

  raise "language not supported" if language_name.nil?
  raise "language supported, but not installed" if LANGUAGES[language_name].nil?

  language_name
end

.installed_language_codes ⇒ Object



82
83
84
85
86
87
88
89
# File 'lib/rstt.rb', line 82

# Returns the subset of language_codes whose language name has a non-nil
# entry in LANGUAGES.
#
# @return [Hash] language code => language name, in language_codes order
def self.installed_language_codes
  language_codes.each_with_object({}) do |(code, name), available|
    next if LANGUAGES[name].nil?

    available[code] = name
  end
end

.language_codes ⇒ Object



66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# File 'lib/rstt.rb', line 66

# Maps two-letter language codes (symbols) to the language names used to look
# up LANGUAGES and to build the tagger command name.
#
# A new Hash is built on every call.
#
# @return [Hash{Symbol => String}]
def self.language_codes
  {
    :bg => "bulgarian",
    :nl => "dutch",
    :en => "english",
    :et => "estonian",
    :fr => "french",
    :de => "german",
    :el => "greek",
    :it => "italian",
    :la => "latin",
    :ru => "russian",
    :es => "spanish",
    :sw => "swahili"
  }
end

.preprocessing ⇒ Object



59
60
61
62
63
64
# File 'lib/rstt.rb', line 59

# Records the current content in @@origin, then runs the two Preprocess
# clean-up steps on self.content. Returns whatever the last Preprocess call
# returns.
#
# NOTE(review): @@origin is assigned the same object as @@content, not a copy.
# The result of strip_html_tags is discarded, which suggests the helpers
# modify their argument in place (TODO confirm against Preprocess). If so,
# @@origin changes together with the content and no untouched original is kept.
def self.preprocessing
  @@origin = @@content
  # Order matters: HTML tags have to be stripped first, and only then
  # punctuation and other non-word characters (presumably because removing
  # punctuation first would destroy the tag delimiters - confirm).
  Preprocess.strip_html_tags(self.content)
  Preprocess.strip_punctation_and_non_word_caracters(self.content)
end

.print ⇒ Object

output and processing helpers



92
93
94
95
# File 'lib/rstt.rb', line 92

# Debug helper: writes the inspected @@lang and @@content to standard output,
# one per line.
#
# Because it is defined on self under the name `print`, it takes precedence
# over Kernel#print for calls made on this object.
#
# @return [Object] the current @@content (the value returned by the last `p`)
def self.print
  p(@@lang)
  p(@@content)
end

.set_input(input = {lang: "", content: ""}) ⇒ Object



16
17
18
19
20
21
22
23
# File 'lib/rstt.rb', line 16

# Stores the language code and the text in the class variables @@lang and
# @@content.
#
# @param input [Hash] input options
# @option input [String, Symbol, nil] :lang language code; "en" is used when
#   the value is nil, false or empty
# @option input [Object] :content the text to store (nil when the key is absent)
# @return [Object] the stored content
def self.set_input(input = {lang: "", content: ""})
  lang = input[:lang]
  # An empty string is truthy in Ruby, so it has to be tested explicitly;
  # otherwise the default argument (lang: "") would skip the "en" fallback
  # and leave an unusable empty language code behind.
  missing = !lang || lang.to_s.empty?

  @@lang = missing ? "en" : lang
  @@content = input[:content]
end

.tagging ⇒ Object

tagging stage related methods



26
27
28
29
30
31
32
33
34
# File 'lib/rstt.rb', line 26

# Pipes self.content through the TreeTagger command for the configured
# language and stores the parsed output in @@tagged.
#
# The text is shell-escaped before it is placed on the command line, so
# quotes, newlines, `;`, `$(...)`, globs and similar characters in the content
# reach the tagger as data instead of being interpreted by the shell.
#
# @return [Array<Hash>] one hash per output line with the keys :word, :tag and
#   :stem, taken from the first three tab-separated fields (nil when a field
#   is missing)
# @raise [RuntimeError] propagated from get_command_language via
#   build_tagging_command
def self.tagging
  # Loaded here so the method does not rely on a file-level require.
  require "shellwords"

  text = Shellwords.escape(self.content.to_s)
  # printf '%s\n' emits the text verbatim plus a newline; unlike echo it never
  # treats the text as options or backslash escapes.
  raw = `printf '%s\\n' #{text} | #{TT_HOME}/cmd/#{build_tagging_command}`

  @@tagged = raw.split("\n").map do |line|
    word, tag, stem = line.split("\t")
    # use singular attribute names
    {word: word, tag: tag, stem: stem}
  end
end