Module: Rstt
- Includes:
- Celluloid
- Defined in:
- lib/rstt.rb,
lib/rstt/version.rb,
lib/rstt/preprocess.rb,
lib/rstt/tt_settings.rb
Defined Under Namespace
Modules: Preprocess
Constant Summary collapse
- VERSION =
"0.9.4"
Class Method Summary collapse
- .build_tagging_command ⇒ Object
- .get_command_language ⇒ Object
- .installed_language_codes ⇒ Object
- .language_codes ⇒ Object
- .preprocessing ⇒ Object
-
.print ⇒ Object
output and processing helpers.
- .set_input(input = {lang: "", content: ""}) ⇒ Object
-
.tagging ⇒ Object
Methods related to the tagging stage.
Class Method Details
.build_tagging_command ⇒ Object
36 37 38 39 40 41 42 43 44 45 |
# File 'lib/rstt.rb', line 36
#
# Builds the name of the tagging command for the currently selected language.
#
# The language name is obtained from get_command_language (which raises when
# the language is unknown or has no LANGUAGES entry). When the LANGUAGES entry
# for that name has a truthy :utf8 value, the "-utf8" variant is chosen.
#
# @return [String] "tree-tagger-<lang>" or "tree-tagger-<lang>-utf8"
def self.build_tagging_command
  lang = get_command_language
  suffix = LANGUAGES[lang][:utf8] ? "-utf8" : ""
  "tree-tagger-#{lang}#{suffix}"
end
.get_command_language ⇒ Object
47 48 49 50 51 52 53 54 55 56 57 |
# File 'lib/rstt.rb', line 47
#
# Resolves the current language code (self.lang) to the language name used in
# the tagging command.
#
# The code is looked up in language_codes; the resulting name must also have a
# non-nil entry in LANGUAGES (defined outside this listing).
#
# @return [String] the language name, e.g. "english" for the code "en"
# @raise [RuntimeError] "language not supported" when the code is not a key of
#   language_codes
# @raise [RuntimeError] "language supported, but not installed" when LANGUAGES
#   has no entry for the resolved name
def self.get_command_language
  name = language_codes[self.lang.to_sym]
  raise "language not supported" if name.nil?
  raise "language supported, but not installed" if LANGUAGES[name].nil?

  name
end
.installed_language_codes ⇒ Object
82 83 84 85 86 87 88 89 |
# File 'lib/rstt.rb', line 82
#
# Returns the subset of language_codes whose language name has a non-nil entry
# in LANGUAGES.
#
# @return [Hash{Symbol => String}] code => language name, in the same order as
#   language_codes
def self.installed_language_codes
  language_codes.reject { |_code, name| LANGUAGES[name].nil? }
end
.language_codes ⇒ Object
66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
# File 'lib/rstt.rb', line 66
#
# Maps two-letter language codes to the language names used when building the
# tagging command.
#
# A new Hash is built on every call, so callers may modify the result freely.
#
# @return [Hash{Symbol => String}] code => language name
def self.language_codes
  {
    bg: "bulgarian",
    nl: "dutch",
    en: "english",
    et: "estonian",
    fr: "french",
    de: "german",
    el: "greek",
    it: "italian",
    la: "latin",
    ru: "russian",
    es: "spanish",
    sw: "swahili"
  }
end
.preprocessing ⇒ Object
59 60 61 62 63 64 |
# File 'lib/rstt.rb', line 59
#
# Remembers the current content in @@origin (the same object, not a duplicate)
# and then passes the content through the Preprocess helpers.
#
# @return [Object] whatever the last Preprocess call returns
def self.preprocessing
  @@origin = @@content
  # It is important that HTML tags are stripped first and non-word characters
  # only afterwards.
  # FIXME(review): the method name of the first call is missing from this
  # listing. As written, `Preprocess.(...)` dispatches to `Preprocess.call`.
  # Judging by the comment above it is presumably the HTML-tag-stripping
  # helper -- restore the real name from lib/rstt/preprocess.rb.
  # NOTE(review): the first call's return value is discarded, so the helpers
  # presumably modify the string in place -- confirm.
  Preprocess.(self.content)
  Preprocess.strip_punctation_and_non_word_caracters(self.content)
end
.print ⇒ Object
output and processing helpers
92 93 94 95 |
# File 'lib/rstt.rb', line 92
#
# Debug helper: writes the inspected current language and content to standard
# output, one per line, using Kernel#p.
#
# @return [Object] the current content (the return value of the last `p`)
def self.print
  p @@lang
  p @@content
end
.set_input(input = {lang: "", content: ""}) ⇒ Object
16 17 18 19 20 21 22 23 |
# File 'lib/rstt.rb', line 16
#
# Stores the language code and the text for later processing.
#
# @param input [Hash] options hash
#   - :lang    language code such as "en"; falls back to "en" when missing,
#              nil, false or blank
#   - :content the text to process
# @return [Object] the stored content
def self.set_input(input = {lang: "", content: ""})
  lang = input[:lang]
  # The default argument supplies lang: "", which is truthy in Ruby. A plain
  # truthiness check would therefore store "" instead of the intended "en"
  # fallback, so blank values are treated like a missing language.
  @@lang = lang && !lang.to_s.empty? ? lang : "en"
  @@content = input[:content]
end
.tagging ⇒ Object
Methods related to the tagging stage.
26 27 28 29 30 31 32 33 34 |
# File 'lib/rstt.rb', line 26
#
# Runs the tagging command on the current content and stores the parsed
# result in @@tagged.
#
# The command is "#{TT_HOME}/cmd/<build_tagging_command>". Its standard output
# is read as one entry per line, with tab-separated fields.
#
# @return [Array<Hash>] one hash per output line with the keys :word, :tag and
#   :stem (a value is nil when the line has fewer fields)
def self.tagging
  # Required here because the file's require list is not part of this listing.
  require "open3"

  command = "#{TT_HOME}/cmd/#{build_tagging_command}"
  # Feed the text on stdin instead of interpolating it into `echo ... |`, so
  # that quotes, `;`, `$(...)` and similar characters in the content are never
  # interpreted by the shell. The trailing newline mirrors what `echo` emitted.
  output, _status = Open3.capture2(command, stdin_data: "#{self.content}\n")

  @@tagged = output.split("\n").map do |line|
    word, tag, stem = line.split("\t")
    # use singular attribute names
    { word: word, tag: tag, stem: stem }
  end
end