Class: TreeTagger::Tagger
- Inherits:
-
Object
- Object
- TreeTagger::Tagger
- Defined in:
- lib/tree_tagger/tagger.rb
Overview
Class comment
Constant Summary collapse
- BEGIN_MARKER =
'<BEGIN_OF_THE_TT_INPUT>'
- END_MARKER =
'<END_OF_THE_TT_INPUT>'
- FLUSH_SENTENCE =
TT seems to hold only the last three tokens in the buffer. The flushing sentence can be shortened down to this size.
"Das\nist\nein\nTestsatz\n,\num\ndas\nStossen\nder\nDaten\nsicherzustellen\n."
Instance Method Summary collapse
-
#convert(input) ⇒ Object
private
Convert token arrays to delimited strings.
-
#flush ⇒ Object
Get the rest of the text back.
-
#get_output ⇒ Object
Get processed tokens back.
-
#initialize(opts = { :binary => nil, :model => nil, :lexicon => nil, :options => '-token -lemma -sgml -quiet', :replace_blanks => true, :blank_tag => '<BLANK>', :lookup => false }) ⇒ Tagger
constructor
Initializer comment.
-
#new_pipe ⇒ Object
private
This method may be utilized to keep the TT process alive.
-
#new_reader ⇒ Object
private
Starts the reader thread.
-
#process(input) ⇒ Object
Send the string to the TreeTagger.
- #sanitize(str) ⇒ Object private
-
#validate_options(opts) ⇒ Object
private
Return the options hash after validation.
Constructor Details
#initialize(opts = { :binary => nil, :model => nil, :lexicon => nil, :options => '-token -lemma -sgml -quiet', :replace_blanks => true, :blank_tag => '<BLANK>', :lookup => false }) ⇒ Tagger
Initializer comment
29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
# File 'lib/tree_tagger/tagger.rb', line 29

# Build a tagger: validate the options, assemble the command line, open the
# pipe to the external process and start the reader thread.
#
# @param opts [Hash] configuration hash
# @option opts [String, nil] :binary path to the TreeTagger binary
# @option opts [String, nil] :model path to the model file
# @option opts [String, nil] :lexicon path to the lexicon file
# @option opts [String] :options command line switches for the binary
# @option opts [Boolean] :replace_blanks
# @option opts [String] :blank_tag replacement used for blank tokens
# @option opts [Boolean] :lookup
def initialize(opts = { :binary => nil,
                        :model => nil,
                        :lexicon => nil,
                        :options => '-token -lemma -sgml -quiet',
                        :replace_blanks => true,
                        :blank_tag => '<BLANK>',
                        :lookup => false })
  @opts = validate_options(opts)
  @blank_tag = @opts[:blank_tag]
  @cmdline = "#{@opts[:binary]} #{@opts[:options]} #{@opts[:model]}"

  # All state shared with the reader thread is set up BEFORE the thread is
  # started, so the thread can never observe a missing mutex or have its
  # <@inside_output> flag overwritten by a late assignment.
  @queue = Queue.new
  @mutex = Mutex.new
  @queue_mutex = Mutex.new
  @inside_output = false
  @inside_input = false
  @enqueued_tokens = 0

  @pipe = new_pipe
  @pipe.sync = true
  @reader = new_reader
  # sleep(1) # Don't know if it's useful, no problems before.
end
Instance Method Details
#convert(input) ⇒ Object (private)
Convert token arrays to delimited strings.
191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 |
# File 'lib/tree_tagger/tagger.rb', line 191

# Convert token arrays to newline-delimited strings.
#
# A String is returned unchanged. Every element of an Array is passed
# through <sanitize> and the results are joined with "\n".
#
# @param input [String, Array<String>] a single string or an array of tokens
# @return [String] the string to send to the tagger
# @raise [UserError] if the input is neither a String nor an Array, if it is
#   empty, or if an array element is not a String
def convert(input)
  unless input.is_a?(Array) || input.is_a?(String)
    fail UserError, "Not a valid input format: <#{input.class}>!"
  end

  fail UserError, "Empty input is not allowed!" if input.empty?

  return input unless input.is_a?(Array)

  # <map> keeps the sanitized value of every element; assigning to the block
  # variable inside <each> would silently discard it.
  tokens = input.map do |el|
    fail UserError, "Input elements should be strings!" unless el.is_a?(String)

    sanitize(el)
  end
  tokens.join("\n")
end
#flush ⇒ Object
Get the rest of the text back. TT holds some meaningful parts in the buffer.
100 101 102 103 104 105 106 107 |
# File 'lib/tree_tagger/tagger.rb', line 100

# Get the rest of the text back.
#
# Marks the current input as finished and writes the end marker followed by
# the flushing sentence to the pipe.
def flush
  @inside_input = false
  @pipe.print("#{END_MARKER}\n#{FLUSH_SENTENCE}\n")
  # Here invoke the reader thread to ensure
  # all output has been read.
  #@reader.run
end
#get_output ⇒ Object
Get processed tokens back. This method is not blocking. If some tokens have been sent, but not received from the pipe yet, it returns an empty array. If all sent tokens are in the queue it returns all of them. If no more tokens are awaited it returns <nil>.
78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 |
# File 'lib/tree_tagger/tagger.rb', line 78

# Get processed tokens back without blocking.
#
# Drains everything currently in the queue and lowers the counter of
# awaited tokens by the number of drained entries.
#
# @return [Array<String>, nil] the drained tokens; an empty array if tokens
#   are still awaited but none has arrived yet; nil if nothing was drained
#   and no more tokens are awaited
def get_output
  output = []
  @queue_mutex.synchronize do
    @queue.size.times { output << @queue.shift }
  end

  # Take the remaining count inside the same critical section that updates
  # it, so the decision below is based on a consistent value.
  remaining = @mutex.synchronize { @enqueued_tokens -= output.size }

  # Nil if nothing to process in the pipe.
  # Possible only after flushing the pipe.
  return output if remaining > 0 || output.any?

  nil
end
#new_pipe ⇒ Object (private)
This method may be utilized to keep the TT process alive. Check here if TT returns the exit code 1 in case of invalid options.
186 187 188 |
# File 'lib/tree_tagger/tagger.rb', line 186

# Open a bidirectional pipe to the process described by <@cmdline>.
#
# @return [IO] the pipe, opened for reading and writing
def new_pipe
  mode = 'r+'
  IO.popen(@cmdline, mode)
end
#new_reader ⇒ Object (private)
Starts the reader thread.
162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 |
# File 'lib/tree_tagger/tagger.rb', line 162

# Starts the reader thread.
#
# The thread reads the pipe line by line until <gets> returns nil. Lines
# between the begin marker and the end marker are pushed onto the queue;
# the markers themselves and any lines outside of them are dropped.
#
# @return [Thread] the reader thread
def new_reader
  Thread.new do
    while (line = @pipe.gets)
      # The output strings must not contain "\n".
      line.chomp!
      if line == BEGIN_MARKER
        @inside_output = true
        $stderr.puts 'Found the begin marker.' if $DEBUG
      elsif line == END_MARKER
        @inside_output = false
        $stderr.puts 'Found the end marker.' if $DEBUG
      elsif @inside_output
        @queue_mutex.synchronize { @queue << line }
        $stderr.puts "<#{line}> added to the queue." if $DEBUG
      end
    end
  end
end
#process(input) ⇒ Object
Send the string to the TreeTagger.
57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
# File 'lib/tree_tagger/tagger.rb', line 57

# Send the string to the TreeTagger.
#
# The input is converted and sanitized, prefixed with the begin marker if it
# opens a new text, and written to the pipe. The counter of awaited tokens
# grows by the number of lines sent.
#
# @param input [String, Array<String>] a token, or an array of tokens
# @return [nil] the result of writing to the pipe
def process(input)
  str = sanitize(convert(input))

  # One token per line. Counting lines instead of calls keeps the counter in
  # step with what <get_output> subtracts when several tokens are sent at
  # once. NOTE(review): assumes the tagger emits exactly one output line per
  # input line - confirm for inputs containing blank lines.
  token_count = str.count("\n") + 1

  # Mark the beginning of the text.
  payload =
    if @inside_input
      "#{str}\n"
    else
      @inside_input = true
      "#{BEGIN_MARKER}\n#{str}\n"
    end

  @mutex.synchronize { @enqueued_tokens += token_count }
  @pipe.print(payload)
end
#sanitize(str) ⇒ Object (private)
213 214 215 216 217 218 219 220 |
# File 'lib/tree_tagger/tagger.rb', line 213

# Strip surrounding whitespace from a token.
#
# @param str [String] the raw token
# @return [String] the stripped token, or <@blank_tag> if nothing is left
def sanitize(str)
  stripped = str.strip
  stripped.empty? ? @blank_tag : stripped
end
#validate_options(opts) ⇒ Object (private)
Return the options hash after validation.
{
:binary => nil,
:model => nil,
:lexicon => nil,
:options => '-token -lemma -sgml -quiet',
:replace_blanks => true,
:blank_tag => '<BLANK>',
:lookup => false
}
120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 |
# File 'lib/tree_tagger/tagger.rb', line 120

# Return the options hash after validation.
#
# Missing <:binary> and <:model> values are taken from the environment
# variables TREETAGGER_BINARY and TREETAGGER_MODEL. If <:lookup> is set and
# <:lexicon> is missing, the lexicon path is taken from TREETAGGER_LEXICON.
# The hash passed in is modified in place.
#
# @param opts [Hash] the options to validate
# @return [Hash] the same hash with the missing paths filled in
# @raise [UserError] if a required value is neither given nor available
#   from the environment
def validate_options(opts)
  # TODO: Check if <:lookup> is boolean.
  # TODO: Check if <:replace_blanks> is boolean.
  # TODO: Check if <:options> is a string.
  # TODO: Check if <:options> contains only allowed values.
  # TODO: Ensure that <:options> contains <-sgml>.
  # TODO: Check if <:blank_tag> is a string.
  # TODO: Ensure that <:blank_tag> is a valid SGML sequence.

  # Set the model and binary paths if not provided.
  [:binary, :model].each do |key|
    next unless opts[key].nil?

    opts[key] = ENV.fetch("TREETAGGER_#{key.to_s.upcase}") do |missing|
      fail UserError, "Provide a value for <:#{key}>" \
                      " or set the environment variable <#{missing}>!"
    end
  end

  # Set the lexicon path if not provided but requested. The value belongs in
  # <:lexicon>; <:lookup> stays the flag that requested it.
  if opts[:lookup] && opts[:lexicon].nil?
    opts[:lexicon] = ENV.fetch('TREETAGGER_LEXICON') do
      fail UserError, 'Provide a value for <:lexicon>' \
                      ' or set the environment variable <TREETAGGER_LEXICON>!'
    end
  end

  # TODO: Check for existence and readability of external files:
  # * binary;
  # * model;
  # * lexicon (if applicable).

  opts
end