Class: TreeTagger::Tagger

Inherits:
Object
  • Object
show all
Defined in:
lib/tree_tagger/tagger.rb

Overview

Class comment

Constant Summary collapse

BEGIN_MARKER =
'<BEGIN_OF_THE_TT_INPUT>'
END_MARKER =
'<END_OF_THE_TT_INPUT>'
FLUSH_SENTENCE =

TT seems to hold only the last three tokens in the buffer. The flushing sentence can be shortened down to this size.

"Das\nist\nein\nTestsatz\n,\num\ndas\nStossen\nder\nDaten\nsicherzustellen\n."

Instance Method Summary collapse

Constructor Details

#initialize(opts = { :binary => nil, :model => nil, :lexicon => nil, :options => '-token -lemma -sgml -quiet', :replace_blanks => true, :blank_tag => '<BLANK>', :lookup => false }) ⇒ Tagger

Initializer comment



29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# File 'lib/tree_tagger/tagger.rb', line 29

# Set up the tagger: validate the options, build the command line,
# open the pipe to the external process and start the reader thread.
#
# The given options are merged over the defaults, so a partial hash
# (e.g. only <:binary> and <:model>) keeps the default <:options>,
# <:blank_tag> etc. instead of leaving them <nil>. The caller's hash
# is not modified.
#
# @param opts [Hash] tagger settings; recognized keys are <:binary>,
#   <:model>, <:lexicon>, <:options>, <:replace_blanks>, <:blank_tag>
#   and <:lookup>.
# @raise [UserError] if <validate_options> rejects the settings.
def initialize(opts = {})
  defaults = {
    :binary => nil,
    :model => nil,
    :lexicon => nil,
    :options => '-token -lemma -sgml -quiet',
    :replace_blanks => true,
    :blank_tag => '<BLANK>',
    :lookup => false
  }

  @opts = validate_options(defaults.merge(opts))
  @blank_tag = @opts[:blank_tag]
  @cmdline = "#{@opts[:binary]} #{@opts[:options]} #{@opts[:model]}"

  # Initialize all state used by the reader thread before the thread
  # is started, so it never observes unset instance variables.
  @queue = Queue.new
  @mutex = Mutex.new
  @queue_mutex = Mutex.new
  @inside_output = false
  @inside_input = false
  @enqueued_tokens = 0

  @pipe = new_pipe
  @pipe.sync = true
  @reader = new_reader
  # sleep(1) # Don't know if it's useful, no problems before.
end

Instance Method Details

#convert(input) ⇒ Object (private)

Convert token arrays to delimited strings.



191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
# File 'lib/tree_tagger/tagger.rb', line 191

# Convert the input into a newline delimited string of tokens.
#
# A String is returned unchanged. For an Array every element is passed
# through <sanitize> and the results are joined with "\n". The input
# array itself is not modified.
#
# @param input [String, Array<String>] a string or an array of tokens.
# @return [String] the tokens, one per line.
# @raise [UserError] if the input is neither a String nor an Array,
#   if it is empty, or if an array element is not a String.
def convert(input)
  unless input.is_a?(Array) || input.is_a?(String)
    fail UserError, "Not a valid input format: <#{input.class}>!"
  end

  fail UserError, "Empty input is not allowed!" if input.empty?

  return input if input.is_a?(String)

  tokens = input.map do |el|
    unless el.is_a?(String)
      fail UserError, "Input elements should be strings!"
    end
    # Keep the sanitized value; assigning to the block variable inside
    # <each> would silently discard it.
    sanitize(el)
  end

  tokens.join("\n")
end

#flush ⇒ Object

Get the rest of the text back. TT holds some meaningful parts in the buffer.



100
101
102
103
104
105
106
107
# File 'lib/tree_tagger/tagger.rb', line 100

# Mark the end of the current input and push the flush sentence
# through the pipe so that the tokens still buffered by the external
# process are emitted.
#
# The next call to <process> starts a new input, because
# <@inside_input> is reset here.
def flush
  @inside_input = false
  # END_MARKER line, FLUSH_SENTENCE line(s) and a trailing newline.
  payload = [END_MARKER, FLUSH_SENTENCE, ''].join("\n")
  # Here invoke the reader thread to ensure
  # all output has been read.
  #@reader.run
  @pipe.print(payload)
end

#get_output ⇒ Object

Get processed tokens back. This method is not blocking. If some tokens have been sent, but not received from the pipe yet, it returns an empty array. If all sent tokens are in the queue it returns all of them. If no more tokens are awaited it returns <nil>.



78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# File 'lib/tree_tagger/tagger.rb', line 78

# Fetch the lines collected so far by the reader thread without
# blocking.
#
# Everything currently in the queue is removed and the counter of
# awaited tokens is reduced by the number of removed lines.
#
# @return [Array<String>, nil] the removed lines (possibly empty)
#   while the counter is still positive; otherwise the removed lines
#   if there are any, else <nil>.
def get_output
  drained = @queue_mutex.synchronize do
    Array.new(@queue.size) { @queue.shift }
  end
  @mutex.synchronize { @enqueued_tokens -= drained.size }

  # Tokens are still awaited: hand back what we have, even if empty.
  return drained if @enqueued_tokens > 0

  # Nothing more is awaited (possible only after flushing the pipe):
  # <nil> signals that there is nothing left to return.
  drained.any? ? drained : nil
end

#new_pipe ⇒ Object (private)

This method may be utilized to keep the TT process alive. Check here if TT returns the exit code 1 in case of invalid options.



186
187
188
# File 'lib/tree_tagger/tagger.rb', line 186

# Launch the command stored in <@cmdline> and return the IO object
# connected to it. The pipe is opened for both reading and writing.
def new_pipe
  mode = 'r+'
  IO.popen(@cmdline, mode)
end

#new_reader ⇒ Object (private)

Starts the reader thread.



162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
# File 'lib/tree_tagger/tagger.rb', line 162

# Start the thread that consumes the pipe line by line.
#
# Lines between BEGIN_MARKER and END_MARKER are pushed to <@queue>
# (without the trailing newline); the marker lines themselves and
# everything outside of them are dropped. The thread ends when the
# pipe returns no more lines.
#
# @return [Thread] the started reader thread.
def new_reader
  Thread.new do
    until (raw = @pipe.gets).nil?
      # The output strings must not contain "\n".
      token = raw.chomp
      if token == BEGIN_MARKER
        @inside_output = true
        $stderr.puts 'Found the begin marker.' if $DEBUG
      elsif token == END_MARKER
        @inside_output = false
        $stderr.puts 'Found the end marker.' if $DEBUG
      elsif @inside_output
        @queue_mutex.synchronize { @queue << token }
        $stderr.puts "<#{token}> added to the queue." if $DEBUG
      end
    end
  end
end

#process(input) ⇒ Object

Send the string to the TreeTagger.



57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# File 'lib/tree_tagger/tagger.rb', line 57

# Send the input to the TreeTagger.
#
# The input is converted to a newline delimited string and sanitized.
# The first chunk after construction or after <flush> is prefixed with
# BEGIN_MARKER. The counter of awaited tokens is increased by the
# number of lines sent, which matches <get_output> decreasing it by
# one per received line.
#
# @param input [String, Array<String>] a string or an array of tokens.
# @raise [UserError] if <convert> rejects the input.
def process(input)
  str = sanitize(convert(input))

  # One token per line.
  # NOTE(review): assumes the external process emits exactly one output
  # line per input line - confirm for inputs with blank inner lines.
  token_count = str.each_line.count

  # Mark the beginning of the text.
  unless @inside_input
    str = "#{BEGIN_MARKER}\n#{str}"
    @inside_input = true
  end

  @mutex.synchronize { @enqueued_tokens += token_count }
  @pipe.print("#{str}\n")
end

#sanitize(str) ⇒ Object (private)



213
214
215
216
217
218
219
220
# File 'lib/tree_tagger/tagger.rb', line 213

# Strip surrounding whitespace from the string. A string that is empty
# after stripping is replaced by the configured blank tag.
#
# @param str [String] the raw string.
# @return [String] the stripped string or <@blank_tag>.
def sanitize(str)
  stripped = str.strip
  stripped.empty? ? @blank_tag : stripped
end

#validate_options(opts) ⇒ Object (private)

Return the options hash after validation.

{
  :binary => nil,
  :model => nil,
  :lexicon => nil,
  :options => '-token -lemma -sgml -quiet',
  :replace_blanks => true,
  :blank_tag => '<BLANK>',
  :lookup => false
}


120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# File 'lib/tree_tagger/tagger.rb', line 120

# Validate the options hash and fill in missing paths from the
# environment.
#
# <:binary> and <:model> fall back to the environment variables
# TREETAGGER_BINARY and TREETAGGER_MODEL. If <:lookup> is set and no
# <:lexicon> is given, the lexicon path is taken from
# TREETAGGER_LEXICON. The given hash is modified in place.
#
# @param opts [Hash] the options to validate.
# @return [Hash] the same hash with the missing paths filled in.
# @raise [UserError] if a required path is neither given nor set in
#   the environment.
def validate_options(opts)
  # TODO: Check if <:lookup> is boolean.

  # TODO: Check if <:replace_blanks> is boolean.

  # TODO: Check if <:options> is a string.

  # TODO: Check if <:options> contains only allowed values.

  # TODO: Ensure that <:options> contains <-sgml>.

  # TODO: Check if <:blank_tag> is a string.

  # TODO: Ensure that <:blank_tag> is a valid SGML sequence.

  # Set the model and binary paths if not provided.
  [:binary, :model].each do |key|
    next unless opts[key].nil?

    opts[key] = ENV.fetch("TREETAGGER_#{key.to_s.upcase}") do |missing|
      fail UserError, "Provide a value for <:#{key}>" +
        " or set the environment variable <#{missing}>!"
    end
  end

  # Set the lexicon path if not provided but requested. The path goes
  # into <:lexicon>; <:lookup> stays the flag the caller passed.
  if opts[:lookup] && opts[:lexicon].nil?
    opts[:lexicon] = ENV.fetch('TREETAGGER_LEXICON') do
      fail UserError, 'Provide a value for <:lexicon>' +
        ' or set the environment variable <TREETAGGER_LEXICON>!'
    end
  end

  # TODO: Check for existence and readability of external files:
  # * binary;
  # * model;
  # * lexicon (if applicable).

  opts
end