Module: OpenNLP

Defined in:
lib/rbbt/nlp/open_nlp/sentence_splitter.rb

Constant Summary collapse

MAX =
5
@@FileInputStream =
Rjb::import('java.io.FileInputStream')
@@SentenceModel =
Rjb::import('opennlp.tools.sentdetect.SentenceModel')
@@SentenceDetectorME =
Rjb::import('opennlp.tools.sentdetect.SentenceDetectorME')

Class Method Summary collapse

Class Method Details

.sentence_split_detector ⇒ Object



17
18
19
20
21
22
23
24
25
26
27
# File 'lib/rbbt/nlp/open_nlp/sentence_splitter.rb', line 17

def self.sentence_split_detector
  @@sentence_split_detector ||= begin
                                  modelIn = @@FileInputStream.new(Rbbt.software.opt.OpenNLP.models["da-sent.bin"].produce.find);

                                  model = @@SentenceModel.new(modelIn);
                                  modelIn.close()
                                  model

                                  @@SentenceDetectorME.new(model)
                                end
end

.sentence_splitter(text) ⇒ Object



29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/rbbt/nlp/open_nlp/sentence_splitter.rb', line 29

def self.sentence_splitter(text)
  return [] if text.nil? or text.empty?

  text = Misc.to_utf8(text)
  last = 0
  begin
    sentence_split_detector = self.sentence_split_detector
    
    sentences = nil
    TmpFile.with_file do |tmpfile|
      start_time = Time.now

      begin
        pid = Process.fork do
          sent = sentence_split_detector.sentDetect(text)
          Open.write(tmpfile, sent * "#OpenNLP:SENTENCE#")
        end

        while not Process.waitpid(pid)
          if Time.now - start_time > MAX
            Process.kill(9, pid)
            raise "Taking to long (> #{MAX} seconds)"
          end
          sleep 0.1
        end

        begin
          Process.waitpid(pid)
        end
      rescue Errno::ECHILD
      end

      sentences = Open.read(tmpfile).split("#OpenNLP:SENTENCE#")
    end

    sentences.collect{|sentence|
      sentence = Misc.to_utf8(sentence)
      start = text.index(sentence, last)
      Segment.setup sentence, start
      last = start + sentence.length - 1
      sentence
    }
  rescue Exception
    raise $!
    raise "Sentence splitter raised exception: #{$!.message}"
  end
end