Class: Banner

Inherits:
NER
  • Object
show all
Defined in:
lib/rbbt/ner/banner.rb

Overview

Offers a Ruby interface to Banner, a Named Entity Recognition package written in Java.

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from NER

#entities

Constructor Details

#initialize(modelfile = Rbbt.software.opt.BANNER["gene_model.bin"].find, lemmadir = Rbbt.software.opt.BANNER.nlpdata.lemmatiser.find, taggerdir = Rbbt.software.opt.BANNER.nlpdata.tagger.find) ⇒ Banner

All parameters have default values; the only one you might want to change is modelfile, to point it at a custom-trained model.



27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# File 'lib/rbbt/ner/banner.rb', line 27

# Builds a Banner tagger backed by the Java classes bound in Banner.init.
#
# Every argument has a default, so calling `Banner.new` with no arguments
# works; pass +modelfile+ to use a different trained model.
#
# modelfile - path of the model file handed to CRFTagger.load.
# lemmadir  - directory handed to the EngLemmatiser constructor.
# taggerdir - directory handed to the HeppleTagger constructor.
def initialize(modelfile = Rbbt.software.opt.BANNER["gene_model.bin"].find,
               lemmadir  = Rbbt.software.opt.BANNER.nlpdata.lemmatiser.find,
               taggerdir = Rbbt.software.opt.BANNER.nlpdata.tagger.find)
  # Make sure the Java class handles (@@JFile, @@CRFTagger, ...) are bound.
  Banner.init

  @tokenizer = @@SimpleTokenizer.new

  model_file = @@JFile.new(modelfile)
  lemmatiser = @@EngLemmatiser.new(lemmadir, false, true)
  pos_tagger = @@HeppleTagger.new(taggerdir)

  # Avoid collisions with metaprogramming that may have defined a Ruby-level
  # `load` (activesupport in particular). RJB seems to forward calls to Java
  # through method_missing, so any `load` visible on the proxy's singleton
  # class is removed before CRFTagger.load is invoked below.
  crf_singleton = class << @@CRFTagger; self; end
  crf_singleton.send(:undef_method, :load) if crf_singleton.method_defined?(:load)

  @tagger  = @@CRFTagger.load(model_file, lemmatiser, pos_tagger)
  @parenPP = @@ParenthesisPostProcessor.new
end

Class Method Details

.initObject



12
13
14
15
16
17
18
19
20
# File 'lib/rbbt/ner/banner.rb', line 12

# Binds the Java classes used by Banner to class variables through
# Rjb.import. Each binding is memoized with `||=`, so calling this method
# again does not re-import a class that is already bound.
#
# Returns the value held in @@EngLemmatiser (the last binding evaluated).
def self.init
  # JDK
  @@JFile ||= Rjb.import('java.io.File')

  # banner.*
  @@SimpleTokenizer ||= Rjb.import('banner.tokenization.SimpleTokenizer')
  @@CRFTagger ||= Rjb.import('banner.tagging.CRFTagger')
  @@ParenthesisPostProcessor ||= Rjb.import('banner.processing.ParenthesisPostProcessor')
  @@Sentence ||= Rjb.import('banner.Sentence')

  # dragon.nlp.tool.*
  @@HeppleTagger ||= Rjb.import('dragon.nlp.tool.HeppleTagger')
  @@EngLemmatiser ||= Rjb.import('dragon.nlp.tool.lemmatiser.EngLemmatiser')
end

Instance Method Details

#match(text) ⇒ Object

Returns an array with the mentions found in the provided piece of text.



55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# File 'lib/rbbt/ner/banner.rb', line 55

# Finds gene mentions in +text+ using the Banner tagger.
#
# The caller's string is never modified: normalisation happens on a copy,
# so frozen strings are accepted too.
#
# text - String to analyse; may be nil.
#
# Returns an Array with one String per <GENE> span found in Banner's SGML
# output, in order of appearance. Each string is passed through
# NamedEntity.setup together with its offset in the normalised text (nil
# when the mention cannot be located there) and the type 'GENE'. Returns []
# for nil or blank input.
def match(text)
  return [] if text.nil?

  # Non-destructive normalisation: flatten newlines and replace '|', which
  # makes Banner raise an error.
  text = text.gsub(/\n/, ' ').gsub(/\|/, '/')
  return [] if text.strip.empty?

  # Reinterpret the bytes as binary and drop whatever has no UTF-8 mapping.
  text = text.encode('utf-8', 'binary', :invalid => :replace, :undef => :replace, :replace => '')
  sentence = @@Sentence.new(text)

  @tokenizer.tokenize(sentence)
  @tagger.tag(sentence)
  @parenPP.postProcess(sentence)
  tagged = sentence.getSGML

  # Search position in +text+: looking for each mention after the previous
  # one gives repeated mentions distinct offsets instead of all pointing at
  # the first occurrence.
  cursor = 0
  tagged.scan(/<GENE>(.*?)<\/GENE>/).collect do |(raw)|
    mention = raw.strip
    # Fall back to a search from the start when nothing is found past the
    # cursor, so a mention is never reported with a nil offset if it occurs
    # anywhere in the text.
    offset = text.index(mention, cursor) || text.index(mention)
    cursor = offset + mention.length if offset
    NamedEntity.setup(mention, offset, 'GENE')
    mention
  end
end