Module: NLP

Extended by:
LocalPersist
Defined in:
lib/rbbt/nlp/nlp.rb,
lib/rbbt/nlp/genia/sentence_splitter.rb

Defined Under Namespace

Modules: GdepChunk, GdepToken

Constant Summary

NEW_LINE_MASK =
"\t\t \t  \t"

Class Method Summary

.event_extraction(text) ⇒ Object
.gdep_chunk_sentences(sentences) ⇒ Object
.gdep_chunks(sentence, segment_list) ⇒ Object
.gdep_parse_sentences(sentences) ⇒ Object
.gdep_parse_sentences_extension(sentences) ⇒ Object
.geniass_sentence_splitter(text) ⇒ Object
.geniass_sentence_splitter_extension(text) ⇒ Object
.merge_vp_chunks(chunk_list) ⇒ Object
.process_labels(marked_text, labels) ⇒ Object
.returnFeatures(prevWord, delimiter, nextWord) ⇒ Object

Class Method Details

.event_extraction(text) ⇒ Object



# File 'lib/rbbt/nlp/genia/sentence_splitter.rb', line 92

def self.event_extraction(text)
  events = ""
  marks = ""

  eventCount = 0

  pat = / [^ ]+[.!\?\)\]\"]( +)[^ ]+ /
  for line in text.split(/\n/) do
    while line.match(pat) do
      line.sub!(/ ([^ ]+)([.!\?\)\]\"])( +)([^ ]+) /){
        a, b, d, c = $1, $2, $3, $4
        events << eventCount.to_s  << "\t"
        events << returnFeatures(a, b, c)
        (" " << a << b << "__" << eventCount.to_s << "____" << d << "__" << c << " ")
      }
      eventCount += 1
    end
    marks << line
  end

  [events, marks]
end
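
A minimal sketch of calling the extractor directly (the sample text is hypothetical). Each line of events is a tab-separated feature record for one candidate boundary, and marks is the text with candidates rewritten as numbered __N____ markers:

events, marks = NLP.event_extraction("It ends here. Another word follows.")
puts events
puts marks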

.gdep_chunk_sentences(sentences) ⇒ Object



# File 'lib/rbbt/nlp/nlp.rb', line 214

def self.gdep_chunk_sentences(sentences)
  sentences = Array === sentences ? sentences : [sentences]
  NLP.gdep_parse_sentences_extension(sentences).zip(sentences).collect do |segment_list, sentence|
    chunk_list = NLP.gdep_chunks(sentence, segment_list)
    NLP.merge_vp_chunks(chunk_list)
  end
end
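
A usage sketch, assuming Gdep is installed under Rbbt.software.opt.Gdep and that GdepChunk.setup gives chunks a type accessor (the sentence is hypothetical):

NLP.gdep_chunk_sentences("TP53 regulates the cell cycle.").first.each do |chunk|
  puts "#{chunk.type}: #{chunk}"
end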

.gdep_chunks(sentence, segment_list) ⇒ Object



# File 'lib/rbbt/nlp/nlp.rb', line 122

def self.gdep_chunks(sentence, segment_list)
  chunks = []

  chunk_start = "B"[0]
  chunk_inside = "I"[0]

  last = GdepToken.setup("LW")

  chunk_segments = []
  segment_list.each do |segment|
    # "I-" tokens extend the current chunk; anything else flushes it
    if segment.chunk[0] == chunk_inside and not segment.offset.nil?
      chunk_segments << segment
    else
      if chunk_segments.any?
        cstart = chunk_segments.first.offset
        cend = chunk_segments.last.end
        chunk = sentence[cstart..cend]
        GdepChunk.setup(chunk, cstart, last.chunk.sub(/^.-/,''), chunk_segments)
        chunks << chunk
      end

      if segment.offset.nil?
        chunk_segments = []
      else
        chunk_segments = [segment]
      end
    end
    last = segment
  end

  # Flush any chunk still open at the end of the sentence
  if chunk_segments.any?
    cstart = chunk_segments.first.offset
    cend = chunk_segments.last.end
    chunk = sentence[cstart..cend]
    GdepChunk.setup(chunk, cstart, last.chunk.sub(/^.-/,''), chunk_segments)
    chunks << chunk
  end

  chunks
end
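
The method expects tokens carrying BIO-style chunk tags ("B-NP", "I-NP", ...), as produced by the parsers below. A sketch wiring the two together, assuming GdepChunk exposes type and offset accessors (the sentence is hypothetical):

sentence = "TP53 regulates the cell cycle."
tokens = NLP.gdep_parse_sentences_extension([sentence]).first
NLP.gdep_chunks(sentence, tokens).each do |chunk|
  puts "#{chunk.type} @#{chunk.offset}: #{chunk}"
end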

.gdep_parse_sentences(sentences) ⇒ Object



# File 'lib/rbbt/nlp/nlp.rb', line 164

def self.gdep_parse_sentences(sentences)
  sentences = Array === sentences ? sentences : [sentences]

  input = sentences.collect{|sentence| sentence.gsub(/\n/, NEW_LINE_MASK)} * "\n"
  sentence_tokens = TmpFile.with_file(input) do |fin|
    out = local_persist(Digest::MD5.hexdigest(input), :Chunks, :string) do
      CMD.cmd("cd #{Rbbt.software.opt.Gdep.find}; ./gdep #{ fin }").read
    end

    out.split(/^$/).collect do |sentence|
      sentence.split(/\n/).collect do |line|
        next if line.empty?
        # gdep emits one token per line as tab-separated columns
        num, token, lemma, chunk, pos, bio, link, dep = line.split(/\t/)
        GdepToken.setup(token, nil, num, lemma, chunk, pos, bio, link, dep)
      end.compact
    end
  end

  sentences.zip(sentence_tokens).collect do |sentence, tokens|
    Segment.align(sentence, tokens)
  end
end
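
Results are cached through local_persist, keyed on the MD5 of the input. A sketch of inspecting the parse, assuming GdepToken exposes accessors matching the fields passed to GdepToken.setup (lemma, pos, chunk, ...):

NLP.gdep_parse_sentences("TP53 regulates the cell cycle.").first.each do |token|
  puts [token, token.lemma, token.pos, token.chunk] * "\t"
end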

.gdep_parse_sentences_extension(sentences) ⇒ Object



# File 'lib/rbbt/nlp/nlp.rb', line 188

def self.gdep_parse_sentences_extension(sentences)
  # Load and initialize the native Gdep extension once per process
  require Rbbt.software.opt.Gdep.ruby["Gdep.so"].find
  gdep = Gdep.new
  if not gdep.gdep_is_loaded
    Misc.in_dir Rbbt.software.opt.Gdep.find do
      gdep.load_gdep
    end
  end

  sentences = Array === sentences ? sentences : [sentences]

  sentence_tokens = sentences.collect{|sentence|
    Gdep.new.tag(sentence).split(/\n/).collect do |line|
      next if line.empty?
      token, lemma, pos, chunk = line.split(/\t/)
      GdepToken.setup(token, nil, nil, lemma, chunk, pos)
      token
    end.compact
  }

  sentences.zip(sentence_tokens).collect do |sentence, tokens|
    Segment.align(sentence, tokens)
    tokens
  end
end
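
Same contract as gdep_parse_sentences, but tagging happens in-process through the native extension instead of shelling out. A sketch under the same accessor assumptions as above:

tokens = NLP.gdep_parse_sentences_extension(["TP53 regulates the cell cycle."]).first
tokens.each { |t| puts [t, t.pos, t.chunk] * "\t" }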

.geniass_sentence_splitter(text) ⇒ Object



# File 'lib/rbbt/nlp/nlp.rb', line 27

def self.geniass_sentence_splitter(text)
  offsets = []

  cleaned = text.gsub("\n",NEW_LINE_MASK)
  TmpFile.with_file(cleaned) do |fin|
    TmpFile.with_file do |fout|
      CMD.cmd("cd #{Rbbt.software.opt.Geniass.find}; ./geniass #{ fin } #{ fout }")

      
      Open.write(fin, Open.read(fin).gsub(NEW_LINE_MASK, "\n"))
      Open.write(fout, Open.read(fout).gsub("\n", '|').gsub(NEW_LINE_MASK, "\n"))
      # Adapted from sentence2standOff.rb in the Geniass package

      inTxtStrict = Open.open(fin)
      inTxtNew = Open.open(fout)

      marker = "|"[0]
      position = 0
      sentenceCount = 1
      target = ''
      targetNew = ''
      start = 0
      finish = 0

      # Walk the original and the marked output in parallel to recover
      # sentence [start, end] offsets in the original text
      while(!inTxtNew.eof?) do
        targetNew = inTxtNew.getc
        target = inTxtStrict.getc
        position += 1
        if targetNew == marker
          sentenceCount += 1
          finish = position - 1
          offsets << [start, finish] if finish - start > 10
          if targetNew == target
            start = position
          else
            targetNew = inTxtNew.getc
            while targetNew != target do
              target = inTxtStrict.getc
              position += 1
            end
            start = position - 1
          end
        end
      end

      finish = position - 1
      offsets << [start, finish] if finish > start

      inTxtStrict.close
      inTxtNew.close
    end
  end

  offsets.collect do |s,e|
    sentence = text[s..e]
    next if sentence.nil?
    #sentence.gsub!(NEW_LINE_MASK, "\n")
    Segment.setup sentence, s
    sentence
  end
end
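
A usage sketch, assuming the Geniass binary is available under Rbbt.software.opt.Geniass and that Segment.setup gives each sentence an offset accessor; compact drops the nil entries the collect may produce:

text = "TP53 is a tumor suppressor. It regulates the cell cycle."
NLP.geniass_sentence_splitter(text).compact.each do |sentence|
  puts "#{sentence.offset}: #{sentence}"
end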

.geniass_sentence_splitter_extension(text) ⇒ Object



# File 'lib/rbbt/nlp/genia/sentence_splitter.rb', line 171

def self.geniass_sentence_splitter_extension(text)
  Rbbt.software.opt.Geniass.produce
  require Rbbt.software.opt.Geniass.ruby["Geniass.so"].find
  geniass = Geniass.new
  if not geniass.geniass_is_loaded
    Misc.in_dir Rbbt.software.opt.Geniass.find do
      geniass.load_geniass
    end
  end

  cleaned = text.gsub("\n",NEW_LINE_MASK)
  events, marks = event_extraction(cleaned)

  labels = events.split(/\n/).collect{|line| 
    geniass.label(line)
  }

  out = process_labels(marks, labels)

  offsets = []

  inTxtStrict = StringIO.new text
  inTxtNew = StringIO.new out.gsub("\n", '|').gsub(NEW_LINE_MASK, "\n")

  marker = "|"[0]
  position = 0
  sentenceCount = 1
  target = ''
  targetNew = ''
  start = 0
  finish = 0

  # Same parallel walk as in geniass_sentence_splitter: recover sentence
  # offsets by comparing the original text with the marked output
  while(!inTxtNew.eof?) do
    targetNew = inTxtNew.getc
    target = inTxtStrict.getc
    position += 1
    if targetNew == marker
      sentenceCount += 1
      finish = position - 1
      offsets << [start, finish] if finish - start > 10
      if targetNew == target
        start = position
      else
        targetNew = inTxtNew.getc
        while targetNew != target do
          target = inTxtStrict.getc
          position += 1
        end
        start = position - 1
      end
    end
  end

  finish = position - 1
  offsets << [start, finish] if finish > start

  inTxtStrict.close
  inTxtNew.close

  offsets.collect do |s,e|
    sentence = text[s..e]
    next if sentence.nil?
    Segment.setup sentence, s
    sentence
  end

end
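
Same contract as geniass_sentence_splitter, but classification happens in-process through the native Geniass extension produced by Rbbt.software.opt.Geniass.produce. A sketch:

NLP.geniass_sentence_splitter_extension(text).compact.each do |sentence|
  puts "#{sentence.offset}: #{sentence}"
end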

.merge_vp_chunks(chunk_list) ⇒ Object



# File 'lib/rbbt/nlp/nlp.rb', line 101

def self.merge_vp_chunks(chunk_list)
  vp = nil
  new_chunks = []
  chunk_list.each do |chunk|
    # Consecutive verb-phrase chunks are merged into the first of the run
    if chunk.type =~ /^VP/
      if vp.nil?
        vp = chunk
      else
        vp << chunk
        vp.parts.concat chunk.parts
      end
    else
      new_chunks << vp if not vp.nil?
      new_chunks << chunk
      vp = nil
    end
  end

  new_chunks
end
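
A sketch of the merge behaviour on a hypothetical chunk list; a run of adjacent VP chunks collapses into its first member, which accumulates the text and parts of the rest:

np, vp1, vp2 = NLP.gdep_chunks(sentence, tokens)  # suppose types NP, VP, VP
NLP.merge_vp_chunks([np, vp1, vp2]).length        # => 2; vp1 absorbed vp2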

.process_labels(marked_text, labels) ⇒ Object



# File 'lib/rbbt/nlp/genia/sentence_splitter.rb', line 136

def self.process_labels(marked_text, labels)
  out = ""

  count = 0
  text_lines = marked_text.split(/\n/)
  line = text_lines.shift
  for label in labels
    pat = "__" + count.to_s + "__"
    # Advance to the line holding the marker for the current candidate
    until line.match(pat) do
      out << line
      line = text_lines.shift
    end
    splitted = label.chomp.to_i

    # A label of 1 means the classifier confirmed the boundary
    line.sub!(pat){
      if splitted == 1
        "__\n__"
      else
        "____"
      end
    }
    # Collapse the helper markers, leaving a newline at confirmed splits
    line.sub!(/__\n____ +__/, "\n")
    line.sub!(/______( +)__/){
      $1
    }
    count += 1
  end

  out << line

  out << text_lines * ""

  out
end
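
A sketch of where this sits in the pipeline (mirroring geniass_sentence_splitter_extension): marks comes from event_extraction and labels holds one classifier verdict per candidate, with "1" confirming a boundary:

events, marks = NLP.event_extraction(masked_text)
labels = events.split(/\n/).collect { |line| geniass.label(line) }
text_with_breaks = NLP.process_labels(marks, labels)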

.returnFeatures(prevWord, delimiter, nextWord) ⇒ Object



# File 'lib/rbbt/nlp/genia/sentence_splitter.rb', line 4

def self.returnFeatures(prevWord, delimiter, nextWord)
  # Remove the __ss__ marker from the next word, if present
  if nextWord.match(/__ss__/)
    nw = nextWord.sub(/__ss__/, "")
  else
    nw = nextWord
  end

  str = ""
  # prev. word, next word
  str += "pw_" + prevWord.downcase
  str += "\tnw_" + nw.downcase

  # delimiter
  str += "\td_" + delimiter

  # capitalized first char in next word (nfc);
  # capitals in next word excluding the first char (nwcef)
  if nw[0].chr == nw[0].chr.capitalize
    str += "\tnfc_y"
    nwExcludingFirst = nw[1..-1]
    if nwExcludingFirst.nil? or nwExcludingFirst.downcase == nwExcludingFirst
      str += "\tnwcef_n"
    else
      str += "\tnwcef_y"
    end
  else
    if nw.downcase == nw
      str += "\tnwcef_n"
    else
      str += "\tnwcef_y"
    end
    str += "\tnfc_n"
  end

  # prev. word capital
  if prevWord.downcase == prevWord
    str += "\tpwc_n"
  else
    str += "\tpwc_y"
  end

  # number in prev. word, in next word
  if prevWord.match(/[0-9]/)
    str += "\tpwn_y"
  else
    str += "\tpwn_n"
  end
  if nw.match(/[0-9]/)
    str += "\tnwn_y"
  else
    str += "\tnwn_n"
  end

  # prev., next word excluding brackets, commas, quotes, etc.
  prevWordEx = prevWord.gsub(/[()'",\[\]]/, "")
  nwEx = nw.gsub(/[()'",\[\]]/, "")
  str += "\tpwex_" + prevWordEx.downcase
  str += "\tnwex_" + nwEx.downcase

  # bracket or quotation in prev. word
  if prevWord.match(/[()'"]/)
    str += "\tpwcbq_y"
  else
    str += "\tpwcbq_n"
  end
  # comma in prev., next word
  if prevWord.match(/,/)
    str += "\tpwcc_y"
  else
    str += "\tpwcc_n"
  end
  if nw.match(/,/)
    str += "\tnwcc_y"
  else
    str += "\tnwcc_n"
  end

  # prev. word + delimiter
  str += "\tpw_" + prevWord + "_d_" + delimiter
  # prev. word ex. +  delimiter + next word ex.
  str += "\tpwex_" + prevWordEx + "_d_" + delimiter + "_nwex_" + nwEx
  str += "\n"
end
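
The return value is a single tab-separated feature line, terminated by a newline, that the boundary classifier consumes. A quick sketch:

puts NLP.returnFeatures("cycle", ".", "It")
# => "pw_cycle\tnw_it\td_.\tnfc_y\t..." (one feature per tab-separated field)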