Class: TermExtractor::NLP

Inherits: Object
Defined in:
lib/term-extractor/nlp.rb

Overview

NLP contains a number of general NLP-related utilities. In particular it contains:

  • a selection of OpenNLP classes

  • a snowball stemmer

  • a stopword list

And various utilities built on top of these.

Constant Summary

JV = Java::OpennlpToolsLangEnglish

Ending = /(!|\?|\.)+/

EmbedBoundaries = [
  ["\"", "\""],
  ["(", ")"],
  ["[", "]"],
  ["{", "}"]
].map{|s| s.map{|x| Regexp.quote(x) }}

Class Method Summary

Instance Method Summary

Constructor Details

#initialize(models) ⇒ NLP



# File 'lib/term-extractor/nlp.rb', line 52

def initialize(models)
  @models = models
  @stopwords = Set.new

  File.open(loc("stopwords")).each_line do |l|
    l.gsub!(/#.+$/, "") # strip trailing comments from the stopword file
    @stopwords.add clean_for_stopword(l)
  end
end
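As the constructor shows, the stopword file is plain text with one entry per line, and anything after a # is stripped as a comment. A hypothetical construction sketch (the path and the models directory layout are assumptions; the layout is resolved internally by the loc helper):

nlp = TermExtractor::NLP.new("/path/to/models")
nlp.stopword?("the")  # => true, assuming "the" appears in the stopword file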

Class Method Details

.clean_sentence(text) ⇒ Object

Once we have split sentences, we clean them up prior to tokenization. We remove or normalize a number of noise sources and get the text into a form where distinct tokens are separated by whitespace.



# File 'lib/term-extractor/nlp.rb', line 85

def NLP.clean_sentence(text)
  text = text.dup
  text.gsub!(/--+/, " -- ") # TODO: What's this for?

  text.gsub!(/…/, "...") # expand ellipsis character

  # Normalize bracket types.
  # TODO: Shouldn't do this inside of tokens.
  text.gsub!(/[{\[]/, "(")
  text.gsub!(/[}\]]/, ")")

  # We turn most forms of punctuation which are not internal to tokens into commas
  punct = /(\"|\(|\)|;|-|\:|\*|,)/

  # Convert cunning "smart" apostrophes into plain old boring
  # dumb ones.
  text.gsub!(/’/, "'")

  text.gsub!(/([\w])\.\.+([\w])/){ "#{$1} , #{$2}"}
  text.gsub!(/(^| )#{punct}+/, " , ")
  text.gsub!(/#{punct}( |$)/, " , ")
  # Put a space before runs of dots (and apostrophes) so they tokenize separately.
  text.gsub!(/(\.+ |')/){" #{$1}"}

  separators = /\//

  text.gsub!(/ #{separators} /, " , ")

  # We can be a bit overeager in turning things into commas, so we clean them up here.
  # In particular we remove any we've accidentally added to the end of lines and we collapse
  # consecutive ones into a single one.
  text.gsub!(/(,|\.) *,/){ " #{$1} " }
  text.gsub!(/(,| )+$/, "")
  text.gsub!(/^(,| )+/, "")

  # Split sentence-final punctuation off into its own token.
  text.gsub!(/((?:\.|\!|\?)+)$/){" #{$1}" }

  # Clean up superfluous whitespace
  text.gsub!(/\s+/, " ")
  text
end
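A hand-traced illustration of the normalization (following the substitutions above, so treat the exact output as approximate):

NLP.clean_sentence("Testing -- one: two, three... done!")
# => "Testing , one , two , three ... done !"
# Dashes, colons and commas collapse to comma tokens, the ellipsis and
# the final punctuation are split off as tokens of their own.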

.clean_text(text) ⇒ Object



# File 'lib/term-extractor/nlp.rb', line 132

def self.clean_text(text)
  text = text.gsub(/\r(\n?)/, "\n") # Evil microsoft line endings, die die die!
  text.gsub!(/^\s+$/, "") # For convenience, remove all spaces from blank lines
  text.gsub!(/\n\n+/m, ".\n.\n") # Collapse multiple line endings into periods
  text.gsub!(/\n/, " ") # Squash the text onto a single line.
  # Separate out things of the form "1. ", as these are commonly list items
  # and OpenNLP sentence detection handles them badly.
  text.gsub!(/(\d+)\. /){ "#{$1} . " }
  text.strip!
  text
end
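A hand-traced example of the effect (approximate):

NLP.clean_text("First para.\r\n\r\nSecond para.\r\n1. a list item")
# => "First para.. . Second para. 1 . a list item"
# CRLF endings are normalized, the paragraph break becomes period
# sentinels, and the numbered-list marker is separated out.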

.extract_embedded_sentences(text) ⇒ Object

Normalise a sentence by removing all parenthetical comments and replacing all embedded quotes contained therein. Returns an array of the sentence and all contained subterms.



# File 'lib/term-extractor/nlp.rb', line 174

def self.extract_embedded_sentences(text)
  text = text.clone
  fragments = [text]

  l = nil
  begin
    l = fragments.length

    EmbedBoundaries.each do |s, e|
      # Symmetric boundaries (quotes) leave a <QUOTE> marker behind;
      # bracketed comments are simply removed.
      replace = (s == e) ? "<QUOTE>" : ""
      matcher = /#{s}[^#{s}#{e}\n]*#{e}/
      text.gsub!(matcher) { |frag| fragments << frag[1..-2]; replace }
    end
  end while fragments.length > l # repeat until no new fragments are found

  if fragments.length > 1
    fragments = fragments.map{|f| extract_embedded_sentences(f) }.flatten
  end

  fragments
end
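A hand-traced sketch of the behaviour (approximate):

NLP.extract_embedded_sentences('The parser (see "the docs") failed.')
# => ["The parser  failed.", "the docs", "see <QUOTE>"]
# The quote is extracted first, leaving a <QUOTE> marker behind; the
# parenthetical is then extracted and dropped from the main sentence.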

.remove_paths(text) ⇒ Object



# File 'lib/term-extractor/nlp.rb', line 146

def self.remove_paths(text)
  text = text.clone

  # Fragments of windows paths
  text.gsub!(/[\w:\\]*\\[\w:\\]*/, "<PATH>")

  # fragments of unix paths
  text.gsub!(/\/[\w\/]+/, "<PATH>")
  text.gsub!(/[\w\/]+\//, "<PATH>")
   
  while text.gsub!(/<PATH>\s+\w+\s+<PATH>/, "<PATH>")
    # concatenate fragments where we have e.g. <PATH> and <PATH>
    # into single paths. This is to take into account paths containing spaces.
  end
  
  text.gsub!(/<PATH>(\s*<PATH>)*/, "<PATH>")
  text
end
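Two hand-traced examples (approximate):

NLP.remove_paths("copy /etc/hosts into the dir C:\\backup\\daily")
# => "copy <PATH> into the dir <PATH>"

NLP.remove_paths("open C:\\Program Files\\app")
# => "open <PATH>"  (the two fragments merge, covering paths with spaces)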

.remove_urls(text) ⇒ Object



# File 'lib/term-extractor/nlp.rb', line 142

def self.remove_urls(text)    
  text.gsub(/\w+:\/\/[^\s]+?(?=\.?(?= |$))/, "<URL>")
end
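For example (hand-traced, approximate):

NLP.remove_urls("details at http://example.com/docs.")
# => "details at <URL>."
# The lookahead keeps a sentence-final full stop out of the URL.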

.tokenize_sentence(string) ⇒ Object



# File 'lib/term-extractor/nlp.rb', line 126

def NLP.tokenize_sentence(string)
  clean_sentence(string).split
end
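For example (hand-traced, approximate):

NLP.tokenize_sentence("Hello, world!")
# => ["Hello", ",", "world", "!"]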

Instance Method Details

#canonicalize(str) ⇒ Object

Canonicalisation gives a string that in some sense captures the “essential character” of a piece of text. It normalizes the text by removing unnecessary words, rearranging it, and stripping suffixes. The result is not itself intended to be a useful representation of the string, but rather a key for determining whether two strings are equivalent.



# File 'lib/term-extractor/nlp.rb', line 67

def canonicalize(str)
  str.
    to_s.
    downcase.
    gsub(/[^\w\s]/, " ").
    split.
    select{|p| !stopword?(p)}.
    map{|p| stem(p) }.
    sort.
    join(" ")
end
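A sketch, given an instance nlp (as constructed above) and assuming "the" and "of" appear in the stopword list:

nlp.canonicalize("The running of the cats")
# => "cat run"
# Stopwords are dropped, "running"/"cats" stem to "run"/"cat",
# and sorting makes the key order-independent.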

#chunk_sentence(sentence) ⇒ Object



# File 'lib/term-extractor/nlp.rb', line 240

def chunk_sentence(sentence)
  tokens = NLP.tokenize_sentence(sentence)
  postags = postagger.tag(tokens)
  tokens.zip(chunker.chunk(tokens, postags).to_a) 
end
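A sketch of the output shape, given an instance nlp; the actual IOB chunk tags depend on the loaded chunker model:

nlp.chunk_sentence("The cat sat on the mat")
# => roughly [["The", "B-NP"], ["cat", "I-NP"], ["sat", "B-VP"],
#             ["on", "B-PP"], ["the", "B-NP"], ["mat", "I-NP"]]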

#chunk_text(text) ⇒ Object



# File 'lib/term-extractor/nlp.rb', line 234

def chunk_text(text)
  result = []
  sentences(text).each{|x| result += chunk_sentence(x)} 
  result
end

#chunker ⇒ Object



# File 'lib/term-extractor/nlp.rb', line 39

def chunker
  @chunker ||= JV::TreebankChunker.new(loc("chunk.bin.gz"))
end

#each_sentence(source) ⇒ Object



# File 'lib/term-extractor/nlp.rb', line 201

def each_sentence(source)
  lines = []

  process_lines = lambda{
    text = lines.join("\n").strip 
    if text != ""
      sentences(text).each{|s| yield(s.gsub("\n", " ")) }
    end
    lines = []
  }

  source.each_line do |line|
    line = line.strip

    if line == ""   
      process_lines.call   
    end

    lines << line
  end

  process_lines.call
end
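A usage sketch, given an instance nlp; any object responding to each_line works as a source, and blank lines are treated as hard paragraph breaks:

require 'stringio'

src = StringIO.new("First sentence. Second one.\n\nA new paragraph.")
nlp.each_sentence(src) {|s| puts s }
# Prints each detected sentence on its own line (exact splits depend
# on the OpenNLP sentence detector model).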

#postag(tokens) ⇒ Object



# File 'lib/term-extractor/nlp.rb', line 225

def postag(tokens)
  if tokens.is_a? String
    tokens = NLP.tokenize_sentence(tokens)
  else
    tokens = tokens.to_a
  end
  tokens.zip(postagger.tag(tokens).to_a) 
end
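A sketch of the output shape, given an instance nlp (Penn Treebank tags; the actual tags depend on the loaded model):

nlp.postag("The cat sat")
# => roughly [["The", "DT"], ["cat", "NN"], ["sat", "VBD"]]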

#postagger ⇒ Object



# File 'lib/term-extractor/nlp.rb', line 35

def postagger
  @postagger ||= JV::PosTagger.new(loc("tag.bin.gz"), tagdict)
end

#sentdetect ⇒ Object



# File 'lib/term-extractor/nlp.rb', line 27

def sentdetect 
  @sentdetect ||= JV::SentenceDetector.new(loc("sd.bin.gz"))
end

#sentences(string) ⇒ Object



# File 'lib/term-extractor/nlp.rb', line 197

def sentences(string)
  sentdetect.sentDetect(NLP.clean_text(string)).
    to_a.
    map{|s| s.strip }.
    select{|s| (s.length > 0) && !(s =~ /^(\.|!|\?)+$/) }
end
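A sketch, given an instance nlp (exact splits depend on the sentence detector model; the select above discards the punctuation-only sentinels that clean_text inserts for paragraph breaks):

nlp.sentences("One sentence. Another one!\n\nA new paragraph.")
# => roughly ["One sentence.", "Another one!.", "A new paragraph."]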

#stem(word) ⇒ Object



# File 'lib/term-extractor/nlp.rb', line 21

def stem(word)
  stemmer.setCurrent(word)
  stemmer.stem
  stemmer.getCurrent 
end
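For example, with the Snowball English stemmer:

nlp.stem("running")  # => "run"
nlp.stem("cats")     # => "cat"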

#stemmer ⇒ Object



# File 'lib/term-extractor/nlp.rb', line 47

def stemmer 
  @stemmer ||= EnglishStemmer.new 
end

#stopword?(word) ⇒ Boolean



# File 'lib/term-extractor/nlp.rb', line 79

def stopword?(word) 
  stopwords.include?(clean_for_stopword(word))
end

#stopwords ⇒ Object



# File 'lib/term-extractor/nlp.rb', line 43

def stopwords 
  @stopwords
end

#tagdict ⇒ Object



# File 'lib/term-extractor/nlp.rb', line 31

def tagdict
  @tagdict ||= Java::OpennlpToolsPostag::POSDictionary.new(loc("tagdict"), true)
end