Class: TermExtractor::TermContext

Inherits:
Object
  • Object
show all
Defined in:
lib/term-extractor.rb

Overview

This class holds all the state needed for term calculations on a single sentence. It uses chunking and part-of-speech tagging information to mark each token in the sentence as to whether it is allowed to start a term, end a term, and whether terms can cross it. Terms are then calculated by simply looking for all sequences of tokens, up to the maximum length, which meet these constraints.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(parent, sentence) ⇒ TermContext

Returns a new instance of TermContext.



49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/term-extractor.rb', line 49

# Build the per-sentence analysis state: clean the raw sentence text,
# apply the parent's optional URL/path stripping, then tokenize,
# POS-tag and chunk the result via the parent's NLP toolkit.
def initialize(parent, sentence)
  @parent = parent

  cleaned = NLP.clean_sentence(sentence)

  # User-configurable cleaning, controlled by flags on the parent.
  cleaned = NLP.remove_urls(cleaned)  if parent.remove_urls
  cleaned = NLP.remove_paths(cleaned) if parent.remove_paths

  @sentence = cleaned
  @tokens   = NLP.tokenize_sentence(cleaned)
  @postags  = nlp.postagger.tag(tokens)
  @chunks   = nlp.chunker.chunk(tokens, postags)
end

Instance Attribute Details

#chunksObject

Returns the value of attribute chunks.



43
44
45
# File 'lib/term-extractor.rb', line 43

# Reader for the chunk tags assigned to @chunks in #initialize.
def chunks
  @chunks
end

#parentObject

Returns the value of attribute parent.



43
44
45
# File 'lib/term-extractor.rb', line 43

# Reader for the owning extractor assigned to @parent in #initialize.
def parent
  @parent
end

#postagsObject

Returns the value of attribute postags.



43
44
45
# File 'lib/term-extractor.rb', line 43

# Reader for the part-of-speech tags assigned to @postags in #initialize.
def postags
  @postags
end

#tokensObject

Returns the value of attribute tokens.



43
44
45
# File 'lib/term-extractor.rb', line 43

# Reader for the sentence tokens assigned to @tokens in #initialize.
def tokens
  @tokens
end

Instance Method Details

#boundariesObject

This is the bit where all the work happens



68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# File 'lib/term-extractor.rb', line 68

# Computes a per-token hash of flags constraining term extraction:
#   :can_start - a term may begin at this token
#   :can_end   - a term may end at this token
#   :can_cross - this token may appear in the interior of a term
# Flags are derived from POS tags, chunk tags and stopword checks.
# The result is memoized in @boundaries and consumed by #terms.
def boundaries
  return @boundaries if @boundaries

  # To each token we assign three attributes which determine how it may occur within a term.
  # can_cross determines if this token can appear internally in a term
  # can_start determines if a term is allowed to start with this token
  # can_end determines if a term is allowed to end with this token
  @boundaries = tokens.map{|t| {}}

  @boundaries.each_with_index do |b, i|
    # WARNING: It's important to only write boundaries for indices
    # <= i. Otherwise the next loop iteration will overwrite the
    # set value.


    tok = tokens[i]
    pos = postags[i]
    chunk = chunks[i]

    # Cannot cross commas (the "," POS tag). Coordinating
    # conjunctions are handled separately below.
    b[:can_cross] = !(pos =~ /,/)

    # Words which are especially stop-wordy and shouldn't appear
    # inside terms.
    # FIXME: This is a hack. We're really hitting the limit of
    # rule based systems here
    b[:can_cross] &&= ![
      "after", 
      "where",
      "when",
      "for",
      "at",
      "to",
      "with"
    ].include?(tok)
 
    # Cannot cross the beginning of verb phrases (B-VP chunk tag),
    # i.e. we may start with verb terms but not include them.
    b[:can_cross] = (chunk != "B-VP") if b[:can_cross]
    
    # We generate tags like <PATH>, <URL> and <QUOTE>
    # to encapsulate various sorts of noise strings; terms may not
    # contain such placeholder tokens.
    b[:can_cross] &&= !(tok =~ /<\w+>/)

    # We are only allowed to start terms on the beginning of a noun-phrase chunk.
    b[:can_start] = (chunks[i] == "B-NP")

    # In some cases we want to move the start of a term to the right. These cases are:
    # - a determiner (the, a, etc)
    # - a possessive pronoun (my, your, etc)
    # - comparative and superlative adjectives (best, better, etc.)
    # - A number. In this case note that starting with the number is also allowed. e.g. "two cities" will produce both "two cities"
    # In all cases we only do this for noun terms, and will only move them to internal points.
    # NOTE(review): at i == 0, postags[i-1] is postags[-1] — Ruby
    # negative indexing reads the LAST tag. Presumably harmless since
    # a chunk sequence shouldn't begin with I-NP, but worth confirming.
    if (chunks[i] == "I-NP") && (postags[i-1] =~ /DT|WDT|PRP|JJR|JJS|CD/)
        b[:can_start] = true 
    end

    # We must include any tokens internal to the current chunk.
    # (chunks[i + 1] is nil on the final token, so ending there is allowed.)
    b[:can_end] = !(chunks[i + 1] =~ /I-/)

    # We break phrases around coordinating conjunctions (and, or, etc)
    # but allow phrases that should rightfully be forced to continue past
    # the conjunction. e.g. in "nuts and bolts", we allow "nuts" and "bolts" 
    # but not the whole phrase. This is true even if this resolves as a single
    # chunk
    if pos == 'CC'
      @boundaries[i-1][:can_end] = true if i > 0        
      @boundaries[i][:can_cross] = false
    end   
    # Need to do this here rather than in the previous if statement,
    # as otherwise the next pass along would overwrite the result
    # we set here.
    if i > 0 && @postags[i-1] == 'CC'
      @boundaries[i][:can_start] = true
    end

    # It is permitted to cross stopwords, but they cannot lie at the term boundary
    if (nlp.stopword? tok) || (nlp.stopword? tokens[i..i+1].join) # Need to take into account contractions, which span multiple tokens
      b[:can_end] = false
      b[:can_start] = false
    end

    # The presence of a ' at the start of a token is most likely an indicator that we've
    # split across a contraction. e.g. would've -> would 've. We are not allowed to 
    # cross this transition point.
    # NOTE(review): at i == 0 this writes @boundaries[-1] (the last
    # token) via negative indexing — looks unintended; confirm a
    # sentence never begins with a '-prefixed token.
    if tok =~ /^'/
      b[:can_start] = false
      @boundaries[i - 1][:can_end] = false
    end

    # Common sources of crap starting words: conjunctions, pronouns,
    # prepositions, determiners, comparatives, etc. (by POS tag).
    b[:can_start] &&= !(pos =~ /CC|PRP|IN|DT|PRP\$|WP|WP\$|TO|EX|JJR|JJS/)

    # Terms may only end on noun-like tags, foreign words or numbers.
    # TODO: Is this still a good idea?
    b[:can_end] &&= (pos =~ /NN|NNS|NNP|NNPS|FW|CD/)

  end

  @boundaries
end

#nlpObject



45
46
47
# File 'lib/term-extractor.rb', line 45

# Convenience delegator to the NLP toolkit held by the parent extractor.
def nlp
  parent.nlp
end

#termsObject



169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
# File 'lib/term-extractor.rb', line 169

# Extracts all candidate terms from the sentence. A two-pointer scan
# considers every token span [i, j] of up to parent.max_term_length
# tokens whose endpoints and interior satisfy the flags computed by
# #boundaries; each qualifying span becomes a Term, kept only if it
# passes TermExtractor.allowed_term?. The result is memoized in @terms.
def terms
  return @terms if @terms

  @terms = []

  i = 0
  j = 0
  while i < tokens.length
    # A term can neither start at nor pass through this token: advance.
    if !boundaries[i][:can_start] || !boundaries[i][:can_cross]
      i += 1
      next
    end

    j = i if j < i

    # Stop extending from this start once we run off the sentence,
    # hit an uncrossable token, or exceed the maximum term length.
    if (j == tokens.length) || !boundaries[j][:can_cross] || (j >= i + parent.max_term_length)
      i += 1
      j = i
      next
    end

    unless boundaries[j][:can_end]
      j += 1
      next
    end

    span  = tokens[i..j]
    poses = postags.to_a[i..j]
    term = Term.new(span) { |it|
      it.pos = poses.join("-")
      it.chunks = chunks.to_a[i..j]
    }
    # Append directly to @terms. The previous `terms << term` only
    # worked by accident: it re-entered this memoized method, which
    # returned the already-assigned @terms array.
    @terms << term if TermExtractor.allowed_term?(term)

    j += 1
  end

  @terms
end