Class: WordnetBasedSimilarity

Inherits:
Object
  • Object
show all
Defined in:
lib/automated_metareview/wordnet_based_similarity.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Instance Attribute Details

#countObject

Returns the value of attribute count.



5
6
7
# File 'lib/automated_metareview/wordnet_based_similarity.rb', line 5

# Reader for @count, the number of token comparisons recorded by the
# comparison methods of this class.
def count; @count; end

#matchObject

Returns the value of attribute match.



5
6
7
# File 'lib/automated_metareview/wordnet_based_similarity.rb', line 5

# Reader for @match, the running sum of match weights accumulated by the
# comparison methods of this class.
def match; @match; end

Instance Method Details

#check_match(rev_token, subm_token, rev_arr, subm_arr, rev_stem, subm_stem, rev_state, subm_state, match_type, non_match_type) ⇒ Object

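This method checks whether the review token's relation words (e.g. its synonyms or antonyms) contain the submission token or its stem, and whether the submission token's relation words contain the review token or its stem.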

The instance variables 'match' and 'count' are updated accordingly.


323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
# File 'lib/automated_metareview/wordnet_based_similarity.rb', line 323

# Looks for a relation hit between one review token and one submission token.
#
# A hit occurs when the review-side relation list (rev_arr) contains the
# submission token or its stem, or when the submission-side list (subm_arr)
# contains the review token or its stem. A nil list never produces a hit.
#
# On a hit, @match grows by match_type when rev_state == subm_state and by
# non_match_type otherwise, and @count is incremented by one. Nothing is
# updated on a miss.
#
# Returns true when a hit was recorded, false otherwise.
def check_match(rev_token, subm_token, rev_arr, subm_arr, rev_stem, subm_stem, rev_state, subm_state, match_type, non_match_type)
  in_review_list = !rev_arr.nil? && (rev_arr.include?(subm_token) || rev_arr.include?(subm_stem))
  found = in_review_list ||
          (!subm_arr.nil? && (subm_arr.include?(rev_token) || subm_arr.include?(rev_stem)))
  return false unless found

  weight = rev_state == subm_state ? match_type : non_match_type
  @match += weight
  @count += 1
  true
end

#compare_strings(reviewVertex, submVertex, speller) ⇒ Object

@@posTagger = EngTagger.new



7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
# File 'lib/automated_metareview/wordnet_based_similarity.rb', line 7

# Scores how closely a review vertex and a submission vertex are related.
#
# reviewVertex, submVertex - objects responding to #name and #state (plus
#                            whatever determine_POS reads from them)
# speller                  - spell checker passed through to find_stem_word
#
# Resets @match and @count, then:
# * if the two names are equal ignoring case, returns EXACT (equal states) or
#   NEGEXACT (different states) immediately;
# * otherwise compares every non-frequent review token with every non-frequent
#   submission token, adding one weight to @match and 1 to @count per pair,
#   and returns the rounded average weight. NOMATCH is returned when no pair
#   was compared.
#
# For each pair the checks run from the strongest to the weakest relation and
# stop at the first hit: exact token/stem, synonym, antonym, hypernym,
# hyponym, definition overlap, and finally NOMATCH.
def compare_strings(reviewVertex, submVertex, speller)
  review = reviewVertex.name
  submission = submVertex.name
  reviewState = reviewVertex.state
  submState = submVertex.state
  # Compared by value (==), the same way check_match compares states;
  # equal? would test object identity.
  same_state = (reviewState == submState)

  @match = 0
  @count = 0

  # Whole-name match. Frequent words are deliberately not filtered here,
  # otherwise two identical frequent words would be scored as NOMATCH and
  # drag the total match value down.
  if review.casecmp(submission) == 0
    @match += (same_state ? EXACT : NEGEXACT)
    return @match
  end

  # The POS is resolved once per vertex rather than per token: resetting it
  # for every token would change the POS of the vertex, e.g. "like" (n) vs
  # "would like" (v).
  reviewPOS = ""
  submPOS = ""
  stokSub = submission.split(" ")

  review.split(" ").each do |rev_word|
    revToken = rev_word.downcase
    reviewPOS = determine_POS(reviewVertex).strip if reviewPOS.empty?
    revToken = "not" if revToken == "n't"
    # is_frequent_word also strips brackets and quotes from revToken in place.
    next if is_frequent_word(revToken)

    revStem = find_stem_word(revToken, speller)
    revGloss, revSyn, revHyper, revHypo, revAnt =
      get_relations_for_review_submission_tokens(revToken, revStem, reviewPOS)

    stokSub.each do |subm_word|
      subToken = subm_word.downcase
      submPOS = determine_POS(submVertex).strip if submPOS.empty?
      subToken = "not" if subToken == "n't"
      next if is_frequent_word(subToken)

      submStem = find_stem_word(subToken, speller)
      submGloss, submSyn, submHyper, submHypo, submAnt =
        get_relations_for_review_submission_tokens(subToken, submStem, submPOS)

      # Exact token or stem match.
      if subToken.casecmp(revToken) == 0 || submStem.casecmp(revStem) == 0
        @match += (same_state ? EXACT : NEGEXACT)
        @count += 1
        next
      end

      # check_match updates @match/@count itself and returns true on a hit,
      # in which case the weaker checks are skipped.
      next if check_match(revToken, subToken, revSyn, submSyn, revStem, submStem, reviewState, submState, SYNONYM, ANTONYM)
      next if check_match(revToken, subToken, revAnt, submAnt, revStem, submStem, reviewState, submState, ANTONYM, SYNONYM)
      next if check_match(revToken, subToken, revHyper, submHyper, revStem, submStem, reviewState, submState, HYPERNYM, NEGHYPERNYM)
      next if check_match(revToken, subToken, revHypo, submHypo, revStem, submStem, reviewState, submState, HYPONYM, NEGHYPONYM)

      # Definition overlap: one side's definition text (first gloss entry)
      # contains the other side's token or stem.
      rev_def = revGloss.nil? ? nil : revGloss[0]
      subm_def = submGloss.nil? ? nil : submGloss[0]
      rev_def_hit = !rev_def.nil? && !submStem.nil? &&
                    (rev_def.include?(subToken) || rev_def.include?(submStem))
      if rev_def_hit ||
         (!subm_def.nil? && !revStem.nil? &&
          (subm_def.include?(revToken) || subm_def.include?(revStem)))
        @match += (same_state ? OVERLAPDEFIN : NEGOVERLAPDEFIN)
        @count += 1
        next
      end

      # None of the checks hit.
      @match += NOMATCH
      @count += 1
    end
  end

  return NOMATCH unless @count > 0

  # Average of the weights recorded for all compared token pairs.
  (@match.to_f / @count.to_f).round
end

#determine_POS(vert) ⇒ Object

determine_POS - method helps identify the POS tag (for the wordnet lexicon) for a certain word



351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
# File 'lib/automated_metareview/wordnet_based_similarity.rb', line 351

# Maps the vertex's POS tag string (vert.pos_tag) to a one-letter POS code:
# "n" (noun), "a" (adjective), "v" (verb) or "r" (adverb).
#
# The tag is tested by substring, in the order noun, adjective, verb, adverb;
# anything unrecognised falls back to "n".
def determine_POS(vert)
  tag = vert.pos_tag
  return "n" if %w[CD NN PR IN EX WP].any? { |marker| tag.include?(marker) } # noun-like
  return "a" if tag.include?("JJ") # adjective
  return "v" if %w[TO VB MD].any? { |marker| tag.include?(marker) } # verb-like
  return "r" if tag.include?("RB") # adverb

  "n" # default: noun
end

#extract_definition(glosses) ⇒ Object

This method is used to extract the definitions of a word from its gloss (since glosses contain both definitions and examples).

glosses - string containing the gloss of the synset


415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
# File 'lib/automated_metareview/wordnet_based_similarity.rb', line 415

# Returns the definition part of a gloss string.
#
# The gloss is split on ";" and every segment containing a double quote
# (i.e. a quoted usage example) is dropped. The remaining segments are joined
# with a single space; segments are not trimmed. An empty string is returned
# when nothing remains.
def extract_definition(glosses)
  kept = glosses.split(";").reject { |segment| segment.include?('"') }
  kept.reduce("") do |definitions, segment|
    # While nothing has been collected yet the segment replaces the (empty)
    # accumulator, so leading empty segments leave no stray separator.
    definitions.empty? ? segment : definitions + " " + segment
  end
end

#find_stem_word(word, speller) ⇒ Object

find_stem_word - stems the word and checks if the word is correctly spelt, else it will return a correctly spelled word as suggested by spellcheck

It generates the nearest stem; since no context information is involved, the quality of the stems may not be great.


394
395
396
397
398
399
400
401
402
403
404
405
406
407
# File 'lib/automated_metareview/wordnet_based_similarity.rb', line 394

# Stems the word and, while the speller rejects the current candidate,
# replaces it with the speller's first suggestion.
#
# word    - string responding to #stem
# speller - object responding to #check(word) and #suggest(word); suggest is
#           expected to return a list whose first entry is the best suggestion
#
# Returns the first candidate the speller accepts. If the speller has no
# suggestion, or suggests a word that was already tried, the current
# candidate is returned as is, so the loop always terminates.
def find_stem_word(word, speller)
  correct = word.stem
  tried = [correct]
  until speller.check(correct)
    suggestion = speller.suggest(correct).first # looked up once per round
    # Stop when there is nothing to try, or when the suggestions loop back to
    # a word that already failed the check (this would otherwise spin forever).
    break if suggestion.nil? || tried.include?(suggestion)

    tried << suggestion
    correct = suggestion
  end
  correct
end

#get_relations_for_review_submission_tokens(token, stem, pos) ⇒ Object

This method fetches the synonyms, hypernyms, hyponyms and other relations for the ‘token’ and its stem ‘stem’.

This is done for both review and submission tokens/stems.
It returns a two-dimensional array in which each element is itself an array: the definitions, synonyms, hypernyms, hyponyms and antonyms, in that order.


200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
# File 'lib/automated_metareview/wordnet_based_similarity.rb', line 200

# Collects the WordNet definitions and related words for a token.
#
# token - the word to look up
# stem  - its stem, looked up when the token itself yields no lemmas
# pos   - POS code; the first lemma whose #pos equals it is used
#
# Returns a five-element array
#   [definitions, synonyms, hypernyms, hyponyms, antonyms]
# Each element is an array. All five are empty when no lemma with the
# requested POS is found. Otherwise, for every synset of the lemma:
# * definitions: the extracted definitions are concatenated (space separated)
#   into element 0; a synset without a gloss appends nil;
# * each relation array receives the words of the related synsets, or a
#   single nil when the synset has no such relation or reading it raised.
def get_relations_for_review_submission_tokens(token, stem, pos)
  # Fall back to the stem when the token lookup gives nothing back. An empty
  # result is treated like nil. NOTE(review): WordNetDB.find appears to
  # return an empty collection rather than nil for unknown words - confirm.
  lemmas = Array(WordNet::WordNetDB.find(token))
  lemmas = Array(WordNet::WordNetDB.find(stem)) if lemmas.empty?

  # Select the lemma matching the token's POS (nil when there is none).
  lemma = lemmas.find { |candidate| candidate.pos == pos }

  def_arr = []
  syn_arr = []
  hyper_arr = []
  hypo_arr = []
  anto_arr = []

  # Appends the words of every synset related to `synset` through `pointer`
  # to `words`; appends nil when there is no such relation. Errors are
  # swallowed on purpose: per the original author, some lemmas raise while
  # their relations are read (likely due to the dictionary file in use).
  append_related_words = lambda do |synset, pointer, words|
    begin
      related = synset.get_relation(pointer)
      if related.nil? || related.length == 0
        words << nil
      else
        related.each { |related_synset| words.concat(related_synset.words) }
      end
    rescue
      words << nil
    end
  end

  # Read the synset list once instead of on every loop iteration.
  synsets = lemma.nil? ? nil : lemma.synsets
  unless synsets.nil?
    synsets.each do |lemma_synset|
      gloss = lemma_synset.gloss
      if gloss.nil?
        def_arr << nil
      elsif def_arr[0].nil?
        def_arr << extract_definition(gloss)
      else
        def_arr[0] = def_arr[0] + " " + extract_definition(gloss)
      end

      # NOTE(review): "&" is used for synonyms here; in WordNet's pointer
      # table "&" looks like the "similar to" relation - confirm intent.
      append_related_words.call(lemma_synset, "&", syn_arr)   # synonyms
      append_related_words.call(lemma_synset, "@", hyper_arr) # hypernyms
      append_related_words.call(lemma_synset, "~", hypo_arr)  # hyponyms
      append_related_words.call(lemma_synset, "!", anto_arr)  # antonyms
    end
  end

  [def_arr, syn_arr, hyper_arr, hypo_arr, anto_arr]
end

#is_frequent_word(word) ⇒ Object

is_frequent_word - method checks to see if the given word is a frequent word



372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
# File 'lib/automated_metareview/wordnet_based_similarity.rb', line 372

# Tells whether the word is listed in FREQUENT_WORDS or CLOSED_CLASS_WORDS.
#
# Side effect: every "(", ")", "[", "]" and double quote is first removed
# from `word` IN PLACE, so the caller's string is cleaned as well. gsub!
# returns nil when a character is absent, which leaves the string untouched.
#
# Returns true or false.
def is_frequent_word(word)
  ["(", ")", "[", "]", "\""].each { |symbol| word.gsub!(symbol, "") }

  listed = FREQUENT_WORDS.include?(word) || CLOSED_CLASS_WORDS.include?(word)
  listed ? true : false
end

#overlap(def1, def2, speller) ⇒ Object




434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
# File 'lib/automated_metareview/wordnet_based_similarity.rb', line 434

# Counts the word overlaps between two lists of definition strings.
#
# def1, def2 - arrays of definition strings (nil entries are skipped)
# speller    - spell checker passed through to find_stem_word
#
# Every non-nil def1 entry is compared with every non-nil def2 entry, token
# by token (split on spaces). A pair of tokens counts as one overlap when the
# tokens or their stems are equal ignoring case and neither the def1 token
# nor its stem is a frequent word.
#
# Side effects on the arguments: in def1 entries double quotes are replaced
# by spaces; in both lists an entry containing ";" is cut down to the text
# before the first ";". NOTE(review): quotes are only replaced on the def1
# side - confirm whether def2 should be treated the same way.
#
# Returns the number of overlaps (0 when either list is nil).
def overlap(def1, def2, speller)
  return 0 if def1.nil? || def2.nil?

  numOverlap = 0
  # definition text => [[token, stem], ...]; def2 tokens are never modified,
  # so they are split and stemmed once per distinct definition instead of
  # once per def1 token.
  stemmed_tokens = {}

  def1.each_index do |i|
    next if def1[i].nil?

    def1[i].gsub!("\"", " ") if def1[i].include?("\"")
    # Exclusive range: keep only what precedes the first ";".
    def1[i] = def1[i][0...def1[i].index(";")] if def1[i].include?(";")

    def2.each_index do |j|
      next if def2[j].nil?

      def2[j] = def2[j][0...def2[j].index(";")] if def2[j].include?(";")

      s1 = def1[i].split(" ")
      next if s1.empty?

      s2 = (stemmed_tokens[def2[j]] ||=
              def2[j].split(" ").map { |tok2| [tok2, find_stem_word(tok2, speller)] })

      s1.each do |tok1|
        tok1stem = find_stem_word(tok1, speller)
        s2.each do |tok2, tok2stem|
          same_word = tok1.downcase == tok2.downcase || tok1stem.downcase == tok2stem.downcase
          # is_frequent_word strips brackets/quotes from its argument in
          # place, so it is only consulted once the words are known to match.
          if same_word && !is_frequent_word(tok1) && !is_frequent_word(tok1stem)
            numOverlap += 1
          end
        end
      end
    end
  end

  numOverlap
end