Class: WordnetBasedSimilarity
- Inherits:
-
Object
- Object
- WordnetBasedSimilarity
- Defined in:
- lib/automated_metareview/wordnet_based_similarity.rb
Instance Attribute Summary collapse
-
#count ⇒ Object
Returns the value of attribute count.
-
#match ⇒ Object
Returns the value of attribute match.
Instance Method Summary collapse
-
#check_match(rev_token, subm_token, rev_arr, subm_arr, rev_stem, subm_stem, rev_state, subm_state, match_type, non_match_type) ⇒ Object
This method compares the review's and the submission's synonyms and antonyms with each other's tokens and stem values.
-
#compare_strings(reviewVertex, submVertex, speller) ⇒ Object
Compares the tokens of a review vertex with those of a submission vertex and returns an averaged WordNet-based match score.
-
#determine_POS(vert) ⇒ Object
determine_POS - method helps identify the POS tag (for the wordnet lexicon) for a certain word.
-
#extract_definition(glosses) ⇒ Object
This method is used to extract definitions for the words (since glosses contain both definitions and examples). glosses - string containing the gloss of the synset.
-
#find_stem_word(word, speller) ⇒ Object
find_stem_word - stems the word and checks whether the stem is correctly spelled; if it is not, it returns a correctly spelled word suggested by the spellchecker. It generates the nearest stem; since no context information is involved, the quality of the stems may not be great!
-
#get_relations_for_review_submission_tokens(token, stem, pos) ⇒ Object
This method fetches the synonyms, hypernyms, hyponyms and other relations for the ‘token’ and its stem ‘stem’.
-
#is_frequent_word(word) ⇒ Object
is_frequent_word - method checks to see if the given word is a frequent word.
-
#overlap(def1, def2, speller) ⇒ Object
Counts the tokens (or their stems) shared between two sets of definitions, ignoring frequent words.
Instance Attribute Details
#count ⇒ Object
Returns the value of attribute count.
5 6 7 |
# File 'lib/automated_metareview/wordnet_based_similarity.rb', line 5
# Reader for @count. compare_strings resets it to 0 and it is incremented
# each time a review/submission token pair is scored.
def count
  @count
end
#match ⇒ Object
Returns the value of attribute match.
5 6 7 |
# File 'lib/automated_metareview/wordnet_based_similarity.rb', line 5
# Reader for @match. compare_strings resets it to 0 and the score of every
# scored review/submission token pair is added to it.
def match
  @match
end
Instance Method Details
#check_match(rev_token, subm_token, rev_arr, subm_arr, rev_stem, subm_stem, rev_state, subm_state, match_type, non_match_type) ⇒ Object
This method compares the review's and the submission's synonyms and antonyms with each other's tokens and stem values.
The instance variables 'match' and 'count' are updated accordingly.
323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 |
# File 'lib/automated_metareview/wordnet_based_similarity.rb', line 323
# Checks whether one side's relation list (synonyms, antonyms, ...) contains
# the other side's token or stem, and scores the pair if it does.
#
# rev_arr / subm_arr         - relation word lists for the review / submission token (may be nil)
# rev_state / subm_state     - states of the two vertices being compared
# match_type / non_match_type - score added to @match when the states are equal / different
#
# Side effects: on a hit, @match is increased by the chosen score and @count by 1.
# Returns true when a hit was found, false otherwise.
def check_match(rev_token, subm_token, rev_arr, subm_arr, rev_stem, subm_stem, rev_state, subm_state, match_type, non_match_type)
  hit_in_review_list = !rev_arr.nil? && (rev_arr.include?(subm_token) || rev_arr.include?(subm_stem))
  hit = hit_in_review_list ||
        (!subm_arr.nil? && (subm_arr.include?(rev_token) || subm_arr.include?(rev_stem)))
  return false unless hit

  # Same state on both sides earns the "match" score, otherwise the "non-match" score.
  @match += rev_state == subm_state ? match_type : non_match_type
  @count += 1
  true
end
#compare_strings(reviewVertex, submVertex, speller) ⇒ Object
Compares the tokens of a review vertex with those of a submission vertex and returns an averaged WordNet-based match score.
7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 |
# File 'lib/automated_metareview/wordnet_based_similarity.rb', line 7
# Scores how closely a review vertex matches a submission vertex.
#
# reviewVertex / submVertex - objects responding to #name and #state (and whatever
#                             determine_POS reads from them)
# speller                   - spell checker handed through to find_stem_word
#
# If the two names are equal ignoring case, returns EXACT (equal states) or
# NEGEXACT (different states) immediately. Otherwise every non-frequent review
# token is compared with every non-frequent submission token, and each pair is
# scored by the first check that succeeds, in this order: exact token/stem match,
# synonyms, antonyms, hypernyms, hyponyms, definition overlap, NOMATCH.
#
# Side effects: resets and then accumulates @match and @count.
# Returns the rounded average score per scored pair, or NOMATCH if no pair was scored.
def compare_strings(reviewVertex, submVertex, speller)
  review = reviewVertex.name
  submission = submVertex.name
  review_state = reviewVertex.state
  subm_state = submVertex.state
  # Compare states by value everywhere (the definition-overlap check already did so).
  same_state = review_state == subm_state

  @match = 0
  @count = 0

  # Whole-phrase exact match: frequent words are deliberately NOT filtered here,
  # otherwise two equal frequent words would be reported as NOMATCH.
  if review.casecmp(submission) == 0
    @match += same_state ? EXACT : NEGEXACT
    return @match
  end

  # POS is determined once per vertex, not per token: the vertex as a whole carries
  # the tag (e.g. "like" (n) versus "would like" (v)).
  review_pos = nil
  subm_pos = nil
  subm_words = submission.split(" ")

  review.split(" ").each do |raw_rev|
    rev_token = raw_rev.downcase
    review_pos ||= determine_POS(reviewVertex).strip
    rev_token = "not" if rev_token == "n't"
    # is_frequent_word also strips brackets/quotes from rev_token in place.
    next if is_frequent_word(rev_token)

    rev_stem = find_stem_word(rev_token, speller)
    rev_gloss, rev_syn, rev_hyper, rev_hypo, rev_ant =
      get_relations_for_review_submission_tokens(rev_token, rev_stem, review_pos)

    subm_words.each do |raw_sub|
      sub_token = raw_sub.downcase
      subm_pos ||= determine_POS(submVertex).strip
      sub_token = "not" if sub_token == "n't"
      next if is_frequent_word(sub_token)

      subm_stem = find_stem_word(sub_token, speller)
      subm_gloss, subm_syn, subm_hyper, subm_hypo, subm_ant =
        get_relations_for_review_submission_tokens(sub_token, subm_stem, subm_pos)

      # Checks run from the strongest to the weakest degree of semantic relatedness.
      if sub_token.casecmp(rev_token) == 0 || subm_stem.casecmp(rev_stem) == 0
        @match += same_state ? EXACT : NEGEXACT
        @count += 1
        next
      end

      # check_match updates @match/@count itself and returns true on a hit.
      next if check_match(rev_token, sub_token, rev_syn, subm_syn, rev_stem, subm_stem,
                          review_state, subm_state, SYNONYM, ANTONYM)
      next if check_match(rev_token, sub_token, rev_ant, subm_ant, rev_stem, subm_stem,
                          review_state, subm_state, ANTONYM, SYNONYM)
      next if check_match(rev_token, sub_token, rev_hyper, subm_hyper, rev_stem, subm_stem,
                          review_state, subm_state, HYPERNYM, NEGHYPERNYM)
      next if check_match(rev_token, sub_token, rev_hypo, subm_hypo, rev_stem, subm_stem,
                          review_state, subm_state, HYPONYM, NEGHYPONYM)

      # Definition overlap: one side's definition text mentions the other side's token or stem.
      if gloss_mentions?(rev_gloss, sub_token, subm_stem) || gloss_mentions?(subm_gloss, rev_token, rev_stem)
        @match += same_state ? OVERLAPDEFIN : NEGOVERLAPDEFIN
        @count += 1
        next
      end

      @match += NOMATCH
      @count += 1
    end
  end

  return NOMATCH unless @count > 0

  (@match.to_f / @count.to_f).round
end

# Helper for compare_strings: true when the first entry of +gloss+ contains +token+ or +stem+.
def gloss_mentions?(gloss, token, stem)
  return false if gloss.nil? || gloss[0].nil? || token.nil? || stem.nil?

  gloss[0].include?(token) || gloss[0].include?(stem)
end
#determine_POS(vert) ⇒ Object
determine_POS - method helps identify the POS tag (for the wordnet lexicon) for a certain word
351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 |
# File 'lib/automated_metareview/wordnet_based_similarity.rb', line 351
# Maps the vertex's POS tag string (vert.pos_tag) to a one-letter WordNet
# part-of-speech code: "n" (noun), "a" (adjective), "v" (verb) or "r" (adverb).
# Rules are tried top to bottom and the first one with a marker contained in
# the tag wins; tags matching no rule default to "n".
def determine_POS(vert)
  tag = vert.pos_tag
  rules = [
    [%w[CD NN PR IN EX WP], "n"],
    [%w[JJ], "a"],
    [%w[TO VB MD], "v"],
    [%w[RB], "r"]
  ]
  rules.each do |markers, code|
    return code if markers.any? { |marker| tag.include?(marker) }
  end
  "n"
end
#extract_definition(glosses) ⇒ Object
This method is used to extract definitions for the words (since glosses contain both definitions and examples).
glosses - string containing the gloss of the synset
415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 |
# File 'lib/automated_metareview/wordnet_based_similarity.rb', line 415
# Strips the example sentences out of a gloss.
#
# glosses - gloss string whose parts are separated by ";"
#
# Parts containing a double quote are treated as examples and dropped; the
# remaining parts are joined with a single space (parts are not trimmed, so
# their own leading whitespace is kept). Returns "" when nothing remains.
def extract_definition(glosses)
  glosses.split(";").inject("") do |definition, fragment|
    if fragment.include?('"')
      definition
    elsif definition.empty?
      fragment
    else
      definition + " " + fragment
    end
  end
end
#find_stem_word(word, speller) ⇒ Object
find_stem_word - stems the word and checks whether the stem is correctly spelled; if it is not, it returns a correctly spelled word suggested by the spellchecker.
It generates the nearest stem; since no context information is involved, the quality of the stems may not be great!
394 395 396 397 398 399 400 401 402 403 404 405 406 407 |
# File 'lib/automated_metareview/wordnet_based_similarity.rb', line 394
# Stems +word+ (via String#stem) and, while the spell checker rejects the
# current candidate, replaces it with the checker's first suggestion.
#
# word    - token to stem; must respond to #stem
# speller - object responding to #check(word) and #suggest(word)
#
# The loop stops when the candidate passes the check, when there is no
# suggestion, or when a suggestion repeats an earlier candidate. The last case
# guards against suggestions that cycle among words which all fail the check,
# which would otherwise loop forever.
# Returns the last candidate reached.
def find_stem_word(word, speller)
  correct = word.stem
  tried = [correct]
  until speller.check(correct)
    suggestion = speller.suggest(correct).first
    break if suggestion.nil? || tried.include?(suggestion)

    tried << suggestion
    correct = suggestion
  end
  correct
end
#get_relations_for_review_submission_tokens(token, stem, pos) ⇒ Object
This method fetches the synonyms, hypernyms, hyponyms and other relations for the ‘token’ and its stem ‘stem’.
This is done for both review and submission tokens/stems.
It returns a double dimensional array, where each element is an array of synonyms, hypernyms etc.
200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 |
# File 'lib/automated_metareview/wordnet_based_similarity.rb', line 200
# Looks up +token+ in WordNet (falling back to +stem+ when the token yields no
# lemmas) and collects the related words of the lemma whose POS equals +pos+.
#
# token - the word to look up
# stem  - stemmed form of the word, used only as a fallback lookup key
# pos   - POS code compared against each lemma's #pos
#
# Returns a five-element array:
#   [definitions, synonyms, hypernyms, hyponyms, antonyms]
# For every synset, each relation list gains either that relation's words or a
# single nil (no related synsets, or the lookup raised). All five lists are
# empty when no lemma with the requested POS exists.
def get_relations_for_review_submission_tokens(token, stem, pos)
  lemmas = WordNet::WordNetDB.find(token)
  # Fall back to the stem for "nothing found" in either form (nil or empty list).
  lemmas = WordNet::WordNetDB.find(stem) if lemmas.nil? || lemmas.empty?

  # Select the lemma corresponding to the token's POS; Array() tolerates a nil lookup result.
  lemma = Array(lemmas).find { |candidate| candidate.pos == pos }

  def_arr = []
  syn_arr = []
  hyper_arr = []
  hypo_arr = []
  anto_arr = []

  if lemma && !lemma.synsets.nil?
    lemma.synsets.each do |synset|
      # Definitions of all synsets are concatenated into def_arr[0].
      # NOTE(review): if the first synset has a nil gloss, def_arr[0] stays nil and
      # later definitions are appended as separate entries - kept as in the original.
      gloss = synset.gloss
      if gloss.nil?
        def_arr << nil
      elsif def_arr[0].nil?
        def_arr << extract_definition(gloss)
      else
        def_arr[0] = def_arr[0] + " " + extract_definition(gloss)
      end

      # Relation keys as used by this class; "@", "~" and "!" are presumably the WordNet
      # pointer symbols for hypernym, hyponym and antonym, "&" for "similar to" - TODO confirm.
      append_relation_words(syn_arr, synset, "&")
      append_relation_words(hyper_arr, synset, "@")
      append_relation_words(hypo_arr, synset, "~")
      append_relation_words(anto_arr, synset, "!")
    end
  end

  [def_arr, syn_arr, hyper_arr, hypo_arr, anto_arr]
end

# Helper: appends the words of every synset related to +synset+ via +pointer_symbol+
# to +words+, or a single nil when there are none or the lookup raises.
def append_relation_words(words, synset, pointer_symbol)
  related = synset.get_relation(pointer_symbol)
  if related.nil? || related.length == 0
    words << nil
  else
    related.each { |related_synset| words.concat(related_synset.words) }
  end
rescue
  # Best effort, as in the original: some synsets raise during relation lookup
  # (likely due to the dictionary files in use), which is recorded as "no relation".
  words << nil
end
#is_frequent_word(word) ⇒ Object
is_frequent_word - method checks to see if the given word is a frequent word
372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 |
# File 'lib/automated_metareview/wordnet_based_similarity.rb', line 372
# Tells whether +word+ is listed in FREQUENT_WORDS or CLOSED_CLASS_WORDS.
#
# Note: the characters ( ) [ ] and " are removed from +word+ IN PLACE before
# the lookup, so the caller's string is modified as a side effect.
# Returns true or false.
def is_frequent_word(word)
  # gsub! returns nil when nothing was replaced; only its in-place effect is used.
  ["(", ")", "[", "]", "\""].each { |symbol| word.gsub!(symbol, "") }

  listed = FREQUENT_WORDS.include?(word) || CLOSED_CLASS_WORDS.include?(word)
  listed ? true : false
end
#overlap(def1, def2, speller) ⇒ Object
434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 |
# File 'lib/automated_metareview/wordnet_based_similarity.rb', line 434
# Counts token overlaps between two lists of definitions.
#
# def1, def2 - lists of definition strings; nil entries are skipped
# speller    - spell checker handed through to find_stem_word
#
# Each definition is cut at its first ";" and split on whitespace. A pair of
# tokens counts as an overlap when the tokens or their stems are equal ignoring
# case and neither the def1 token nor its stem is a frequent word.
#
# Differences from the earlier implementation:
#   * the input lists and their strings are no longer modified;
#   * the cut excludes the ";" itself, so the last token is not left as "word;";
#   * is_frequent_word is called on self rather than on a new instance;
#   * def2 tokens are stemmed once instead of once per def1 token.
# NOTE(review): double quotes are blanked out in def1 only, not in def2, as in
# the original - confirm whether that asymmetry is intended.
#
# Returns the number of overlapping token pairs.
def overlap(def1, def2, speller)
  definition_only = lambda do |text|
    text.include?(";") ? text[0...text.index(";")] : text
  end

  first_defs = def1.reject(&:nil?).map { |text| definition_only.call(text.gsub("\"", " ")) }
  second_tokens = def2.reject(&:nil?).map do |text|
    definition_only.call(text).split(" ").map { |tok2| [tok2, find_stem_word(tok2, speller)] }
  end

  num_overlap = 0
  first_defs.each do |definition|
    second_tokens.each do |pairs|
      # Tokens are split afresh for every definition pair because is_frequent_word
      # strips brackets/quotes from its argument in place.
      definition.split(" ").each do |tok1|
        tok1stem = find_stem_word(tok1, speller)
        pairs.each do |tok2, tok2stem|
          same_word = tok1.downcase == tok2.downcase || tok1stem.downcase == tok2stem.downcase
          num_overlap += 1 if same_word && !is_frequent_word(tok1) && !is_frequent_word(tok1stem)
        end
      end
    end
  end
  num_overlap
end