Class: ExtractPatterns
- Inherits:
-
Object
- Object
- ExtractPatterns
- Defined in:
- lib/extractpatterns.rb
Instance Method Summary collapse
-
#comma_list_matches(value) ⇒ Object
Split to find matches.
-
#find_known_terms(item, field, extract_list) ⇒ Object
Extract set terms.
-
#fixEncode(str) ⇒ Object
Fix encoding errors.
-
#get_allcaps(value, length) ⇒ Object
Get runs of ALLCAPS words that are at least a given length.
-
#initialize(input, fields, match_name) ⇒ ExtractPatterns
constructor
A new instance of ExtractPatterns.
-
#normalize_results(extracted_raw, synonym_list) ⇒ Object
Normalize and match synonyms and deduplicate.
-
#ranked_hash_output(results) ⇒ Object
Return the match counts as [match, count] pairs, sorted by ascending count.
-
#search_fields(allcaps_length, extract_list, merge_field) ⇒ Object
Go through all items in JSON and fields to search.
Constructor Details
#initialize(input, fields, match_name) ⇒ ExtractPatterns
Returns a new instance of ExtractPatterns.
6 7 8 9 10 11 |
# File 'lib/extractpatterns.rb', line 6
# Set up a new extractor.
#
# input      - JSON text; it is parsed with JSON.parse and kept in @input.
# fields     - kept in @fields (search_fields iterates it as field names).
# match_name - kept in @match_name (search_fields stores matches under it).
#
# @output starts out as an empty array.
def initialize(input, fields, match_name)
  @input, @fields, @match_name = JSON.parse(input), fields, match_name
  @output = []
end
Instance Method Details
#comma_list_matches(value) ⇒ Object
Split to find matches
14 15 16 17 18 19 20 21 22 23 24 25 26 |
# File 'lib/extractpatterns.rb', line 14
# Split a comma-separated string and return its short items as candidate
# matches.
#
# value - the text to split, or nil.
#
# Items longer than two whitespace-separated words are discarded. From the
# remaining items the standalone word "and" and all periods are removed, and
# surrounding whitespace is trimmed. Items that end up empty are dropped.
#
# Returns an Array of Strings ([] when value is nil).
def comma_list_matches(value)
  return [] unless value

  value.split(",")
       .reject { |item| item.split(" ").length > 2 }
       # \b keeps the removal to the whole word "and"; a plain substring
       # replacement would also eat the "and " inside e.g. "Command line".
       .map { |item| item.gsub(/\band\b/, "").delete(".").strip }
       .reject(&:empty?)
end
#find_known_terms(item, field, extract_list) ⇒ Object
Extract set terms
44 45 46 47 48 |
# File 'lib/extractpatterns.rb', line 44
# Run TermExtractor over a single item/field and return what it stored under
# "extracted_codewords".
#
# item         - the record to search; it is serialised as a one-element JSON
#                array for TermExtractor.
# field        - the field of the item to search.
# extract_list - path of the term-list file; it is read and passed through
#                fixEncode before being handed to extractSetTerms.
#
# NOTE(review): the return type depends on TermExtractor, which is not defined
# here - presumably an Array of matched terms; confirm against that library.
def find_known_terms(item, field, extract_list)
  item_json = JSON.pretty_generate([item])
  extractor = TermExtractor.new(item_json, [field], "extracted_codewords")

  term_list = fixEncode(File.read(extract_list))
  extractor.extractSetTerms(term_list, ["codeword"], "case_sensitive")

  processed_items = JSON.parse(extractor.getAllOutput)
  processed_items.first["extracted_codewords"]
end
#fixEncode(str) ⇒ Object
Fix encoding errors
51 52 53 54 55 56 57 |
# File 'lib/extractpatterns.rb', line 51
# Return str as a UTF-8 string, repairing bytes that are not valid UTF-8.
#
# str - the value to fix. Anything that is not a String is returned untouched.
#
# Bytes that already form valid UTF-8 are kept as they are (only the encoding
# tag of the returned copy is set to UTF-8). Otherwise every byte is
# reinterpreted as the Unicode code point with the same number, as before.
# Applying that reinterpretation to text that is already valid UTF-8 would
# double-encode every multi-byte character, which is why it is now the
# fallback rather than the default.
#
# Returns a new UTF-8 String, or str itself when it is not a String.
def fixEncode(str)
  return str unless str.is_a?(String)

  utf8 = str.dup.force_encoding(Encoding::UTF_8)
  return utf8 if utf8.valid_encoding?

  str.unpack('C*').pack('U*')
end
#get_allcaps(value, length) ⇒ Object
Get runs of ALLCAPS words that are at least a given length
29 30 31 32 33 34 35 36 37 38 39 40 41 |
# File 'lib/extractpatterns.rb', line 29
# Find runs of ALLCAPS words (optionally several words separated by
# whitespace) that are at least `length` characters long.
#
# value  - the text to scan, or nil.
# length - minimum length of a match; converted with Integer(), so a value
#          that is not a whole number raises ArgumentError/TypeError instead
#          of being spliced into the regular expression.
#
# The pattern also consumes the whitespace around a run, so the raw match can
# be longer than the word it contains. The minimum length is therefore
# checked again after trimming; this also discards whitespace-only runs,
# which would otherwise be returned as empty strings.
#
# Returns an Array of trimmed Strings ([] when value or length is nil).
def get_allcaps(value, length)
  return [] unless length && value

  min_length = Integer(length)
  max_length = 100 # raw matches longer than this are discarded
  pattern = /\b(?:[A-Z]|\s){#{min_length},}\b/

  value.scan(pattern)
       .reject { |match| match.length > max_length }
       .map(&:strip)
       .reject { |term| term.empty? || term.length < min_length }
end
#normalize_results(extracted_raw, synonym_list) ⇒ Object
Normalize and match synonyms and deduplicate
60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
# File 'lib/extractpatterns.rb', line 60
# Replace extracted terms with their canonical names and deduplicate.
#
# extracted_raw - Array of extracted Strings.
# synonym_list  - path of a JSON file that parses to a Hash of
#                 canonical name => { "codeword" => [synonym, ...] }.
#
# A term that equals one of the synonyms (ignoring case) is replaced by the
# canonical name(s) of every entry listing it. Terms without a synonym are
# kept unchanged and come first in the result; canonical names follow in the
# order they were matched.
#
# Unlike the previous delete-and-push loop, a canonical name that has already
# been added is never removed again by a later term that happens to equal it,
# and entries without a "codeword" list are skipped instead of raising.
#
# Returns a new Array without duplicates; extracted_raw is not modified.
def normalize_results(extracted_raw, synonym_list)
  synonyms = JSON.parse(fixEncode(File.read(synonym_list)))

  # Downcased synonym => canonical names, in synonym-file order. Built once so
  # each extracted term needs a single lookup.
  canonical_names = {}
  synonyms.each do |name, details|
    Array(details["codeword"]).each do |word|
      (canonical_names[word.downcase] ||= []) << name
    end
  end

  unmatched = []
  normalized = []
  extracted_raw.each do |extracted|
    names = canonical_names[extracted.downcase]
    if names
      normalized.concat(names)
    else
      unmatched << extracted
    end
  end

  (unmatched + normalized).uniq
end
#ranked_hash_output(results) ⇒ Object
Return the match counts as [match, count] pairs, sorted by ascending count
107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
# File 'lib/extractpatterns.rb', line 107
# Count how often each match occurs across all results.
#
# results - Array of item Hashes. Matches are read from the key named by
#           @match_name (the key search_fields writes to); when an item has
#           nothing under that key, the previously hard-coded
#           "tools_mentioned" key is used instead. Items with neither key
#           contribute nothing.
#
# Returns an Array of [match, count] pairs sorted by ascending count (the
# most frequent match is last). The order of pairs with equal counts is not
# specified.
def ranked_hash_output(results)
  counts = Hash.new(0)

  results.each do |item|
    matches = (@match_name && item[@match_name]) || item["tools_mentioned"]
    Array(matches).each { |match| counts[match] += 1 }
  end

  counts.sort_by { |_match, count| count }
end
#search_fields(allcaps_length, extract_list, merge_field) ⇒ Object
Go through all items in JSON and fields to search
83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
# File 'lib/extractpatterns.rb', line 83
# Extract matches from every configured field of every input item.
#
# allcaps_length - minimum length passed to get_allcaps.
# extract_list   - path of the term/synonym file; passed to both
#                  find_known_terms and normalize_results.
# merge_field    - key of an item whose existing values are merged into the
#                  matches; a missing value counts as no values and a single
#                  non-array value is treated as a one-element list.
#
# For each item, item[@match_name] is reset to an empty array and then filled
# with the normalized union of the comma-list matches, the ALLCAPS matches,
# the merge-field values and the known terms found in each field.
#
# NOTE(review): the known-term results used to be computed and then thrown
# away; they are now part of the union, as the original inline comment
# describes. Confirm this matches the intended output.
#
# Each processed item is appended to @output, which is returned.
def search_fields(allcaps_length, extract_list, merge_field)
  @input.each do |item|
    item[@match_name] = []

    @fields.each do |field|
      list_results = comma_list_matches(item[field])
      allcaps_results = get_allcaps(item[field], allcaps_length)
      merge_results = Array(item[merge_field])
      # Array() guards against find_known_terms returning nil.
      known_terms_results = Array(find_known_terms(item, field, extract_list))

      candidates = allcaps_results | list_results | merge_results | known_terms_results
      item[@match_name] |= normalize_results(candidates, extract_list)
    end

    @output.push(item)
  end

  @output
end