Class: ExtractPatterns

Inherits:
Object
  • Object
show all
Defined in:
lib/extractpatterns.rb

Instance Method Summary collapse

Constructor Details

#initialize(input, fields, match_name) ⇒ ExtractPatterns

Returns a new instance of ExtractPatterns.



6
7
8
9
10
11
# File 'lib/extractpatterns.rb', line 6

# Sets up an extractor over a JSON document.
#
# @param input [String] JSON text; it is passed through fixEncode and then
#   parsed with JSON.parse (search_fields iterates the parsed value with
#   +each+ and indexes each element by field name)
# @param fields [Array] names of the fields that search_fields scans
# @param match_name [String] key under which search_fields stores the matches
def initialize(input, fields, match_name)
  @match_name = match_name
  @fields = fields
  # Filled by search_fields with the processed items
  @output = []
  @input = JSON.parse(fixEncode(input))
end

Instance Method Details

#comma_list_matches(value) ⇒ Object

Split the value on commas and return the short (at most two-word) list items as matches



14
15
16
17
18
19
20
21
22
23
24
25
26
# File 'lib/extractpatterns.rb', line 14

# Extracts short items from a comma-separated list.
#
# The value is split on commas; items longer than two whitespace-separated
# words are discarded (the count is taken before cleaning, so a leading
# "and" counts as a word). Periods and standalone "and" tokens are removed
# from the remaining items, which are then trimmed.
#
# @param value [String, nil] text to split
# @return [Array<String>] cleaned items; empty when +value+ is nil/false
def comma_list_matches(value)
  return [] unless value

  # Keep only items of at most two words
  short_items = value.split(",").reject { |item| item.split(" ").length > 2 }

  cleaned = short_items.map do |item|
    # Remove "and" only when it is a whole whitespace-delimited token, so
    # words that merely contain it ("sand box", "red android") stay intact.
    item.delete(".").gsub(/(?<!\S)and(?!\S)/, "").strip
  end

  # An item that was only "and", a period, or whitespace is not a match
  cleaned.reject(&:empty?)
end

#find_known_terms(item, field, extract_list) ⇒ Object

Extract set terms



44
45
46
47
48
# File 'lib/extractpatterns.rb', line 44

# Looks up terms from a known-terms file in a single field of one item.
#
# The item is wrapped in a one-element JSON array and handed to a
# TermExtractor, which is asked to store its matches under
# "extracted_codewords".
# NOTE(review): TermExtractor is defined outside this file; the meaning of
# the ["codeword"] and "case_sensitive" arguments should be confirmed there.
#
# @param item [Object] a single record, serialisable with JSON.pretty_generate
# @param field [String] name of the field to search
# @param extract_list [String] path of the terms file
# @return [Object] the "extracted_codewords" value of the first output record
def find_known_terms(item, field, extract_list)
  extractor = TermExtractor.new(JSON.pretty_generate([item]), [field], "extracted_codewords")
  known_terms = fixEncode(File.read(extract_list))
  extractor.extractSetTerms(known_terms, ["codeword"], "case_sensitive")

  first_record = JSON.parse(extractor.getAllOutput).first
  first_record["extracted_codewords"]
end

#fixEncode(str) ⇒ Object

Fix encoding errors



51
52
53
54
55
56
57
# File 'lib/extractpatterns.rb', line 51

# Returns +str+ as valid UTF-8 text.
#
# Strings whose bytes already form valid UTF-8 are returned unchanged in
# content (as a UTF-8 tagged copy). Re-packing every byte of such a string
# as its own code point would double-encode multibyte characters
# ("é" would become "Ã©"), so that conversion is applied only to strings
# whose bytes are NOT valid UTF-8; in that case each byte is treated as a
# single code point (i.e. a Latin-1 interpretation) and re-encoded.
#
# @param str [Object] value to repair; non-strings are returned untouched
# @return [Object] a UTF-8 string, or +str+ itself when it is not a String
def fixEncode(str)
  return str unless str.is_a?(String)

  utf8_view = str.dup.force_encoding(Encoding::UTF_8)
  return utf8_view if utf8_view.valid_encoding?

  # Byte-per-character fallback for data that is not valid UTF-8
  str.unpack('C*').pack('U*')
end

#get_allcaps(value, length) ⇒ Object

Get runs of ALLCAPS words that are at least the given number of characters long



29
30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/extractpatterns.rb', line 29

# Finds runs of ALLCAPS words (e.g. "XKEYSCORE", "TOP SECRET").
#
# A run starts and ends with a capital letter and may contain whitespace
# between capitalised words. The minimum length applies to the run itself:
# whitespace around a run no longer counts toward it, so a short acronym is
# not accepted merely because it is surrounded by spaces, and whitespace-only
# gaps between words never produce empty results.
#
# @param value [String, nil] text to scan
# @param length [Integer, String, nil] minimum number of characters
#   (internal whitespace included) a run must have
# @return [Array<String>] matching runs of 100 characters or fewer, in order
#   of appearance; empty when either argument is nil/false
def get_allcaps(value, length)
  return [] unless length && value

  min_length = Integer(length)

  # Word boundaries keep capitals embedded in mixed-case words ("ABc") out
  runs = value.scan(/\b[A-Z](?:[A-Z\s]*[A-Z])?\b/)

  # Enforce the minimum and discard implausibly long runs
  runs.select { |run| run.length.between?(min_length, 100) }
end

#normalize_results(extracted_raw, synonym_list) ⇒ Object

Normalize results: replace terms that match a known synonym with the synonym's canonical name, then deduplicate



60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# File 'lib/extractpatterns.rb', line 60

# Replaces extracted terms with their canonical names and removes duplicates.
#
# The synonym file is parsed as JSON and iterated as key/value pairs; each
# value's "codeword" entry is iterated as a list of words. A term that equals
# one of those words (ignoring case) is removed from the result and the
# pair's key is appended in its place.
#
# @param extracted_raw [Array<String>] terms to normalise (not modified)
# @param synonym_list [String] path of the JSON synonym file
# @return [Array<String>] normalised terms without duplicates
def normalize_results(extracted_raw, synonym_list)
  synonym_map = JSON.parse(fixEncode(File.read(synonym_list)))
  normalized = extracted_raw.dup

  extracted_raw.each do |term|
    synonym_map.each do |canonical, entry|
      entry["codeword"].each do |known_word|
        next unless known_word.downcase == term.downcase

        # Swap the raw term for the canonical name it is a synonym of
        normalized.delete(term)
        normalized.push(canonical)
      end
    end
  end

  normalized.uniq
end

#ranked_hash_output(results) ⇒ Object

Count how often each match occurs across all results and return [match, count] pairs sorted by ascending count



107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# File 'lib/extractpatterns.rb', line 107

# Counts how often each match occurs across all results.
#
# Matches are read from the field named by @match_name (the field
# search_fields writes to). When @match_name is unset, or an item does not
# have that field, the legacy "tools_mentioned" field is used instead.
# Items with neither field are skipped rather than raising.
#
# @param results [Array<Hash>] items carrying a list of matches
# @return [Array<Array>] [match, count] pairs sorted by ascending count
#   (the order of pairs with equal counts is not specified)
def ranked_hash_output(results)
  match_field = @match_name || "tools_mentioned"
  counts = Hash.new(0)

  results.each do |item|
    matches = item[match_field] || item["tools_mentioned"]
    Array(matches).each { |match| counts[match] += 1 }
  end

  counts.sort_by { |_match, count| count }
end

#search_fields(allcaps_length, extract_list, merge_field) ⇒ Object

Go through every item in the parsed JSON input and extract matches from each of the configured fields



83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# File 'lib/extractpatterns.rb', line 83

# Extracts matches from every configured field of every input item.
#
# For each item, the comma-list matches and ALLCAPS matches of each field in
# @fields are combined with the item's existing +merge_field+ values,
# normalised through normalize_results, and stored (without duplicates)
# under @match_name. Items are modified in place and appended to @output.
#
# The known-terms lookup (find_known_terms) is not invoked here; it was
# already disabled in this method.
#
# @param allcaps_length [Integer] minimum length passed to get_allcaps
# @param extract_list [String] synonym file path passed to normalize_results
# @param merge_field [String, nil] field whose existing values are merged in
# @return [Array] @output, including items added by earlier calls
def search_fields(allcaps_length, extract_list, merge_field)
  @input.each do |item|
    # Read the values to merge BEFORE resetting the match field: when
    # merge_field names the same field as @match_name, resetting first
    # would silently discard them. Array() also accepts a missing field
    # (nil) or a single non-array value.
    merge_results = Array(item[merge_field])
    item[@match_name] = []

    @fields.each do |field|
      list_results = comma_list_matches(item[field])
      allcaps_results = get_allcaps(item[field], allcaps_length)

      # Combine the candidates, normalise them and add them to the item
      candidates = allcaps_results | list_results | merge_results
      item[@match_name] |= normalize_results(candidates, extract_list)
    end

    # Push updated item out
    @output.push(item)
  end

  @output
end