Class: LicenseMatcher::TFRubyMatcher

Inherits:
Object
  • Object
show all
Includes:
Preprocess
Defined in:
lib/license_matcher/tf_ruby_matcher.rb

Constant Summary collapse

DEFAULT_INDEX_PATH =
'data/index.msgpack'
DEFAULT_MIN_CONFIDENCE =
0.9
A_DOC_ROW =

An array index used to find the rows of indexed documents.

3

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from Preprocess

#clean_html, #parse_html, #preprocess_html, #preprocess_text, #safe_encode

Constructor Details

#initialize(index_path = DEFAULT_INDEX_PATH) ⇒ TFRubyMatcher

Returns a new instance of TFRubyMatcher.



16
17
18
19
20
21
22
23
24
# File 'lib/license_matcher/tf_ruby_matcher.rb', line 16

# Loads the license corpus from the pre-built index and builds the BM25
# model that match_text compares documents against.
#
# @param index_path [String] path to the serialized index file
def initialize(index_path = DEFAULT_INDEX_PATH)
  # read_corpus returns the pair [spdx_ids, documents]
  @spdx_ids, @corpus = read_corpus(index_path)
  @model = TfIdfSimilarity::BM25Model.new(@corpus, :library => :narray)

  true
end

Instance Attribute Details

#corpusObject (readonly)

Returns the value of attribute corpus.



10
11
12
# File 'lib/license_matcher/tf_ruby_matcher.rb', line 10

# Read-only accessor for @corpus, which initialize fills with the
# documents returned by read_corpus.
def corpus
  @corpus
end

#modelObject (readonly)

Returns the value of attribute model.



10
11
12
# File 'lib/license_matcher/tf_ruby_matcher.rb', line 10

# Read-only accessor for @model, the TfIdfSimilarity::BM25Model that
# initialize builds from the corpus.
def model
  @model
end

#spdx_idsObject (readonly)

Returns the value of attribute spdx_ids.



10
11
12
# File 'lib/license_matcher/tf_ruby_matcher.rb', line 10

# Read-only accessor for @spdx_ids, the list of license ids that
# initialize receives from read_corpus.
def spdx_ids
  @spdx_ids
end

Instance Method Details

#cos_sim(mat1, mat2) ⇒ Object

Calculates the cosine similarity between two TF-IDF vectors.



76
77
78
79
80
81
# File 'lib/license_matcher/tf_ruby_matcher.rb', line 76

# Cosine similarity of two vectors: their dot product divided by the
# product of their Euclidean norms.
#
# Both arguments must support element-wise `*`, `**` and `sum`.
#
# @return [Numeric] the similarity, or 0.0 when the product of the norms
#   is not positive (e.g. one of the vectors is all zeros)
def cos_sim(mat1, mat2)
  dot_product = (mat1 * mat2).sum
  magnitude1 = Math.sqrt((mat1 ** 2).sum)
  magnitude2 = Math.sqrt((mat2 ** 2).sum)
  denominator = magnitude1 * magnitude2

  # avoid dividing by zero for empty / all-zero vectors
  return 0.0 unless denominator > 0

  dot_product / denominator
end

#doc_tfidf_matrix(doc) ⇒ Object

Transforms a document into the TF-IDF vector used for comparison.



60
61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/license_matcher/tf_ruby_matcher.rb', line 60

# Builds the TF-IDF weight vector of +doc+ over the model's vocabulary.
#
# One entry is produced per term in @model.terms, in that order.
#
# @param doc [#term_count] the document to vectorize
# @return [NArray] weights, 0.0 for terms the document does not contain
def doc_tfidf_matrix(doc)
  vocabulary = @model.terms

  weights = (0...vocabulary.size).map do |idx|
    term = vocabulary[idx]
    # only terms present in both the test doc and the license corpus get a score
    next 0.0 unless doc.term_count(term) > 0

    model.idf(term) * model.tf(doc, term)
  end

  NArray[*weights]
end

#match_html(html_text, min_confidence = DEFAULT_MIN_CONFIDENCE) ⇒ Object



54
55
56
# File 'lib/license_matcher/tf_ruby_matcher.rb', line 54

# Matches an HTML document against the license corpus: the markup is run
# through preprocess_html and the result is handed to match_text.
#
# @param html_text [String] raw HTML
# @param min_confidence [Float] threshold forwarded to match_text
# @return whatever match_text returns for the preprocessed text
def match_html(html_text, min_confidence = DEFAULT_MIN_CONFIDENCE)
  plain_text = preprocess_html(html_text)

  match_text(plain_text, min_confidence)
end

#match_text(text, min_confidence = DEFAULT_MIN_CONFIDENCE, is_processed_text = false) ⇒ Object

Matches the given text against the SPDX licenses and returns a Match object. Returns:

match - Match {label: String, score: float}


29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/license_matcher/tf_ruby_matcher.rb', line 29

# Finds the corpus license most similar to +text+.
#
# The text is turned into a TF-IDF vector and compared (cosine similarity)
# with every document row of the model's matrix; the best-scoring document
# wins if its score is strictly greater than +min_confidence+.
#
# @param text [String, nil] the text to classify
# @param min_confidence [Float] score the best candidate has to exceed
# @param is_processed_text [Boolean] pass true when +text+ has already been
#   run through preprocess_text
# @return [Match] Match(label, score) of the best license, or Match("", 0.0)
#   when nothing scores above +min_confidence+ or the corpus is empty
# @return [Array] an empty array when there is no text to match
#   (kept for backward compatibility with existing callers)
def match_text(text, min_confidence = DEFAULT_MIN_CONFIDENCE, is_processed_text = false)
  return [] if text.to_s.empty?

  text = preprocess_text(text) if is_processed_text == false
  # read_corpus treats a falsy preprocess_text result as "no content";
  # guard the same way here instead of building a Document from nothing.
  return [] if text.to_s.empty?

  no_match = Match.new("", 0.0)
  n_docs = @model.documents.size
  # With an empty corpus there is no candidate to look up at all.
  return no_match if n_docs.zero?

  test_doc = TfIdfSimilarity::Document.new(text, {:id => "test"})

  # NOTE(review): the model's matrix is read through instance_variable_get;
  # this relies on the model keeping it in @matrix.
  corpus_matrix = @model.instance_variable_get(:@matrix)
  query_vector = doc_tfidf_matrix(test_doc)

  scores = Array.new(n_docs) { |i| cos_sim(corpus_matrix[i, true], query_vector) }
  # a single pass is enough: only the top score is needed, not a full ordering
  best_score, doc_id = scores.each_with_index.max_by { |score, _idx| score }

  return no_match unless best_score.to_f > min_confidence

  Match.new(@model.documents[doc_id].id, best_score)
end

#read_corpus(index_path) ⇒ Object

Reads the content of the licenses from the pre-built index. NB: it is sensitive to changes in the Fosslim/Index serialization.



85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# File 'lib/license_matcher/tf_ruby_matcher.rb', line 85

# Reads the license documents out of the pre-built MessagePack index.
#
# The element at position A_DOC_ROW of the unpacked index is expected to
# hold the document rows; the second and third fields of each row are used
# as the SPDX id and the license text. Rows whose text preprocesses to a
# falsy value are skipped.
#
# @param index_path [String] path to the serialized index
# @return [Array(Array, Array<TfIdfSimilarity::Document>)] the SPDX ids and
#   the matching documents, in index order
def read_corpus(index_path)
  # MessagePack is a binary format: read the raw bytes so that no newline
  # or encoding translation is applied to the file contents.
  idx = MessagePack.unpack(File.binread(index_path))

  spdx_ids = []
  docs = []

  idx[A_DOC_ROW].to_a.each do |doc_row|
    _, spdx_id, content, _ = doc_row
    txt = preprocess_text(content)
    next unless txt

    spdx_ids << spdx_id
    docs << TfIdfSimilarity::Document.new(txt, :id => spdx_id)
  end

  [spdx_ids, docs]
end