Module: Licensee::ContentHelper

Included in:
License, Project::LicenseFile
Defined in:
lib/licensee/content_helper.rb

Constant Summary collapse

# Digest implementation used by #hash to fingerprint the normalized content.
DIGEST =
Digest::SHA1
# Matches a line consisting of "end of terms and conditions" (any letter case,
# optional surrounding whitespace). #content_normalized keeps only the text
# that precedes the first such line.
END_OF_TERMS_REGEX =
/^\s*end of terms and conditions\s*$/i

Instance Method Summary collapse

Instance Method Details

#content_normalized ⇒ Object

Content with copyright header and linebreaks removed



47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/licensee/content_helper.rb', line 47

# Normalized form of +content+, memoized in @content_normalized.
#
# The content is downcased and stripped, then passed through the
# strip_title (repeatedly, for as long as title_regex matches),
# strip_version, strip_copyright and strip_hrs helpers, which are defined
# elsewhere. Everything from the first END_OF_TERMS_REGEX match onwards is
# dropped, and the remainder is handed to strip_whitespace.
#
# @return the result of strip_whitespace, or nil when +content+ is falsy
def content_normalized
  return if !content
  return @content_normalized if @content_normalized

  text = content.downcase.strip
  # A document may carry more than one leading title; peel them off one by one.
  text = strip_title(text) until (text =~ title_regex).nil?
  text = strip_hrs(strip_copyright(strip_version(text)))
  # Keep only what precedes the "end of terms and conditions" marker line.
  terms = text.partition(END_OF_TERMS_REGEX).first
  @content_normalized = strip_whitespace(terms)
end

#hash ⇒ Object

SHA1 of the normalized content



42
43
44
# File 'lib/licensee/content_helper.rb', line 42

# Hex digest (computed with DIGEST) of the normalized content, memoized in
# @hash.
#
# NOTE(review): this shadows Object#hash, which Ruby expects to return an
# Integer, while a hex digest is a String. Confirm that instances of the
# including classes are never used as Hash keys or Set members.
def hash
  return @hash if @hash

  @hash = DIGEST.hexdigest(content_normalized)
end

#length ⇒ Object

Number of characters in the normalized content



17
18
19
20
# File 'lib/licensee/content_helper.rb', line 17

# Number of characters in the normalized content.
#
# @return [Integer] 0 when content_normalized is nil
def length
  normalized = content_normalized
  normalized ? normalized.length : 0
end

#length_delta(other) ⇒ Object

Given another license or project file, calculates the difference in length



29
30
31
# File 'lib/licensee/content_helper.rb', line 29

# Absolute difference between this object's length and +other+'s length.
#
# @param other any object responding to #length (e.g. another license or
#   project file)
# @return the non-negative difference in length
def length_delta(other)
  own_length = length
  their_length = other.length
  (own_length - their_length).abs
end

#max_delta ⇒ Object

Number of characters that could be added/removed to still be considered a potential match



24
25
26
# File 'lib/licensee/content_helper.rb', line 24

# Number of characters that could be added or removed while still being
# considered a potential match: the length scaled by
# Licensee.inverse_confidence_threshold, truncated to an Integer.
def max_delta
  allowance = length * Licensee.inverse_confidence_threshold
  allowance.to_i
end

#similarity(other) ⇒ Object

Given another license or project file, calculates the similarity as a percentage of words in common



35
36
37
38
39
# File 'lib/licensee/content_helper.rb', line 35

# Similarity between this object and +other+ as a percentage of words in
# common: 100 * 2 * |A & B| / (|A| + |B|), where A and B are the two wordsets
# (the Sørensen–Dice coefficient expressed as a percentage).
#
# @param other any object responding to #wordset (e.g. another license or
#   project file)
# @return [Float] a value between 0.0 and 100.0; 0.0 when either wordset is
#   nil (no content) or when both wordsets are empty
def similarity(other)
  own_words = other_words = nil
  own_words = wordset
  other_words = other.wordset
  # wordset is nil when there is no content; nothing can be in common then.
  return 0.0 unless own_words && other_words

  total = own_words.size + other_words.size
  # Two empty wordsets would otherwise yield 0.0 / 0 => NaN.
  return 0.0 if total.zero?

  overlap = (own_words & other_words).size
  100.0 * (overlap * 2.0 / total)
end

#wordset ⇒ Object

A set of each word in the license, without duplicates



10
11
12
13
14
# File 'lib/licensee/content_helper.rb', line 10

# Set of the distinct words in the normalized content, memoized in @wordset.
# A "word" is a run of word characters and/or apostrophes.
#
# @return [Set, nil] nil when content_normalized is nil
def wordset
  return @wordset if @wordset

  normalized = content_normalized
  @wordset = (normalized.scan(/[\w']+/).to_set if normalized)
end