Module: Licensee::ContentHelper

Included in:
License, Project::LicenseFile
Defined in:
lib/licensee/content_helper.rb

Constant Summary collapse

# Digest class whose hexdigest is taken over the normalized content in #hash.
DIGEST =
Digest::SHA1
# Case-insensitive match for a line consisting only of
# "end of terms and conditions" (surrounding whitespace allowed).
# content_normalized keeps only the text that precedes this line.
END_OF_TERMS_REGEX =
/^\s*end of terms and conditions\s*$/i
# Alternate title patterns, keyed by what appear to be license identifiers.
# NOTE(review): not referenced by the methods shown here; presumably used
# when building the title-matching regex. Confirm against the full source.
ALT_TITLE_REGEX =
{
  'bsd-2-clause'       => /bsd 2-clause( \"simplified\")? license/i,
  'bsd-3-clause'       => /bsd 3-clause( \"new\" or \"revised\")? license/i,
  'bsd-3-clause-clear' => /bsd 3-clause( clear)? license/i
}.freeze

Instance Method Summary collapse

Instance Method Details

#content_normalizedObject

Content without title, version, copyright, whitespace, or instructions



64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/licensee/content_helper.rb', line 64

# Returns the lower-cased content with title/version, copyright lines,
# horizontal rules, and anything from the "end of terms and conditions"
# line onward removed, then whitespace-stripped. Memoized.
# Returns nil when there is no content.
def content_normalized
  return unless content
  @content_normalized ||= begin
    text = content_without_title_and_version.downcase
    # There may be several copyright lines; peel them off one at a time.
    text = strip_copyright(text) while text =~ Matchers::Copyright::REGEX
    text = strip_hrs(text)
    # Keep only what precedes the end-of-terms marker (the whole text if absent).
    terms = text.partition(END_OF_TERMS_REGEX).first
    strip_whitespace(terms)
  end
end

#content_without_title_and_versionObject

Content with the title and version removed. The first line should normally be the attribution line. Used to dry up `content_normalized`, but we need the case-sensitive content with the attribution first to detect attribution in LicenseFile.



55
56
57
58
59
60
61
# File 'lib/licensee/content_helper.rb', line 55

# Returns the content with surrounding whitespace, every leading title
# match, and the version removed. Case is preserved. Memoized.
def content_without_title_and_version
  return @content_without_title_and_version if @content_without_title_and_version

  remaining = content.strip
  # A title may be repeated; keep stripping while one still matches.
  while remaining =~ title_regex
    remaining = strip_title(remaining)
  end
  @content_without_title_and_version = strip_version(remaining).strip
end

#hashObject

SHA1 of the normalized content



47
48
49
# File 'lib/licensee/content_helper.rb', line 47

# Hex digest (via DIGEST) of the normalized content. Memoized.
def hash
  return @hash if @hash

  @hash = DIGEST.hexdigest(content_normalized)
end

#lengthObject

Number of characters in the normalized content



22
23
24
25
# File 'lib/licensee/content_helper.rb', line 22

# Number of characters in the normalized content; 0 when there is none.
def length
  normalized = content_normalized
  normalized ? normalized.length : 0
end

#length_delta(other) ⇒ Object

Given another license or project file, calculates the difference in length



34
35
36
# File 'lib/licensee/content_helper.rb', line 34

# Absolute difference between this object's length and other's length.
# other: any object responding to #length.
def length_delta(other)
  difference = length - other.length
  difference.abs
end

#max_deltaObject

Number of characters that could be added/removed to still be considered a potential match



29
30
31
# File 'lib/licensee/content_helper.rb', line 29

# Length scaled by Licensee.inverse_confidence_threshold, truncated to an
# Integer: how many characters may differ for a file to remain a candidate.
def max_delta
  allowance = length * Licensee.inverse_confidence_threshold
  allowance.to_i
end

#similarity(other) ⇒ Object

Given another license or project file, calculates the similarity as a percentage of words in common



40
41
42
43
44
# File 'lib/licensee/content_helper.rb', line 40

# Given another license or project file, calculates the similarity as a
# percentage of words in common: 100 * 2 * |A & B| / (|A| + |B|).
#
# other: any object responding to #wordset.
#
# Returns a Float between 0.0 and 100.0. Returns 0.0 when either word set
# is nil (no normalized content) or when both are empty, instead of raising
# NoMethodError or producing NaN from a 0.0 / 0 division.
def similarity(other)
  own_words = wordset
  other_words = other.wordset
  return 0.0 if own_words.nil? || other_words.nil?

  total = own_words.size + other_words.size
  return 0.0 if total.zero?

  overlap = (own_words & other_words).size
  100.0 * (overlap * 2.0 / total)
end

#wordsetObject

A set of each word in the license, without duplicates



15
16
17
18
19
# File 'lib/licensee/content_helper.rb', line 15

# Set of the distinct words (runs of word characters and apostrophes) in
# the normalized content. Memoized. Returns nil when there is no
# normalized content.
def wordset
  @wordset ||= begin
    normalized = content_normalized
    normalized && normalized.scan(/[\w']+/).to_set
  end
end