Module: Licensee::ContentHelper

Included in:: License, Project::LicenseFile

Defined in:: lib/licensee/content_helper.rb

Constant Summary collapse

# Digest implementation used by #hash to fingerprint the normalized content.
DIGEST =

Digest::SHA1

# Matches an "end of terms and conditions" line (case-insensitive, surrounding
# whitespace allowed). #content_normalized keeps only the text that precedes
# the first such line.
END_OF_TERMS_REGEX =

/^\s*end of terms and conditions\s*$/i

# Case-insensitive patterns for alternate spellings of some license titles.
# NOTE(review): the keys look like license identifiers and this table is
# presumably consulted when building the title regex -- confirm against the
# caller, which is not shown here.
ALT_TITLE_REGEX =

{
  'bsd-2-clause'       => /bsd 2-clause( \"simplified\")? license/i,
  'bsd-3-clause'       => /bsd 3-clause( \"new\" or \"revised\")? license/i,
  'bsd-3-clause-clear' => /bsd 3-clause( clear)? license/i
}.freeze

Instance Method Summary collapse

#content_normalized ⇒ Object

Content without title, version, copyright, whitespace, or instructions.
#content_without_title_and_version ⇒ Object

Content with the title and version removed. The first line should normally be the attribution line. Used to DRY up `content_normalized`, but we need the case-sensitive content with attribution first to detect attribution in LicenseFile.
#hash ⇒ Object

SHA1 of the normalized content.
#length ⇒ Object

Number of characters in the normalized content.
#length_delta(other) ⇒ Object

Given another license or project file, calculates the difference in length.
#max_delta ⇒ Object

Number of characters that could be added/removed to still be considered a potential match.
#similarity(other) ⇒ Object

Given another license or project file, calculates the similarity as a percentage of words in common.
#wordset ⇒ Object

A set of each word in the license, without duplicates.

Instance Method Details

#content_normalized ⇒ `Object`

Content without title, version, copyright, whitespace, or instructions

# File 'lib/licensee/content_helper.rb', line 64

# Lower-cased content with the title, version, copyright notices, horizontal
# rules, and everything from the "end of terms and conditions" line onward
# removed, and whitespace stripped. The result is memoized.
#
# Returns nil when there is no content.
def content_normalized
  return unless content
  @content_normalized ||= begin
    text = content_without_title_and_version.downcase
    # Keep stripping until no copyright notice is left.
    text = strip_copyright(text) while text =~ Matchers::Copyright::REGEX
    # Only the part before the end-of-terms marker is kept.
    terms = strip_hrs(text).partition(END_OF_TERMS_REGEX).first
    strip_whitespace(terms)
  end
end

#content_without_title_and_version ⇒ `Object`

Content with the title and version removed. The first line should normally be the attribution line. Used to DRY up `content_normalized`, but we need the case-sensitive content with attribution first to detect attribution in LicenseFile.

# File 'lib/licensee/content_helper.rb', line 55

# Case-preserving content with any leading title(s) and the version removed.
# The result is memoized.
def content_without_title_and_version
  @content_without_title_and_version ||= begin
    text = content.strip
    # Titles are stripped repeatedly until none matches any more.
    while text =~ title_regex
      text = strip_title(text)
    end
    without_version = strip_version(text)
    without_version.strip
  end
end

#hash ⇒ `Object`

SHA1 of the normalized content



47
48
49

# File 'lib/licensee/content_helper.rb', line 47

# Hex digest (via DIGEST) of the normalized content, computed once and cached.
def hash
  return @hash if @hash
  @hash = DIGEST.hexdigest(content_normalized)
end

#length ⇒ `Object`

Number of characters in the normalized content

# File 'lib/licensee/content_helper.rb', line 22

# Number of characters in the normalized content; 0 when there is none.
def length
  normalized = content_normalized
  normalized ? normalized.length : 0
end

#length_delta(other) ⇒ `Object`

Given another license or project file, calculates the difference in length



34
35
36

# File 'lib/licensee/content_helper.rb', line 34

# Absolute difference between this object's length and the length of another
# license or project file.
def length_delta(other)
  difference = length - other.length
  difference.abs
end

#max_delta ⇒ `Object`

Number of characters that could be added/removed to still be considered a potential match



29
30
31

# File 'lib/licensee/content_helper.rb', line 29

# Number of characters that could be added or removed while still being
# considered a potential match: the length scaled by
# Licensee.inverse_confidence_threshold, truncated to an Integer.
def max_delta
  allowance = length * Licensee.inverse_confidence_threshold
  allowance.to_i
end

#similarity(other) ⇒ `Object`

Given another license or project file, calculates the similarity as a percentage of words in common

# File 'lib/licensee/content_helper.rb', line 40

# Given another license or project file, calculates the similarity as a
# percentage of words in common: twice the number of shared words divided by
# the combined size of both word sets.
#
# other - an object responding to #wordset.
#
# Returns a Float from 0.0 to 100.0. Returns 0.0 when either side has no
# wordset, or when both word sets are empty, rather than raising or
# producing NaN.
def similarity(other)
  own_words = wordset
  other_words = other.wordset
  # A missing wordset has nothing in common with anything.
  return 0.0 unless own_words && other_words

  total = own_words.size + other_words.size
  # Two empty word sets would otherwise compute 0.0 / 0, which is NaN.
  return 0.0 if total.zero?

  overlap = (own_words & other_words).size
  100.0 * (overlap * 2.0 / total)
end

#wordset ⇒ `Object`

A set of each word in the license, without duplicates

# File 'lib/licensee/content_helper.rb', line 15

# Set of the distinct words (runs of word characters and apostrophes) found
# in the normalized content, memoized. nil when there is no normalized content.
def wordset
  @wordset ||= begin
    normalized = content_normalized
    normalized.scan(/[\w']+/).to_set if normalized
  end
end

Module: Licensee::ContentHelper

Constant Summary collapse

Instance Method Summary collapse

Instance Method Details

#content_normalized ⇒ Object

#content_without_title_and_version ⇒ Object

#hash ⇒ Object

#length ⇒ Object

#length_delta(other) ⇒ Object

#max_delta ⇒ Object

#similarity(other) ⇒ Object

#wordset ⇒ Object