Module: Licensee::ContentHelper

Included in:: License, ProjectFiles::LicenseFile

Defined in:: lib/licensee/content_helper.rb

Constant Summary collapse

# Digest implementation used by #content_hash to fingerprint normalized content.
DIGEST =

Digest::SHA1

# Matches a line reading "end of terms and conditions" (case-insensitive),
# optionally preceded by whitespace, `#`, `*` or `_`. #content_normalized keeps
# only the text that comes before this marker.
END_OF_TERMS_REGEX =

/^[\s#*_]*end of terms and conditions\s*$/i

# Matches one of `=`, `-` or `*` followed by at least three more of those
# characters or whitespace. Presumably used by strip_hrs (not shown here) to
# remove horizontal rules -- confirm against that helper.
HR_REGEX =

/[=\-\*][=\-\*\s]{3,}/

# Re-exported from License so it can be referenced through this module.
ALT_TITLE_REGEX =

License::ALT_TITLE_REGEX

# Matches "all rights reserved", with an optional trailing period, at the very
# start of the string (case-insensitive).
ALL_RIGHTS_RESERVED_REGEX =

/\Aall rights reserved\.?$/i

# Matches a run of one or more whitespace characters.
WHITESPACE_REGEX =

/\s+/

# Matches optional leading whitespace followed by one or more `#` characters at
# the start of the string.
MARKDOWN_HEADING_REGEX =

/\A\s*#+/

# Matches a first line that begins with "version" (case-insensitive).
VERSION_REGEX =

/\Aversion.*$/i

# Matches runs of punctuation commonly used as lightweight markup.
MARKUP_REGEX =

/[#_*=~\[\]()`|>]+/

# Matches a leading "developed by:" block up to and including the first blank
# line (case-insensitive; `.` also matches newlines).
DEVELOPED_BY_REGEX =

/\Adeveloped by:.*?\n\n/im

# Matches a single opening quote character (backtick, straight or curly).
QUOTE_BEGIN_REGEX =

/[`'"‘“]/

# Matches a single closing quote character (straight or curly).
QUOTE_END_REGEX =

/['"’”]/

Class Method Summary collapse

.format_percent(float) ⇒ Object
.title_regex ⇒ Object
.wrap(text, line_width = 80) ⇒ Object

Wrap text to the given line length.

Instance Method Summary collapse

#content_hash ⇒ Object

SHA1 of the normalized content.
#content_normalized(wrap: nil) ⇒ Object

Content without title, version, copyright, whitespace, or instructions.
#content_without_title_and_version ⇒ Object

Content with the title and version removed. The first line should normally be the attribution line. Used to DRY up `content_normalized`, but we need the case-sensitive content with attribution first to detect attribution in LicenseFile.
#length ⇒ Object

Number of characters in the normalized content.
#length_delta(other) ⇒ Object

Given another license or project file, calculates the difference in length.
#max_delta ⇒ Object

Number of characters that could be added/removed to still be considered a potential match.
#similarity(other) ⇒ Object

Given another license or project file, calculates the similarity as a percentage of words in common.
#wordset ⇒ Object

A set of each word in the license, without duplicates.

Class Method Details

.format_percent(float) ⇒ `Object`



116
117
118

# File 'lib/licensee/content_helper.rb', line 116

# Render a number as a percentage string with two decimal places.
#
# float - Numeric value already expressed in percent (it is not multiplied
#         by 100 here).
#
# Returns a String such as "98.50%".
def self.format_percent(float)
  # `%%` emits a literal percent sign after the rounded number.
  format('%.2f%%', float)
end

.title_regex ⇒ `Object`

# File 'lib/licensee/content_helper.rb', line 120

# Build a regex that matches a license title line at the very start of a
# string, optionally preceded by whitespace, an opening parenthesis and/or
# the word "the".
#
# Returns a case-insensitive Regexp.
def self.title_regex
  licenses = Licensee::License.all(hidden: true, psuedo: false)
  versioned = licenses.map(&:title_regex)

  # Title regex must include the version to support matching within
  # families, but for sake of normalization, we can be less strict:
  # also accept the bare name whenever it differs from the full title.
  versionless = licenses.reject do |license|
    license.title == license.name_without_version
  end
  versionless = versionless.map do |license|
    Regexp.new(Regexp.escape(license.name_without_version), 'i')
  end

  /\A\s*\(?(the )?#{Regexp.union(versioned + versionless)}.*$/i
end

.wrap(text, line_width = 80) ⇒ `Object`

Wrap text to the given line length

# File 'lib/licensee/content_helper.rb', line 100

# Wrap text to the given line length.
#
# Single newlines are treated as soft wraps and replaced by a space; blank
# lines (paragraph breaks) are preserved. Each resulting line longer than
# line_width is then re-broken at whitespace.
#
# text       - String to wrap. The argument is never mutated and may be frozen.
# line_width - Maximum line length in characters (default: 80).
#
# Returns the wrapped String, or nil when text is nil.
def self.wrap(text, line_width = 80)
  return if text.nil?

  # Non-mutating gsub so frozen input works. Lookarounds leave the characters
  # on either side of the newline unconsumed, so runs of single-character
  # lines ("a\nb\nc") are fully joined rather than every other one.
  unwrapped = text.gsub(/(?<=[^\n])\n(?=[^\n])/, ' ')

  wrapped = unwrapped.split("\n").map do |line|
    next line unless line.length > line_width

    # Greedily take up to line_width characters ending at whitespace (or end
    # of line) and terminate each chunk with a newline.
    line.gsub(/(.{1,#{line_width}})(\s+|$)/, "\\1\n").strip
  end

  wrapped.join("\n").strip
end

Instance Method Details

#content_hash ⇒ `Object`

SHA1 of the normalized content



52
53
54

# File 'lib/licensee/content_helper.rb', line 52

# Hex digest (DIGEST, i.e. SHA1) of the normalized content.
# The value is computed once and cached on the instance.
#
# Returns a String.
def content_hash
  return @content_hash if @content_hash

  @content_hash = DIGEST.hexdigest(content_normalized)
end

#content_normalized(wrap: nil) ⇒ `Object`

Content without title, version, copyright, whitespace, or instructions

wrap - Optional width to wrap the content

Returns a string

# File 'lib/licensee/content_helper.rb', line 75

# Content without title, version, copyright, whitespace, or instructions.
# The normalized form is computed once and cached; wrapping is applied to the
# cached value on each call.
#
# wrap - Optional width to wrap the content
#
# Returns a String, or nil when there is no content.
def content_normalized(wrap: nil)
  return unless content

  @content_normalized ||= begin
    text = content_without_title_and_version.downcase
    # There may be several leading copyright notices; peel them off one by one.
    text = strip_copyright(text) while text =~ Matchers::Copyright::REGEX
    text = strip_developed_by(strip_all_rights_reserved(text))
    # Keep only what precedes the "end of terms and conditions" marker.
    text = text.partition(END_OF_TERMS_REGEX).first
    text = normalize_https(normalize_quotes(normalize_lists(text)))
    strip_whitespace(strip_markup(text))
  end

  return @content_normalized if wrap.nil?

  Licensee::ContentHelper.wrap(@content_normalized, wrap)
end

#content_without_title_and_version ⇒ `Object`

Content with the title and version removed. The first line should normally be the attribution line. Used to DRY up `content_normalized`, but we need the case-sensitive content with attribution first to detect attribution in LicenseFile.

# File 'lib/licensee/content_helper.rb', line 60

# Content with the title and version removed.
# The first line of the result should normally be the attribution line.
# Used to DRY up `content_normalized`, but the case-sensitive content with
# the attribution first is also needed to detect attribution in LicenseFile.
#
# The result is computed once and cached on the instance.
#
# Returns a String.
def content_without_title_and_version
  @content_without_title_and_version ||= begin
    string = content.strip
    string = strip_markdown_headings(string)
    string = strip_hrs(string)
    # ContentHelper.title_regex is rebuilt from every known license on each
    # call, so fetch it once rather than on every test of the loop condition.
    title_regex = ContentHelper.title_regex
    # A file may carry more than one title line; strip until none is left.
    string = strip_title(string) while string =~ title_regex
    strip_version(string).strip
  end
end

#length ⇒ `Object`

Number of characters in the normalized content

# File 'lib/licensee/content_helper.rb', line 27

# Number of characters in the normalized content.
#
# Returns an Integer; 0 when there is no normalized content.
def length
  normalized = content_normalized
  normalized ? normalized.length : 0
end

#length_delta(other) ⇒ `Object`

Given another license or project file, calculates the difference in length



39
40
41

# File 'lib/licensee/content_helper.rb', line 39

# Given another license or project file, calculates the absolute difference
# between this object's length and the other's.
#
# other - Any object responding to #length.
#
# Returns a non-negative number.
def length_delta(other)
  difference = length - other.length
  difference.abs
end

#max_delta ⇒ `Object`

Number of characters that could be added/removed to still be considered a potential match



34
35
36

# File 'lib/licensee/content_helper.rb', line 34

# Number of characters that could be added/removed to still be considered a
# potential match: the length scaled by Licensee.inverse_confidence_threshold,
# truncated to an integer. Cached after the first call.
#
# Returns an Integer.
def max_delta
  return @max_delta if @max_delta

  @max_delta = (length * Licensee.inverse_confidence_threshold).to_i
end

#similarity(other) ⇒ `Object`

Given another license or project file, calculates the similarity as a percentage of words in common

# File 'lib/licensee/content_helper.rb', line 45

# Given another license or project file, calculates the similarity as a
# percentage of words in common: twice the number of shared words divided by
# the combined size of both word sets, times 100.
#
# other - Any object responding to #wordset.
#
# Returns a Float.
def similarity(other)
  mine = wordset
  theirs = other.wordset

  shared = (mine & theirs).size
  combined = mine.size + theirs.size
  100.0 * (shared * 2.0 / combined)
end

#wordset ⇒ `Object`

A set of each word in the license, without duplicates

# File 'lib/licensee/content_helper.rb', line 20

# A set of each word in the normalized content, without duplicates.
# A word is a run of word characters that may also contain a possessive
# "'s", or an apostrophe directly after an "s". Cached once computed.
#
# Returns a Set of Strings, or nil when there is no normalized content.
def wordset
  return @wordset if @wordset

  normalized = content_normalized
  @wordset = normalized && normalized.scan(/(?:\w(?:'s|(?<=s)')?)+/).to_set
end

Module: Licensee::ContentHelper

Constant Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.format_percent(float) ⇒ Object

.title_regex ⇒ Object

.wrap(text, line_width = 80) ⇒ Object

Instance Method Details

#content_hash ⇒ Object

#content_normalized(wrap: nil) ⇒ Object

#content_without_title_and_version ⇒ Object

#length ⇒ Object

#length_delta(other) ⇒ Object

#max_delta ⇒ Object

#similarity(other) ⇒ Object

#wordset ⇒ Object