Module: Licensee::ContentHelper

Included in: License, ProjectFiles::LicenseFile

Defined in: lib/licensee/content_helper.rb

Constant Summary collapse

# Digest class whose `hexdigest` is called by #content_hash.
DIGEST =

Digest::SHA1

# Matches the very start of the string plus any leading whitespace. It is
# interpolated as a prefix into several REGEXES entries and into the regex
# built by .title_regex.
START_REGEX =

/\A\s*/.freeze

# Matches a line reading "end of terms and conditions" (case-insensitive),
# optionally preceded by whitespace, `#`, `*` or `_` characters and followed
# only by whitespace.
END_OF_TERMS_REGEX =

/^[\s#*_]*end of terms and conditions\s*$/i.freeze

# Named patterns looked up by key: .wrap reads :bullet and :hrs,
# NORMALIZATIONS[:quotes] is built from :quote_begin / :quote_end, and
# .const_missing resolves legacy `<KEY>_REGEX` constants from this table.
# Entries prefixed with START_REGEX only match at the beginning of the
# content (after optional leading whitespace).
REGEXES =

{
  # A line made up solely of three or more `=`, `-` or `*` characters
  # (surrounding whitespace allowed).
  hrs:                 /^\s*[=\-\*]{3,}\s*$/,
  all_rights_reserved: /#{START_REGEX}all rights reserved\.?$/i,
  whitespace:          /\s+/,
  # Leading run of `#` characters at the start of the content.
  markdown_headings:   /#{START_REGEX}#+/,
  version:             /#{START_REGEX}version.*$/i,
  # Text wrapped in runs of `_`, `*` or `~`; the inner text is captured.
  span_markup:         /[_*~]+(.*?)[_*~]+/,
  # Markdown-style `[text](target)`; only the link text is captured.
  link_markup:         /\[(.+?)\]\(.+?\)/,
  block_markup:        /^\s*>/,
  border_markup:       /^[\*-](.*?)[\*-]$/,
  # One or two `/` or `*` characters at the start of a line.
  comment_markup:      %r{^\s*?[/\*]{1,2}},
  url:                 %r{#{START_REGEX}https?://[^ ]+\n},
  # A blank line followed by a list marker: `*`, `-`, or a one/two character
  # alphanumeric label ending in `)` or `.` (optionally opened with `(`).
  bullet:              /\n\n\s*(?:[*-]|\(?[\da-z]{1,2}[)\.])\s+/i,
  developed_by:        /#{START_REGEX}developed by:.*?\n\n/im,
  # Opening and closing quote characters (straight and curly variants).
  quote_begin:         /[`'"‘“]/,
  quote_end:           /[`'"’”]/,
  cc_legal_code:       /^\s*Creative Commons Legal Code\s*$/i,
  cc0_info:            /For more information, please see\s*\S+zero\S+/im,
  cc0_disclaimer:      /CREATIVE COMMONS CORPORATION.*?\n\n/im,
  unlicense_info:      /For more information, please.*\S+unlicense\S+/im,
  mit_optional:        /\(including the next paragraph\)/i
}.freeze

# Each entry pairs a `from` pattern with its `to` replacement.
# #content_normalized calls `normalize(op)` for every key listed here,
# followed by :spelling and :bullets.
NORMALIZATIONS =

{
  # A single-digit numbered marker (`1.`) or `*` at line start becomes `- `.
  lists:      { from: /^\s*(?:\d\.|\*)\s+([^\n])/, to: '- \1' },
  https:      { from: /http:/, to: 'https:' },
  ampersands: { from: '&', to: 'and' },
  # Runs of em dashes, en dashes or hyphens become a single `-`; the
  # lookarounds exclude runs at the start or end of a line.
  dashes:     { from: /(?<!^)([—–-]+)(?!$)/, to: '-' },
  # A quoted run of word characters, spaces and hyphens is re-quoted with
  # straight double quotes, whichever quote characters surrounded it.
  quotes:     {
    from: /#{REGEXES[:quote_begin]}+([\w -]*?\w)#{REGEXES[:quote_end]}+/,
    to:   '"\1"'
  }
}.freeze

VARIETAL_WORDS = Legally equivalent words that should be ignored for comparison. See spdx.org/spdx-license-list/matching-guidelines

# Pairs of spellings treated as equivalent: key => replacement.
# Presumably consumed by the :spelling step that #content_normalized runs
# through `normalize` — confirm against that helper.
# NOTE(review): 'wilfull' looks like a misspelling of 'willful'; check the
# SPDX equivalent-words list before changing it, since the value may be
# copied from there verbatim.
{
  'acknowledgment'  => 'acknowledgement',
  'analogue'        => 'analog',
  'analyse'         => 'analyze',
  'artefact'        => 'artifact',
  'authorisation'   => 'authorization',
  'authorised'      => 'authorized',
  'calibre'         => 'caliber',
  'cancelled'       => 'canceled',
  'capitalisations' => 'capitalizations',
  'catalogue'       => 'catalog',
  'categorise'      => 'categorize',
  'centre'          => 'center',
  'emphasised'      => 'emphasized',
  'favour'          => 'favor',
  'favourite'       => 'favorite',
  'fulfil'          => 'fulfill',
  'fulfilment'      => 'fulfillment',
  'initialise'      => 'initialize',
  'judgment'        => 'judgement',
  'labelling'       => 'labeling',
  'labour'          => 'labor',
  'licence'         => 'license',
  'maximise'        => 'maximize',
  'modelled'        => 'modeled',
  'modelling'       => 'modeling',
  'offence'         => 'offense',
  'optimise'        => 'optimize',
  'organisation'    => 'organization',
  'organise'        => 'organize',
  'practise'        => 'practice',
  'programme'       => 'program',
  'realise'         => 'realize',
  'recognise'       => 'recognize',
  'signalling'      => 'signaling',
  'sub-license'     => 'sublicense',
  'sub license'     => 'sublicense',
  'utilisation'     => 'utilization',
  'whilst'          => 'while',
  'wilful'          => 'wilfull',
  'non-commercial'  => 'noncommercial',
  'cent'            => 'percent',
  'owner'           => 'holder'
}.freeze

# Operations that #content_normalized passes to `strip`, one by one, in the
# order listed here (after the normalization steps have run).
# NOTE(review): `title` is listed twice — presumably so that a title left at
# the top after the copyright line is removed gets stripped as well; confirm
# before de-duplicating.
STRIP_METHODS =

%i[
  cc0_optional
  unlicense_optional
  hrs
  markdown_headings
  borders
  title
  version
  url
  copyright
  title
  block_markup
  span_markup
  link_markup
  all_rights_reserved
  developed_by
  end_of_terms
  whitespace
  mit_optional
].freeze

Class Method Summary collapse

.const_missing(const) ⇒ Object

Backwards-compatibility shim: resolves legacy `*_REGEX` constants from REGEXES to avoid a breaking change.
.format_percent(float) ⇒ Object
.title_regex ⇒ Object
.wrap(text, line_width = 80) ⇒ Object

Wrap text to the given line length.

Instance Method Summary collapse

#content_hash ⇒ Object

SHA1 of the normalized content.
#content_normalized(wrap: nil) ⇒ Object
#content_without_title_and_version ⇒ Object

Content with the title and version removed. The first line should normally be the attribution line. Used to DRY up `content_normalized`, but we need the case-sensitive content with attribution first to detect attribution in LicenseFile.
#length ⇒ Object

Number of characters in the normalized content.
#length_delta(other) ⇒ Object

Given another license or project file, calculates the difference in length.
#max_delta ⇒ Object

Number of characters that could be added/removed to still be considered a potential match.
#similarity(other) ⇒ Object

Given another license or project file, calculates the similarity as a percentage of words in common.
#wordset ⇒ Object

A set of each word in the license, without duplicates.

Class Method Details

.const_missing(const) ⇒ `Object`

Backwards-compatibility shim: resolves legacy `*_REGEX` constants from REGEXES to avoid a breaking change.

# File 'lib/licensee/content_helper.rb', line 180

# Resolves a legacy `<NAME>_REGEX` constant to the REGEXES entry stored under
# the lower-cased name with every "_regex" occurrence removed.
#
# const - name of the constant that could not be found.
#
# Returns the matching REGEXES value, or defers to `super` when REGEXES has
# no truthy value for the derived key.
def self.const_missing(const)
  legacy_name = const.to_s.downcase
  lookup_key = legacy_name.gsub('_regex', '').to_sym

  REGEXES[lookup_key] || super
end

.format_percent(float) ⇒ `Object`



206
207
208

# File 'lib/licensee/content_helper.rb', line 206

# Formats a number as a percentage with exactly two decimal places,
# e.g. 98.5 => "98.50%".
#
# float - the numeric value to format.
#
# Returns the formatted String.
def self.format_percent(float)
  # `%%` emits the literal percent sign directly from the format string.
  format('%<float>.2f%%', float: float)
end

.title_regex ⇒ `Object`

# File 'lib/licensee/content_helper.rb', line 210

# Builds (once) and memoizes a case-insensitive regex that matches a license
# title at the start of the content: optional "(", optional "the ", one of
# the known titles, then the remainder of that line.
#
# Returns the memoized Regexp.
def self.title_regex
  @title_regex ||= begin
    licenses = Licensee::License.all(hidden: true, psuedo: false)

    # Title regex must include the version to support matching within
    # families, but for sake of normalization, we can be less strict:
    # licenses whose title differs from their version-less name also
    # contribute a pattern for that version-less name.
    versioned = licenses.reject do |license|
      license.title == license.name_without_version
    end
    versionless_patterns = versioned.map do |license|
      Regexp.new Regexp.escape(license.name_without_version), 'i'
    end

    patterns = licenses.map(&:title_regex) + versionless_patterns

    /#{START_REGEX}\(?(?:the )?#{Regexp.union patterns}.*?$/i
  end
end

.wrap(text, line_width = 80) ⇒ `Object`

Wrap text to the given line length

# File 'lib/licensee/content_helper.rb', line 186

# Wrap text to the given line length.
#
# Hard-wrapped paragraphs are first joined back into single lines (blank
# lines are kept as paragraph breaks, and list markers matched by
# REGEXES[:bullet] are isolated on their own line), then every line longer
# than `line_width` is re-broken at whitespace. Lines matching REGEXES[:hrs]
# are never re-broken.
#
# text       - the String to wrap; may be frozen, and is never mutated.
# line_width - maximum line length (default: 80).
#
# Returns the wrapped String with surrounding whitespace stripped, or nil
# when text is nil.
def self.wrap(text, line_width = 80)
  return if text.nil?

  # Non-mutating gsub keeps frozen input usable (clone + gsub! raised
  # FrozenError). The lookarounds leave the neighbouring characters
  # unconsumed, so consecutive one-character lines are all joined
  # ("a\nb\nc" => "a b c") instead of every other newline being skipped.
  unwrapped = text.gsub(REGEXES[:bullet]) { |marker| "\n#{marker}\n" }
                  .gsub(/(?<=[^\n])\n(?=[^\n])/, ' ')

  wrapped = unwrapped.split("\n").map do |line|
    next line if line.length <= line_width || line =~ REGEXES[:hrs]

    # Break at the last whitespace that fits within line_width.
    line.gsub(/(.{1,#{line_width}})(\s+|$)/, "\\1\n").strip
  end

  wrapped.join("\n").strip
end

Instance Method Details

#content_hash ⇒ `Object`

SHA1 of the normalized content



145
146
147

# File 'lib/licensee/content_helper.rb', line 145

# Hex digest (computed with DIGEST) of the normalized content; memoized after
# the first call.
#
# Returns the digest String.
def content_hash
  @content_hash ||= begin
    normalized = content_normalized
    DIGEST.hexdigest(normalized)
  end
end

#content_normalized(wrap: nil) ⇒ `Object`

# File 'lib/licensee/content_helper.rb', line 162

# Normalized form of the content, memoized after the first call.
#
# Starting from the downcased result of #content_without_title_and_version,
# runs `normalize` for every NORMALIZATIONS key plus :spelling and :bullets,
# then `strip` for every STRIP_METHODS entry, and keeps what `_content`
# returns afterwards.
#
# wrap - optional line width; when given, the memoized text is passed
#        through Licensee::ContentHelper.wrap before being returned.
#
# Returns the normalized String (re-wrapped when `wrap` is given).
def content_normalized(wrap: nil)
  @content_normalized ||= begin
    @_content = content_without_title_and_version.downcase

    normalize_ops = NORMALIZATIONS.keys + %i[spelling bullets]
    normalize_ops.each { |op| normalize(op) }
    STRIP_METHODS.each { |op| strip(op) }

    _content
  end

  return @content_normalized if wrap.nil?

  Licensee::ContentHelper.wrap(@content_normalized, wrap)
end

#content_without_title_and_version ⇒ `Object`

Content with the title and version removed. The first line should normally be the attribution line. Used to DRY up `content_normalized`, but we need the case-sensitive content with attribution first to detect attribution in LicenseFile.

# File 'lib/licensee/content_helper.rb', line 153

# Content after the html, hrs, comments, markdown_headings, title and version
# strip passes, memoized after the first call. Case is left untouched here;
# #content_normalized downcases the result separately.
#
# Returns whatever `_content` yields once the strip passes have run.
def content_without_title_and_version
  @content_without_title_and_version ||= begin
    # Reset the working copy before the strip passes run.
    @_content = nil

    %i[html hrs comments markdown_headings title version].each do |op|
      strip(op)
    end

    _content
  end
end

#length ⇒ `Object`

Number of characters in the normalized content

# File 'lib/licensee/content_helper.rb', line 117

# Number of characters in the normalized content.
#
# Returns the Integer length, or 0 when there is no normalized content.
def length
  normalized = content_normalized
  normalized ? normalized.length : 0
end

#length_delta(other) ⇒ `Object`

Given another license or project file, calculates the difference in length



131
132
133

# File 'lib/licensee/content_helper.rb', line 131

# Absolute difference between this object's length and another's.
#
# other - any object responding to `length` (another license or project
#         file).
#
# Returns the non-negative difference.
def length_delta(other)
  difference = length - other.length
  difference.abs
end

#max_delta ⇒ `Object`

Number of characters that could be added/removed to still be considered a potential match

# File 'lib/licensee/content_helper.rb', line 125

# Number of characters that could be added or removed while still being
# considered a potential match; memoized after the first call.
#
# The allowance is 10 characters per entry in `fields_normalized`, plus the
# length scaled by Licensee.inverse_confidence_threshold (truncated to an
# Integer).
#
# Returns the Integer allowance.
def max_delta
  @max_delta ||= begin
    field_allowance = fields_normalized.size * 10
    length_allowance = (length * Licensee.inverse_confidence_threshold).to_i

    field_allowance + length_allowance
  end
end

#similarity(other) ⇒ `Object`

Given another license or project file, calculates the similarity as a percentage of words in common

# File 'lib/licensee/content_helper.rb', line 137

# Similarity to another license or project file, as a percentage of words in
# common.
#
# The score is twice the number of shared words divided by the combined
# word count, where the combined count is reduced by the size of
# `fields_normalized_set`.
#
# other - object responding to `wordset`.
#
# Returns the Float percentage.
def similarity(other)
  own_words = wordset_fieldless
  other_words = other.wordset

  shared = (own_words & other_words).size
  combined = own_words.size + other_words.size - fields_normalized_set.size

  100.0 * (shared * 2.0 / combined)
end

#wordset ⇒ `Object`

A set of each word in the license, without duplicates



112
113
114

# File 'lib/licensee/content_helper.rb', line 112

# A set of each word in the normalized content, without duplicates.
#
# A word is a run of word characters; a trailing `'s`, or an apostrophe
# directly after an `s`, stays attached to it.
#
# Returns a Set of Strings, or nil when there is no normalized content.
def wordset
  @wordset ||= begin
    normalized = content_normalized
    normalized.scan(/(?:\w(?:'s|(?<=s)')?)+/).to_set unless normalized.nil?
  end
end

Module: Licensee::ContentHelper

Constant Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.const_missing(const) ⇒ Object

.format_percent(float) ⇒ Object

.title_regex ⇒ Object

.wrap(text, line_width = 80) ⇒ Object

Instance Method Details

#content_hash ⇒ Object

#content_normalized(wrap: nil) ⇒ Object

#content_without_title_and_version ⇒ Object

#length ⇒ Object

#length_delta(other) ⇒ Object

#max_delta ⇒ Object

#similarity(other) ⇒ Object

#wordset ⇒ Object