Module: Licensee::ContentHelper

Included in:
License, ProjectFiles::LicenseFile
Defined in:
lib/licensee/content_helper.rb

Constant Summary collapse

# Digest class used by #content_hash to fingerprint the normalized content
DIGEST =
Digest::SHA1
# A line reading "end of terms and conditions" (any case), optionally preceded
# by whitespace or `#`, `*`, `_` characters
END_OF_TERMS_REGEX =
/^[\s#*_]*end of terms and conditions\s*$/i
# One of `=`, `-`, `*` followed by at least three more of those or whitespace
HR_REGEX =
/[=\-\*][=\-\*\s]{3,}/
# Re-exported from License
ALT_TITLE_REGEX =
License::ALT_TITLE_REGEX
# "all rights reserved" (any case, optional trailing period) at the very start
# of the string
ALL_RIGHTS_RESERVED_REGEX =
/\Aall rights reserved\.?$/i
# A run of one or more whitespace characters
WHITESPACE_REGEX =
/\s+/
# Optional whitespace followed by one or more `#` at the start of the string
MARKDOWN_HEADING_REGEX =
/\A\s*#+/
# A first line beginning with "version" (any case)
VERSION_REGEX =
/\Aversion.*$/i
# A run of one or more markup punctuation characters
MARKUP_REGEX =
/[#_*=~\[\]()`|>]+/
# A leading "developed by:" section (any case), up to and including the first
# blank line
DEVELOPED_BY_REGEX =
/\Adeveloped by:.*?\n\n/im
# Characters treated as an opening quote (straight, backtick, and curly)
QUOTE_BEGIN_REGEX =
/[`'"‘“]/
# Characters treated as a closing quote (straight and curly)
QUOTE_END_REGEX =
/['"’”]/

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.format_percent(float) ⇒ Object



116
117
118
# File 'lib/licensee/content_helper.rb', line 116

# Render a number as a percentage string with two decimal places.
#
# float - the numeric value to render
#
# Returns a String such as "98.50%"
def self.format_percent(float)
  format('%.2f', float) + '%'
end

.title_regex ⇒ Object



120
121
122
123
124
125
126
127
128
129
130
131
132
133
# File 'lib/licensee/content_helper.rb', line 120

# Build a regex that matches a license title at the start of a string.
#
# The alternatives are the title regex of every license returned by
# License.all(hidden: true, psuedo: false), plus a case-insensitive literal
# for each license name without its version.
#
# Returns a Regexp
def self.title_regex
  licenses = Licensee::License.all(hidden: true, psuedo: false)
  alternatives = licenses.map(&:title_regex)

  # Title regex must include the version to support matching within
  # families, but for sake of normalization, we can be less strict.
  # Licenses whose title already equals the versionless name add nothing new.
  versioned = licenses.reject do |license|
    license.title == license.name_without_version
  end
  versionless = versioned.map do |license|
    Regexp.new Regexp.escape(license.name_without_version), 'i'
  end

  alternatives += versionless
  /\A\s*\(?(the )?#{Regexp.union alternatives}.*$/i
end

.wrap(text, line_width = 80) ⇒ Object

Wrap text to the given line length



100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/licensee/content_helper.rb', line 100

# Wrap text to the given line length.
#
# Single newlines are first turned into spaces so that each paragraph becomes
# one long line; blank lines between paragraphs are preserved. Every line
# longer than line_width is then re-broken at whitespace.
#
# text       - the String to wrap (may be frozen; it is never mutated)
# line_width - maximum line length in characters (default: 80)
#
# Returns the wrapped String, or nil when text is nil
def self.wrap(text, line_width = 80)
  return if text.nil?

  # Lookarounds keep the neighbouring characters out of the match, so runs of
  # one-character lines ("a\nb\nc") are fully joined. The non-mutating gsub
  # also means frozen input is accepted.
  unwrapped = text.gsub(/(?<=[^\n])\n(?=[^\n])/, ' ')

  lines = unwrapped.split("\n").map do |line|
    if line.length > line_width
      line.gsub(/(.{1,#{line_width}})(\s+|$)/, "\\1\n").strip
    else
      line
    end
  end

  lines.join("\n").strip
end

Instance Method Details

#content_hash ⇒ Object

SHA1 of the normalized content



52
53
54
# File 'lib/licensee/content_helper.rb', line 52

# SHA1 of the normalized content
#
# Returns the hex digest String, or nil when there is no normalized content
# (mirroring how #length and #wordset tolerate missing content instead of
# raising from the digest call).
def content_hash
  return @content_hash if @content_hash

  normalized = content_normalized
  @content_hash = DIGEST.hexdigest(normalized) if normalized
end

#content_normalized(wrap: nil) ⇒ Object

Content without title, version, copyright, whitespace, or instructions

wrap - Optional width to wrap the content

Returns a string



75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# File 'lib/licensee/content_helper.rb', line 75

# Content without title, version, copyright, whitespace, or instructions
#
# wrap - Optional width to wrap the content
#
# Returns a string, or nil when there is no content
def content_normalized(wrap: nil)
  return unless content

  @content_normalized ||= begin
    text = content_without_title_and_version.downcase
    # Copyright notices may be stacked, so strip until none match
    text = strip_copyright(text) while text =~ Matchers::Copyright::REGEX
    text = strip_all_rights_reserved(text)
    text = strip_developed_by(text)
    # Keep only what precedes the end-of-terms marker
    text = text.partition(END_OF_TERMS_REGEX).first
    text = normalize_lists(text)
    text = normalize_quotes(text)
    text = normalize_https(text)
    text = strip_markup(text)
    strip_whitespace(text)
  end

  return @content_normalized if wrap.nil?

  Licensee::ContentHelper.wrap(@content_normalized, wrap)
end

#content_without_title_and_version ⇒ Object

Content with the title and version removed. The first line should normally be the attribution line. Used to dry up `content_normalized`, but we need the case-sensitive content with the attribution first to detect attribution in LicenseFile.



60
61
62
63
64
65
66
67
68
# File 'lib/licensee/content_helper.rb', line 60

# Content with the title and version removed.
#
# Markdown headings and horizontal rules are stripped first, then leading
# titles are removed repeatedly until none match, then the version line.
# Case is preserved. The result is memoized.
#
# Returns a String
def content_without_title_and_version
  @content_without_title_and_version ||= begin
    string = content.strip
    string = strip_markdown_headings(string)
    string = strip_hrs(string)
    # ContentHelper.title_regex assembles a fresh regex on every call, so
    # build it once here rather than once per loop iteration.
    title_regex = ContentHelper.title_regex
    string = strip_title(string) while string =~ title_regex
    strip_version(string).strip
  end
end

#length ⇒ Object

Number of characters in the normalized content



27
28
29
30
# File 'lib/licensee/content_helper.rb', line 27

# Number of characters in the normalized content
#
# Returns an Integer; 0 when there is no normalized content
def length
  normalized = content_normalized
  normalized ? normalized.length : 0
end

#length_delta(other) ⇒ Object

Given another license or project file, calculates the difference in length



39
40
41
# File 'lib/licensee/content_helper.rb', line 39

# Absolute difference between this object's length and another's.
#
# other - another license or project file (anything responding to #length)
#
# Returns a non-negative Integer
def length_delta(other)
  difference = length - other.length
  difference.abs
end

#max_delta ⇒ Object

Number of characters that could be added/removed to still be considered a potential match



34
35
36
# File 'lib/licensee/content_helper.rb', line 34

# Number of characters that could be added or removed while still being
# considered a potential match: the length scaled by
# Licensee.inverse_confidence_threshold, truncated to an integer. Memoized.
#
# Returns an Integer
def max_delta
  return @max_delta if @max_delta

  allowance = length * Licensee.inverse_confidence_threshold
  @max_delta = allowance.to_i
end

#similarity(other) ⇒ Object

Given another license or project file, calculates the similarity as a percentage of words in common



45
46
47
48
49
# File 'lib/licensee/content_helper.rb', line 45

# Similarity to another license or project file, as the percentage of words
# the two have in common (twice the overlap divided by the combined size of
# both word sets).
#
# other - an object responding to #wordset
#
# Returns a Float between 0.0 and 100.0. Returns 0.0 when either word set is
# missing or when both are empty, rather than raising or producing NaN.
def similarity(other)
  mine = wordset
  theirs = other.wordset
  return 0.0 if mine.nil? || theirs.nil?

  total = mine.size + theirs.size
  # Guard the division: 0.0 / 0 would yield NaN
  return 0.0 if total.zero?

  overlap = (mine & theirs).size
  100.0 * (overlap * 2.0 / total)
end

#wordset ⇒ Object

A set of each word in the license, without duplicates



20
21
22
23
24
# File 'lib/licensee/content_helper.rb', line 20

# A set of each word in the normalized content, without duplicates.
# A word is a run of word characters; a possessive "'s", or an apostrophe
# directly after an "s", stays attached to its word. Memoized.
#
# Returns a Set of Strings, or nil when there is no normalized content
def wordset
  return @wordset if @wordset

  normalized = content_normalized
  @wordset = normalized && normalized.scan(/(?:\w(?:'s|(?<=s)')?)+/).to_set
end