Module: Licensee::ContentHelper
- Included in:
- License, ProjectFiles::LicenseFile
- Defined in:
- lib/licensee/content_helper.rb
Constant Summary collapse
- DIGEST =
Digest::SHA1
- END_OF_TERMS_REGEX =
/^[\s#*_]*end of terms and conditions\s*$/i
- HR_REGEX =
/[=\-\*][=\-\*\s]{3,}/
- ALT_TITLE_REGEX =
License::ALT_TITLE_REGEX
- ALL_RIGHTS_RESERVED_REGEX =
/\Aall rights reserved\.?$/i
- WHITESPACE_REGEX =
/\s+/
- MARKDOWN_HEADING_REGEX =
/\A\s*#+/
- VERSION_REGEX =
/\Aversion.*$/i
- MARKUP_REGEX =
/[#_*=~\[\]()`|>]+/
- DEVELOPED_BY_REGEX =
/\Adeveloped by:.*?\n\n/im
- QUOTE_BEGIN_REGEX =
/[`'"‘“]/
- QUOTE_END_REGEX =
/['"’”]/
Class Method Summary collapse
- .format_percent(float) ⇒ Object
- .title_regex ⇒ Object
-
.wrap(text, line_width = 80) ⇒ Object
Wrap text to the given line length.
Instance Method Summary collapse
-
#content_hash ⇒ Object
SHA1 of the normalized content.
-
#content_normalized(wrap: nil) ⇒ Object
Content without title, version, copyright, whitespace, or instructions.
-
#content_without_title_and_version ⇒ Object
Content with the title and version removed. The first line should normally be the attribution line. Used to dry up `content_normalized`, but we need the case-sensitive content with attribution first to detect attribution in LicenseFile.
-
#length ⇒ Object
Number of characters in the normalized content.
-
#length_delta(other) ⇒ Object
Given another license or project file, calculates the difference in length.
-
#max_delta ⇒ Object
Number of characters that could be added/removed to still be considered a potential match.
-
#similarity(other) ⇒ Object
Given another license or project file, calculates the similarity as a percentage of words in common.
-
#wordset ⇒ Object
A set of each word in the license, without duplicates.
Class Method Details
.format_percent(float) ⇒ Object
116 117 118 |
# File 'lib/licensee/content_helper.rb', line 116
# Formats a number as a percentage string with two decimal places.
#
# float - The numeric value to render (e.g. 98.5)
#
# Returns a String such as "98.50%"
def self.format_percent(float)
  number = format('%.2f', float)
  "#{number}%"
end
.title_regex ⇒ Object
120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
# File 'lib/licensee/content_helper.rb', line 120
# Builds a case-insensitive regex that matches a license title at the very
# start of a string: optional whitespace, an optional "(", an optional
# "the ", one of the known titles, then the rest of that line.
#
# The regex is assembled from Licensee::License.all on every call.
# NOTE(review): `psuedo:` looks like a misspelling of `pseudo`, but it is the
# keyword this call passes - confirm against License.all before renaming.
#
# Returns a Regexp
def self.title_regex
  known = Licensee::License.all(hidden: true, psuedo: false)
  patterns = known.map(&:title_regex)

  # Title regex must include the version to support matching within
  # families, but for sake of normalization, we can be less strict
  versioned = known.reject do |license|
    license.title == license.name_without_version
  end
  loose_patterns = versioned.map do |license|
    Regexp.new Regexp.escape(license.name_without_version), 'i'
  end

  patterns.concat(loose_patterns)
  /\A\s*\(?(the )?#{Regexp.union patterns}.*$/i
end
.wrap(text, line_width = 80) ⇒ Object
Wrap text to the given line length
100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
# File 'lib/licensee/content_helper.rb', line 100
# Wrap text to the given line length
#
# Single newlines between two non-newline characters are replaced by a space
# (re-flowing each paragraph); runs of two or more newlines are kept. Any
# resulting line longer than line_width is then re-broken at whitespace.
#
# text       - The String to wrap; may be nil or frozen
# line_width - Maximum line length (default: 80)
#
# Returns the wrapped, stripped String, or nil when text is nil
def self.wrap(text, line_width = 80)
  return if text.nil?

  # Lookarounds instead of captures: a capturing pattern consumes the
  # character after the newline, so the newline following a one-character
  # line could never match ("a\nb\nc" became "a b\nc").
  # Non-mutating gsub instead of clone + gsub!: clone keeps the frozen flag,
  # so frozen input raised FrozenError.
  unwrapped = text.gsub(/(?<=[^\n])\n(?=[^\n])/, ' ')

  wrapped = unwrapped.split("\n").map do |line|
    next line unless line.length > line_width

    line.gsub(/(.{1,#{line_width}})(\s+|$)/, "\\1\n").strip
  end

  wrapped.join("\n").strip
end
Instance Method Details
#content_hash ⇒ Object
SHA1 of the normalized content
52 53 54 |
# File 'lib/licensee/content_helper.rb', line 52
# Hex digest (DIGEST, i.e. SHA1) of the normalized content
#
# The digest is computed on first use and cached in @content_hash.
#
# Returns a String
def content_hash
  return @content_hash if @content_hash

  @content_hash = DIGEST.hexdigest(content_normalized)
end
#content_normalized(wrap: nil) ⇒ Object
Content without title, version, copyright, whitespace, or instructions
wrap - Optional width to wrap the content
Returns a string
75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 |
# File 'lib/licensee/content_helper.rb', line 75
# Content without title, version, copyright, whitespace, or instructions
#
# The unwrapped result is computed once and cached in @content_normalized;
# wrapping is applied to the cached value on each call.
#
# wrap - Optional width to wrap the content
#
# Returns a String, or nil when there is no content
def content_normalized(wrap: nil)
  return unless content

  @content_normalized ||= begin
    text = content_without_title_and_version.downcase

    # Copyright lines may be stacked; strip until none match
    text = strip_copyright(text) while text =~ Matchers::Copyright::REGEX
    text = strip_developed_by(strip_all_rights_reserved(text))

    # Keep only what precedes the end-of-terms marker
    text = text.partition(END_OF_TERMS_REGEX).first

    text = normalize_quotes(normalize_lists(text))
    text = strip_markup(normalize_https(text))
    strip_whitespace(text)
  end

  return @content_normalized if wrap.nil?

  Licensee::ContentHelper.wrap(@content_normalized, wrap)
end
#content_without_title_and_version ⇒ Object
Content with the title and version removed. The first line should normally be the attribution line. Used to dry up `content_normalized`, but we need the case-sensitive content with attribution first to detect attribution in LicenseFile
60 61 62 63 64 65 66 67 68 |
# File 'lib/licensee/content_helper.rb', line 60
# Content with markdown headings, horizontal rules, leading title(s) and the
# version stripped, with original letter case preserved.
#
# The result is computed once and cached in
# @content_without_title_and_version.
#
# Returns a String
def content_without_title_and_version
  @content_without_title_and_version ||= begin
    string = content.strip
    string = strip_markdown_headings(string)
    string = strip_hrs(string)

    # ContentHelper.title_regex assembles a fresh union regex on every call,
    # so build it once instead of once per loop iteration.
    title_regex = ContentHelper.title_regex
    string = strip_title(string) while string =~ title_regex

    strip_version(string).strip
  end
end
#length ⇒ Object
Number of characters in the normalized content
27 28 29 30 |
# File 'lib/licensee/content_helper.rb', line 27
# Number of characters in the normalized content
#
# Returns an Integer; 0 when there is no normalized content
def length
  content_normalized ? content_normalized.length : 0
end
#length_delta(other) ⇒ Object
Given another license or project file, calculates the difference in length
39 40 41 |
# File 'lib/licensee/content_helper.rb', line 39
# Given another license or project file, calculates the difference in length
#
# other - Any object responding to #length
#
# Returns the absolute difference between the two lengths
def length_delta(other)
  difference = length - other.length
  difference.abs
end
#max_delta ⇒ Object
Number of characters that could be added/removed to still be considered a potential match
34 35 36 |
# File 'lib/licensee/content_helper.rb', line 34
# Number of characters that could be added/removed to still be considered a
# potential match
#
# Computed as length scaled by Licensee.inverse_confidence_threshold,
# truncated to an integer, and cached in @max_delta.
#
# Returns an Integer
def max_delta
  return @max_delta if @max_delta

  allowance = length * Licensee.inverse_confidence_threshold
  @max_delta = allowance.to_i
end
#similarity(other) ⇒ Object
Given another license or project file, calculates the similarity as a percentage of words in common
45 46 47 48 49 |
# File 'lib/licensee/content_helper.rb', line 45
# Given another license or project file, calculates the similarity as a
# percentage of words in common
#
# The score is 100 * (2 * shared words) / (own word count + other word count).
#
# other - Any object responding to #wordset
#
# Returns a Float
def similarity(other)
  mine = wordset
  theirs = other.wordset

  shared = (mine & theirs).size
  combined = mine.size + theirs.size

  100.0 * (shared * 2.0 / combined)
end
#wordset ⇒ Object
A set of each word in the license, without duplicates
20 21 22 23 24 |
# File 'lib/licensee/content_helper.rb', line 20
# A set of each word in the license, without duplicates
#
# Words are runs of word characters, optionally carrying a possessive
# suffix ('s, or a bare ' directly after an s). A non-nil result is cached in
# @wordset.
#
# Returns a Set of Strings, or nil when there is no normalized content
def wordset
  @wordset ||= (Set.new(content_normalized.scan(/(?:\w(?:'s|(?<=s)')?)+/)) if content_normalized)
end