Module: Licensee::ContentHelper
- Included in:
- License, Project::LicenseFile
- Defined in:
- lib/licensee/content_helper.rb
Constant Summary collapse
- DIGEST =
Digest::SHA1
- END_OF_TERMS_REGEX =
/^\s*end of terms and conditions\s*$/i
Instance Method Summary collapse
-
#content_normalized ⇒ Object
Content with copyright header and linebreaks removed.
-
#hash ⇒ Object
SHA1 of the normalized content.
-
#length ⇒ Object
Number of characters in the normalized content.
-
#length_delta(other) ⇒ Object
Given another license or project file, calculates the difference in length.
-
#max_delta ⇒ Object
Number of characters that could be added/removed to still be considered a potential match.
-
#similarity(other) ⇒ Object
Given another license or project file, calculates the similarity as a percentage of words in common.
-
#wordset ⇒ Object
A set of each word in the license, without duplicates.
Instance Method Details
#content_normalized ⇒ Object
Content with copyright header and linebreaks removed
47 48 49 50 51 52 53 54 55 56 57 58 |
# File 'lib/licensee/content_helper.rb', line 47
# Content with copyright header and linebreaks removed.
#
# The raw content is downcased and stripped, then passed through the
# strip_* helpers in a fixed order. Anything from the first
# END_OF_TERMS_REGEX match onwards is discarded before the final
# whitespace pass. The result is memoized in @content_normalized.
#
# Returns the normalized content, or nil when there is no content.
def content_normalized
  return unless content

  @content_normalized ||= begin
    text = content.downcase.strip

    # A document may open with more than one title line; keep stripping
    # for as long as the title pattern still matches.
    while text =~ title_regex
      text = strip_title(text)
    end

    text = strip_hrs(strip_copyright(strip_version(text)))

    # Keep only what precedes the "end of terms and conditions" marker.
    terms = text.partition(END_OF_TERMS_REGEX).first
    strip_whitespace(terms)
  end
end
#hash ⇒ Object
SHA1 of the normalized content
42 43 44 |
# File 'lib/licensee/content_helper.rb', line 42
# SHA1 of the normalized content.
#
# Returns the hex digest of #content_normalized as computed by DIGEST;
# the value is computed once and cached in @hash.
def hash
  @hash ||= begin
    normalized = content_normalized
    DIGEST.hexdigest(normalized)
  end
end
#length ⇒ Object
Number of characters in the normalized content
17 18 19 20 |
# File 'lib/licensee/content_helper.rb', line 17
# Number of characters in the normalized content.
#
# Returns an Integer; 0 when there is no normalized content.
def length
  content_normalized ? content_normalized.length : 0
end
#length_delta(other) ⇒ Object
Given another license or project file, calculates the difference in length
29 30 31 |
# File 'lib/licensee/content_helper.rb', line 29
# Given another license or project file, calculates the difference in
# length.
#
# other - an object responding to #length.
#
# Returns the absolute difference between the two lengths.
def length_delta(other)
  difference = length - other.length
  difference.abs
end
#max_delta ⇒ Object
Number of characters that could be added/removed to still be considered a potential match
24 25 26 |
# File 'lib/licensee/content_helper.rb', line 24
# Number of characters that could be added/removed to still be considered
# a potential match.
#
# Returns #length scaled by Licensee.inverse_confidence_threshold and
# truncated to an Integer.
def max_delta
  allowance = length * Licensee.inverse_confidence_threshold
  allowance.to_i
end
#similarity(other) ⇒ Object
Given another license or project file, calculates the similarity as a percentage of words in common
35 36 37 38 39 |
# File 'lib/licensee/content_helper.rb', line 35
# Given another license or project file, calculates the similarity as a
# percentage of words in common: twice the number of shared words divided
# by the combined size of both word sets.
#
# other - an object responding to #wordset.
#
# Returns a Float from 0.0 to 100.0. Returns 0.0 when either side has no
# word set, or when both word sets are empty, instead of raising
# NoMethodError or yielding NaN.
def similarity(other)
  own_words = wordset
  other_words = other.wordset
  # #wordset is nil when there is no normalized content to scan.
  return 0.0 if own_words.nil? || other_words.nil?

  total = own_words.size + other_words.size
  # Two empty sets would otherwise evaluate 0.0 / 0, which is NaN.
  return 0.0 if total.zero?

  overlap = (own_words & other_words).size
  100.0 * (overlap * 2.0 / total)
end
#wordset ⇒ Object
A set of each word in the license, without duplicates
10 11 12 13 14 |
# File 'lib/licensee/content_helper.rb', line 10
# A set of each word in the license, without duplicates.
#
# A word is any run of word characters and apostrophes found in the
# normalized content. The set is cached in @wordset once built.
#
# Returns a Set of Strings, or nil when there is no normalized content.
def wordset
  @wordset ||= content_normalized ? content_normalized.scan(/[\w']+/).to_set : nil
end