Class: License_recognition
- Inherits:
-
Object
- Object
- License_recognition
- Defined in:
- lib/license_auto/license/similarity.rb
Overview
TODO: move to units.rb
Instance Method Summary collapse
-
#edit_distance(a, b) ⇒ Object
description : edit distance.
-
#extract_license_text_from_readme(readme) ⇒ Object
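Selects the start and end heading patterns for the license section of a README, based on its file extension (.rdoc or .md); returns nil for any other extension.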
-
#get_local_license(path = @local_license_path) ⇒ Object
Description: collects the paths of all local license files. path: the local license folder.
-
#initialize(path = '') ⇒ License_recognition
constructor
A new instance of License_recognition.
-
#longest_common_substring(a, b) ⇒ Object
description : longest common substring.
-
#max(a, b, c) ⇒ Object
description : Find the largest.
-
#min(a, b, c) ⇒ Object
description : Find the smallest.
-
#sequence(constant = @sorted_frequency, change = @local_license_list) ⇒ Object
Description: reorders the license list so that commonly used licenses come first. constant: list of frequently used license names. change: the license list to reorder.
-
#similarity(packge_license, path) ⇒ Object
Description: computes similarity, where 0% means not the same and 100% means the same. packge_license: the unrecognized license text. path: folder containing the local license texts.
- #sort_insert(data) ⇒ Object
Constructor Details
#initialize(path = '') ⇒ License_recognition
Returns a new instance of License_recognition.
3 4 5 6 7 8 9 10 11 12 13 14 15 |
# File 'lib/license_auto/license/similarity.rb', line 3
# Sets up the recognizer's state.
#
# @param path [String] folder holding the local license files; it is stored
#   in @local_license_path
def initialize(path = '')
  @local_license_path = path
  @license_text = ''
  @local_license_list = []
  @similar_list = []

  # TODO: First step, choose the 3 most likely license texts by keywords;
  #       second step, loop over and match all texts
  # TODO: Find keywords of license text

  # Often used license names
  @sorted_frequency = %w[MIT MIT2.0 Apache2.0 RubyClause-6 BSD GPL2.0 GPL3.0 LGPL2.1 LGPL3.0]
  @license_extension = '.txt' # Local license file extension
  @overload = 20_000          # Text is too long, unable to identify
  @condition = 0.85           # Similarity value
end
Instance Method Details
#edit_distance(a, b) ⇒ Object
description : edit distance
34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
# File 'lib/license_auto/license/similarity.rb', line 34
# Computes the edit (Levenshtein) distance between two sequences: the minimum
# number of single-element insertions, deletions and substitutions needed to
# turn one into the other.
#
# Only two rows of the dynamic-programming table are kept at a time.
#
# @param a [#size, #[]] first sequence (e.g. an Array of words or a String)
# @param b [#size, #[]] second sequence
# @return [Integer] the edit distance; equals the other sequence's size when
#   one of the sequences is empty
def edit_distance(a, b)
  width = a.size
  # Row 0: turning an empty prefix of b into a[0, j] costs j insertions.
  previous = (0..width).to_a
  current = Array.new(width + 1, 0)

  b.size.times do |row|
    current[0] = row + 1
    width.times do |col|
      current[col + 1] =
        if b[row] == a[col]
          previous[col]
        else
          [current[col], previous[col], previous[col + 1]].min + 1
        end
    end
    # The freshly filled row becomes the reference row for the next pass.
    previous, current = current, previous
  end

  # Read the result from the last completed row rather than from loop
  # variables, so empty inputs are handled correctly.
  previous[width]
end
#extract_license_text_from_readme(readme) ⇒ Object
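Selects the start and end heading patterns for the license section of a README, based on its file extension (.rdoc or .md); returns nil for any other extension.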
177 178 179 180 181 182 183 184 185 186 187 188 |
# File 'lib/license_auto/license/similarity.rb', line 177
# Picks the heading patterns that delimit a license section in a README,
# based on the extension of readme['name'].
#
# NOTE(review): the patterns are only selected here, never applied, so no
# text is actually extracted yet.
#
# @param readme [Hash] must provide the file name under the 'name' key
# @return [Regexp, nil] the section-end pattern for '.rdoc' and '.md' files,
#   nil for any other extension
def extract_license_text_from_readme(readme)
  case File.extname(readme['name'])
  when '.rdoc'
    regular_start = /^==[ *](copying|copy|license){1}:*/i
    regular_end = /^== /
  when '.md'
    regular_start = /^##[ *](copying|copy|license){1}:*/i
    regular_end = /^## /
  else
    nil
  end
end
#get_local_license(path = @local_license_path) ⇒ Object
Description: collects the paths of all local license files. path: the local license folder.
97 98 99 100 101 102 103 104 105 106 107 108 109 110 |
# File 'lib/license_auto/license/similarity.rb', line 97
# Collects the local license files found directly inside +path+ and appends
# them to @local_license_list as [absolute_file_path, license_name] pairs,
# then lets #sequence reorder the list.
#
# Only regular entries whose extension equals @license_extension are taken;
# the license name is the file name without that extension.
#
# @param path [String] local license folder
# @return [Array<Array(String, String)>] the accumulated @local_license_list
# @raise [RuntimeError] when +path+ is not a directory
def get_local_license(path = @local_license_path)
  raise("path: #{path} not found!") unless File.directory?(path)

  # Sorted so the result does not depend on the file system's listing order.
  Dir.entries(path).sort.each do |file|
    next if file == '.' || file == '..'

    full_path = File.join(path, file)
    # Test the entry inside +path+, not a same-named entry in the current
    # working directory.
    next if File.directory?(full_path)
    next unless File.extname(file) == @license_extension

    entry = [File.absolute_path(full_path), File.basename(file, @license_extension)]
    # Scanning the same folder again must not duplicate its entries.
    @local_license_list << entry unless @local_license_list.include?(entry)
  end

  sequence
  @local_license_list
end
#longest_common_substring(a, b) ⇒ Object
description : longest common substring
56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
# File 'lib/license_auto/license/similarity.rb', line 56
# Computes the length of the longest common subsequence of two sequences.
#
# Despite the method name, matching elements do not have to be contiguous:
# on a mismatch the best length seen so far is carried forward, which is the
# longest-common-subsequence recurrence.
#
# Only two rows of the dynamic-programming table are kept at a time.
#
# @param a [#size, #[]] first sequence (e.g. an Array of words or a String)
# @param b [#size, #[]] second sequence
# @return [Integer] length of the longest common subsequence; 0 when either
#   sequence is empty
def longest_common_substring(a, b)
  width = a.size
  # Column 0 of both rows stays 0: nothing is shared with an empty prefix.
  previous = Array.new(width + 1, 0)
  current = Array.new(width + 1, 0)

  b.size.times do |row|
    width.times do |col|
      current[col + 1] =
        if b[row] == a[col]
          previous[col] + 1
        else
          [current[col], previous[col], previous[col + 1]].max
        end
    end
    # The freshly filled row becomes the reference row for the next pass.
    previous, current = current, previous
  end

  # Read the result from the last completed row rather than from loop
  # variables, so empty inputs are handled correctly.
  previous[width]
end
#max(a, b, c) ⇒ Object
description : Find the largest
26 27 28 29 30 31 |
# File 'lib/license_auto/license/similarity.rb', line 26
# Finds the largest of three mutually comparable values.
#
# @param a [Comparable]
# @param b [Comparable]
# @param c [Comparable]
# @return [Comparable] the largest argument; the earliest one wins on ties
def max(a, b, c)
  [a, b, c].max
end
#min(a, b, c) ⇒ Object
description : Find the smallest
18 19 20 21 22 23 |
# File 'lib/license_auto/license/similarity.rb', line 18
# Finds the smallest of three mutually comparable values.
#
# @param a [Comparable]
# @param b [Comparable]
# @param c [Comparable]
# @return [Comparable] the smallest argument; the earliest one wins on ties
def min(a, b, c)
  [a, b, c].min
end
#sequence(constant = @sorted_frequency, change = @local_license_list) ⇒ Object
Description: reorders the license list so that commonly used licenses come first. constant: list of frequently used license names. change: the license list to reorder.
79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
# File 'lib/license_auto/license/similarity.rb', line 79
# Moves frequently used licenses to the front of the list, in place.
#
# Walks +constant+ in order; whenever one of its names is found among the
# not-yet-placed entries of +change+, that entry is swapped into the next
# front slot. Entries whose name is not in +constant+ stay behind the placed
# ones.
#
# @param constant [Array<String>] license names, most commonly used first
# @param change [Array<Array>] entries whose element at index 1 is the
#   license name; reordered in place
# @return [Range] the index range that was walked over +constant+
def sequence(constant = @sorted_frequency, change = @local_license_list)
  front = 0
  (0...constant.size).each do |rank|
    # Only look at entries that have not been placed at the front yet.
    hit = (front...change.size).find { |pos| constant[rank] == change[pos][1] }
    next if hit.nil?

    change[front], change[hit] = change[hit], change[front]
    front += 1
  end
end
#similarity(packge_license, path) ⇒ Object
Description: computes similarity, where 0% means not the same and 100% means the same. packge_license: the unrecognized license text. path: folder containing the local license texts.
136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 |
# File 'lib/license_auto/license/similarity.rb', line 136
# Identifies which local license text is most similar to the given text.
#
# Both texts are reduced to their word tokens (/\w+/). For every local
# license the score is lcs / (ed + lcs), where +ed+ comes from #edit_distance
# and +lcs+ from #longest_common_substring, so 1.0 means identical token
# sequences and 0.0 means nothing in common.
#
# @param packge_license [String] the unrecognized license text
# @param path [String] folder containing the local license texts
# @return [String, nil] name of the first license scoring above @condition,
#   otherwise the best-scoring license if its score exceeds 0.45; nil when
#   the text has no words, has more than @overload words, or nothing scores
#   high enough
def similarity(packge_license, path)
  get_local_license(path)
  package_words = packge_license.scan(/\w+/)

  # Nothing to compare against.
  return nil if package_words.empty?
  # Text is too long, unable to identify.
  return nil if package_words.size > @overload

  # Start from a clean ranking so candidates from an earlier call cannot
  # leak into this result.
  @similar_list = []

  @local_license_list.each do |license_path, license_name|
    local_words = File.read(license_path).scan(/\w+/)
    # An empty license file cannot match anything.
    next if local_words.empty?

    ed = edit_distance(package_words, local_words)
    lcs = longest_common_substring(package_words, local_words)
    similar = lcs.to_f / (ed + lcs)

    sort_insert([similar, license_name, "ed[#{ed}]", "lcs[#{lcs}]",
                 "web[#{package_words.size}]", "local[#{local_words.size}]"])
    # Confident match: stop scanning the remaining licenses.
    return license_name if similar > @condition
  end

  best = @similar_list.first
  return nil if best.nil?

  # No confident match; accept the best candidate only above the lower bound.
  best[0] > 0.45 ? best[1] : nil
end
#sort_insert(data) ⇒ Object
113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
# File 'lib/license_auto/license/similarity.rb', line 113
# Inserts +data+ into @similar_list, keeping the list ordered by descending
# score (the element at index 0 of each entry).
#
# The new entry goes in front of the first existing entry with a strictly
# lower score, so entries with equal scores keep their insertion order.
#
# @param data [Array] entry whose element at index 0 is its score
# @return [Array, nil] @similar_list when the entry was appended at the end,
#   nil when it was inserted before an existing entry
def sort_insert(data)
  slot = @similar_list.index { |entry| data[0] > entry[0] }
  return @similar_list << data if slot.nil?

  @similar_list.insert(slot, data)
  nil
end