Class: License_recognition

Inherits:
Object
  • Object
show all
Defined in:
lib/license_auto/license/similarity.rb

Overview

TODO: move to units.rb

Instance Method Summary collapse

Constructor Details

#initialize(path = '') ⇒ License_recognition

Returns a new instance of License_recognition.



3
4
5
6
7
8
9
10
11
12
13
14
15
# File 'lib/license_auto/license/similarity.rb', line 3

# Sets up the recognizer's state and tuning parameters.
#
# @param path [String] folder holding the local license texts (defaults to '')
def initialize(path = '')
  @local_license_path = path
  @license_text       = ''
  @local_license_list = []
  @similar_list       = []

  # TODO: First step, choose the 3 most likely license texts by keywords;
  #       second step, loop over all texts and match them.
  # TODO: Find keywords of license text
  # Names of frequently used licenses.
  @sorted_frequency = %w[MIT MIT2.0 Apache2.0 RubyClause-6 BSD GPL2.0 GPL3.0 LGPL2.1 LGPL3.0]

  # Extension of the local license files.
  @license_extension = '.txt'
  # Size limit: a text that is too long is not identified.
  @overload = 20_000
  # Similarity value used as the match threshold.
  @condition = 0.85
end

Instance Method Details

#edit_distance(a, b) ⇒ Object

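description: edit distance between a and b (the number of single-element insertions, deletions and substitutions needed to turn one into the other)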



34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# File 'lib/license_auto/license/similarity.rb', line 34

# Computes the edit (Levenshtein) distance between two sequences: the minimum
# number of single-element insertions, deletions and substitutions needed to
# turn one into the other.
#
# Only two rows of the dynamic-programming table are kept at any time, so the
# extra memory is proportional to a.size.
#
# @param a [Array, String] first sequence (anything supporting #size and #[])
# @param b [Array, String] second sequence
# @return [Integer] the edit distance; when one sequence is empty this is the
#   size of the other (0 when both are empty)
def edit_distance(a, b)
  # Row 0: turning an empty prefix of b into a[0, j] costs j insertions.
  previous = (0..a.size).to_a

  (1..b.size).each do |i|
    # Column 0: turning b[0, i] into an empty prefix of a costs i deletions.
    current = [i]
    (1..a.size).each do |j|
      current[j] =
        if b[i - 1] == a[j - 1]
          previous[j - 1]
        else
          [current[j - 1], previous[j - 1], previous[j]].min + 1
        end
    end
    previous = current
  end

  # After the loop `previous` is the last computed row (or row 0 if b is empty).
  previous[a.size]
end

#extract_license_text_from_readme(readme) ⇒ Object

def similarity



177
178
179
180
181
182
183
184
185
186
187
188
# File 'lib/license_auto/license/similarity.rb', line 177

# Picks the heading patterns that delimit a license section in a README,
# based on the extension of readme['name'].
#
# NOTE(review): the method only selects the patterns; no text is extracted
# here and the start pattern is not used afterwards — looks unfinished.
#
# @param readme [Hash] must respond to ['name'] with the README file name
# @return [Regexp, nil] the section-end pattern for '.rdoc' and '.md' files,
#   nil for any other extension
def extract_license_text_from_readme(readme)
  case File.extname(readme['name'])
  when '.rdoc'
    regular_start = /^==[ *](copying|copy|license){1}:*/i
    regular_end   = /^== /
  when '.md'
    regular_start = /^##[ *](copying|copy|license){1}:*/i
    regular_end   = /^## /
  else
    nil
  end
end

#get_local_license(path = @local_license_path) ⇒ Object

description: Get the paths of all local license files. path: the local license folder.



97
98
99
100
101
102
103
104
105
106
107
108
109
110
# File 'lib/license_auto/license/similarity.rb', line 97

# Collects the license files found directly inside +path+ into
# @local_license_list as [absolute_path, license_name] pairs, where
# license_name is the file name without @license_extension, then calls
# #sequence to reorder the list.
#
# Entries are visited in sorted order so the result does not depend on the
# order in which the filesystem happens to list them, and a pair that is
# already in the list is not added again, so calling this method repeatedly
# does not produce duplicates.
#
# @param path [String] local license folder
# @return [Array<Array(String, String)>] the accumulated @local_license_list
# @raise [RuntimeError] if +path+ is not a directory
def get_local_license(path = @local_license_path)
  raise("path: #{path} not found!") unless File.directory?(path)

  Dir.entries(path).sort.each do |file|
    full_path = File.join(path, file)
    # Check the entry relative to +path+, not the current working directory;
    # this also skips "." and "..".
    next if File.directory?(full_path)
    next unless File.extname(file) == @license_extension

    entry = [File.expand_path(full_path), File.basename(file, @license_extension)]
    @local_license_list << entry unless @local_license_list.include?(entry)
  end

  sequence
  @local_license_list
end

#longest_common_substring(a, b) ⇒ Object

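description: length of the longest common subsequence of a and b (despite the method name, the matched elements do not have to be contiguous)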



56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# File 'lib/license_auto/license/similarity.rb', line 56

# Computes the length of the longest common *subsequence* of two sequences.
#
# Despite the method name, matching elements do not have to be contiguous:
# on a mismatch the best value from the neighbouring cells is carried over
# rather than reset to zero.
#
# Only two rows of the dynamic-programming table are kept at any time, so the
# extra memory is proportional to a.size.
#
# @param a [Array, String] first sequence (anything supporting #size and #[])
# @param b [Array, String] second sequence
# @return [Integer] length of the longest common subsequence; 0 when either
#   sequence is empty
def longest_common_substring(a, b)
  # Row 0: an empty prefix of b has nothing in common with any prefix of a.
  previous = Array.new(a.size + 1, 0)

  (1..b.size).each do |i|
    current = [0]
    (1..a.size).each do |j|
      current[j] =
        if b[i - 1] == a[j - 1]
          previous[j - 1] + 1
        else
          [current[j - 1], previous[j - 1], previous[j]].max
        end
    end
    previous = current
  end

  # After the loop `previous` is the last computed row (or row 0 if b is empty).
  previous[a.size]
end

#max(a, b, c) ⇒ Object

description : Find the largest



26
27
28
29
30
31
# File 'lib/license_auto/license/similarity.rb', line 26

# Returns the largest of three values, compared with `<`.
#
# @return [Object] a, b or c, whichever is largest (the earliest one on ties)
def max(a, b, c)
  largest = a < b ? b : a
  largest < c ? c : largest
end

#min(a, b, c) ⇒ Object

description : Find the smallest



18
19
20
21
22
23
# File 'lib/license_auto/license/similarity.rb', line 18

# Returns the smallest of three values, compared with `>`.
#
# @return [Object] a, b or c, whichever is smallest (the earliest one on ties)
def min(a, b, c)
  smallest = a > b ? b : a
  smallest > c ? c : smallest
end

#sequence(constant = @sorted_frequency, change = @local_license_list) ⇒ Object

description: Reorders the license list so that commonly used licenses come first. constant: list of frequently used license names. change: the license list to be reordered.



79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# File 'lib/license_auto/license/similarity.rb', line 79

# Reorders +change+ in place so that the entries whose name (element [1])
# appears in +constant+ are moved to the front, following the order of
# +constant+. Each matching entry is swapped with the entry currently sitting
# at the next free front position.
#
# @param constant [Array<String>] license names in priority order
# @param change [Array<Array>] entries whose element [1] is a license name
def sequence(constant = @sorted_frequency, change = @local_license_list)
  front = 0
  (0...constant.size).each do |rank|
    # Only look at the part of the list that has not been placed yet.
    hit = (front...change.size).find { |pos| constant[rank] == change[pos][1] }
    next if hit.nil?

    change[front], change[hit] = change[hit], change[front]
    front += 1
  end
end

#similarity(packge_license, path) ⇒ Object

description: Similarity between a license text and the local license texts; 0% means not the same, 100% means the same. packge_license: the unrecognized text. path: folder containing the local license texts.



136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# File 'lib/license_auto/license/similarity.rb', line 136

# Identifies which local license text +packge_license+ most resembles.
#
# Both texts are split into words (/\w+/). For every local license the score
# lcs / (ed + lcs) is computed, where ed is the result of #edit_distance and
# lcs the result of #longest_common_substring on the two word lists. Every
# score is recorded in @similar_list via #sort_insert.
#
# @param packge_license [String] the unrecognized license text
# @param path [String] folder containing the local license texts
# @return [String, nil] the name of the first license whose score exceeds
#   @condition; otherwise the name of the best-scoring license if its score
#   is above 0.45; nil when nothing scores high enough, when there are no
#   local licenses, or when the text has more than @overload words
def similarity(packge_license, path)
  # Start from an empty ranking so that scores recorded by an earlier call
  # cannot be returned as the best match for this text.
  @similar_list = []

  get_local_license(path)

  package_words = packge_license.scan(/\w+/)
  # Text is too long, unable to identify.
  return nil if package_words.size > @overload

  @local_license_list.each do |license|
    license_path = license[0]
    license_name = license[1]
    # \w+ never spans a line break, so scanning the whole file yields the
    # same words as scanning it line by line.
    local_words = File.read(license_path).scan(/\w+/)

    ed    = edit_distance(package_words, local_words)
    lcs   = longest_common_substring(package_words, local_words)
    total = ed + lcs
    # Both word lists empty gives 0/0; score it as 0.0 rather than NaN.
    similar = total.zero? ? 0.0 : lcs.to_f / total

    sort_insert([similar, license_name, "ed[#{ed}]", "lcs[#{lcs}]",
                 "web[#{package_words.size}]", "local[#{local_words.size}]"])

    return license_name if similar > @condition
  end

  # No license cleared @condition: fall back to the best one if it is at
  # least a plausible match.
  best = @similar_list.first
  return nil if best.nil?

  best[0] > 0.45 ? best[1] : nil
end

#sort_insert(data) ⇒ Object



113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# File 'lib/license_auto/license/similarity.rb', line 113

# Inserts +data+ into @similar_list, keeping the list ordered by element [0]
# from highest to lowest. An entry whose score is not greater than any
# existing one is appended at the end, so equal scores keep insertion order.
#
# @param data [Array] entry whose element [0] is the score to order by
# @return [Array, nil] @similar_list when the entry was appended, nil when it
#   was inserted before an existing entry
def sort_insert(data)
  slot = @similar_list.index { |entry| data[0] > entry[0] }
  return @similar_list << data if slot.nil?

  @similar_list.insert(slot, data)
  nil
end