Module: BBLib

Defined in:
lib/string/matching.rb,
lib/bblib.rb,
lib/file/bbfile.rb,
lib/math/bbmath.rb,
lib/time/bbtime.rb,
lib/string/roman.rb,
lib/bblib/version.rb,
lib/string/bbstring.rb,
lib/string/fuzzy_matcher.rb

Overview

String Comparison Algorithms

Defined Under Namespace

Classes: FuzzyMatcher

Constant Summary collapse

# Path string 'config/'; the code that reads it is not part of this listing.
CONFIGS_PATH = 'config/'
# File-size units keyed by name. :mult is the number of bytes in one unit
# (successive powers of 1024) and :exp lists the spellings accepted for it.
FILE_SIZES = {
  byte:      { mult: 1024**0, exp: %w[b byt byte] },
  kilobyte:  { mult: 1024**1, exp: %w[kb kilo k kbyte kilobyte] },
  megabyte:  { mult: 1024**2, exp: %w[mb mega m mib mbyte megabyte] },
  gigabyte:  { mult: 1024**3, exp: %w[gb giga g gbyte gigabyte] },
  terabyte:  { mult: 1024**4, exp: %w[tb tera t tbyte terabyte] },
  petabyte:  { mult: 1024**5, exp: %w[pb peta p pbyte petabyte] },
  exabyte:   { mult: 1024**6, exp: %w[eb exa e ebyte exabyte] },
  zettabyte: { mult: 1024**7, exp: %w[zb zetta z zbyte zettabyte] },
  yottabyte: { mult: 1024**8, exp: %w[yb yotta y ybyte yottabyte] }
}
# Time units keyed by name, ordered from smallest to largest.
#   :mult   - length of one unit in seconds (month = 30 days, year = 365 days)
#   :styles - suffix used when rendering the unit in :full, :medium or :short style
#   :exp    - spellings accepted for the unit
TIME_EXPS = {
  milli: {
    mult: 0.001,
    styles: { full: ' millisecond', medium: ' milli', short: 'ms' },
    exp: ['ms', 'mil', 'mils', 'milli', 'millis', 'millisecond', 'milliseconds', 'milsec', 'milsecs', 'msec', 'msecs', 'msecond', 'mseconds']
  },
  sec: {
    mult: 1,
    styles: { full: ' second', medium: ' sec', short: 's' },
    exp: ['s', 'sec', 'secs', 'second', 'seconds']
  },
  min: {
    mult: 60,
    styles: { full: ' minute', medium: ' min', short: 'm' },
    exp: ['m', 'mn', 'mns', 'min', 'mins', 'minute', 'minutes']
  },
  hour: {
    mult: 3600,
    styles: { full: ' hour', medium: ' hr', short: 'h' },
    exp: ['h', 'hr', 'hrs', 'hour', 'hours']
  },
  day: {
    mult: 86400,
    styles: { full: ' day', medium: ' day', short: 'd' },
    # 'day' and 'days' must be separate entries; without the comma Ruby joins
    # the adjacent literals into the single string 'daydays'.
    exp: ['d', 'day', 'days']
  },
  week: {
    mult: 604800,
    styles: { full: ' week', medium: ' wk', short: 'w' },
    exp: ['w', 'wk', 'wks', 'week', 'weeks']
  },
  month: {
    mult: 2592000,
    styles: { full: ' month', medium: ' mo', short: 'mo' },
    exp: ['mo', 'mon', 'mons', 'month', 'months', 'mnth', 'mnths', 'mth', 'mths']
  },
  year: {
    mult: 31536000,
    styles: { full: ' year', medium: ' yr', short: 'y' },
    exp: ['y', 'yr', 'yrs', 'year', 'years']
  }
}
# Version string of the library.
VERSION = "0.1.1"

Class Method Summary collapse

Class Method Details

.composition_similarity(a, b) ⇒ Object

Calculates a percentage based match of two strings based on their character composition.



27
28
29
30
31
32
33
34
35
36
37
# File 'lib/string/matching.rb', line 27

# Calculates a percentage match of two strings based on their character
# composition: each character of one string is paired with at most one equal
# character of the other, regardless of position.
#
# Neither argument is modified, so frozen strings are accepted.
#
# @param a [String] first string
# @param b [String] second string
# @return [Float] matched characters divided by the longer length, times 100;
#   100.0 when both strings are empty
def self.composition_similarity(a, b)
  longest = [a.length, b.length].max
  # Two empty strings are identical; this also avoids 0 / 0.0 (NaN).
  return 100.0 if longest.zero?

  # Count the characters still available in b so each can be used only once.
  available = Hash.new(0)
  b.each_char { |char| available[char] += 1 }

  matches = 0
  a.each_char do |char|
    next unless available[char] > 0
    available[char] -= 1
    matches += 1
  end
  (matches / longest.to_f) * 100.0
end

.drop_symbols(str) ⇒ Object

Quickly remove any symbols from a string, leaving only alphanumeric characters and white space.



13
14
15
# File 'lib/string/bbstring.rb', line 13

# Returns a copy of +str+ with every character removed that is not a word
# character, digit or whitespace; underscores are removed as well.
def self.drop_symbols(str)
  unwanted = /[^\w\s\d]|_/
  str.gsub(unwanted, '')
end

.extract_floats(str, convert: true) ⇒ Object

Extracts all integers or decimals from a string into an array.



23
24
25
# File 'lib/string/bbstring.rb', line 23

# Collects every run of digits (optionally containing one decimal point)
# found in +str+.
#
# @param convert [Boolean] when truthy the matches are converted with #to_f,
#   otherwise the matched substrings are returned
# @return [Array<Float>, Array<String>]
def self.extract_floats(str, convert: true)
  tokens = str.scan(/\d+\.?\d+|\d+/)
  return tokens unless convert

  tokens.map(&:to_f)
end

.extract_integers(str, convert: true) ⇒ Object

Extract all integers from a string. Use extract_floats if numbers may contain decimal places.



18
19
20
# File 'lib/string/bbstring.rb', line 18

# Collects every run of digits found in +str+.
#
# @param convert [Boolean] when truthy the matches are converted with #to_i,
#   otherwise the matched substrings are returned
# @return [Array<Integer>, Array<String>]
def self.extract_integers(str, convert: true)
  digit_runs = str.scan(/\d+/)
  return digit_runs unless convert

  digit_runs.map(&:to_i)
end

.extract_numbers(str, convert: true) ⇒ Object

Alias for extract_floats



28
29
30
# File 'lib/string/bbstring.rb', line 28

# Alias for BBLib.extract_floats; both arguments are forwarded unchanged.
def self.extract_numbers(str, convert: true)
  BBLib.extract_floats(str, convert: convert)
end

.from_roman(str) ⇒ Object



32
33
34
35
36
37
38
39
40
41
42
43
44
45
# File 'lib/string/roman.rb', line 32

# Replaces whitespace-separated tokens that are Roman numerals (I to M,
# i.e. 1..1000, in any letter case) with their decimal value. Symbols attached
# to a token, such as trailing punctuation, are kept.
#
# Tokens are split on whitespace and re-joined with single spaces.
#
# @param str [String] text that may contain Roman numerals
# @return [String] the text with recognised numerals converted
def self.from_roman(str)
  # Numeral => value table. It starts at 1 because BBLib.to_roman(0) is an
  # empty string, which would match tokens made only of symbols.
  values = {}
  (1..1000).each { |n| values[BBLib.to_roman(n)] = n }

  converted = str.split(' ').map do |word|
    numeral = word.drop_symbols.upcase
    value = values[numeral]
    # Substitute case-insensitively so "xiv" is converted just like "XIV".
    value ? word.sub(/#{numeral}/i, value.to_s) : word
  end
  converted.join(' ')
end

.keep_between(num, min, max) ⇒ Object

Keeps a number within a set of bounds. Passing nil as min or max means there is no bound in that direction. Both min and max are inclusive.



5
6
7
8
9
10
# File 'lib/math/bbmath.rb', line 5

# Clamps +num+ to the inclusive range min..max. A nil bound is ignored.
#
# @param num [Numeric] value to clamp
# @param min [Numeric, nil] lower bound, or nil for none
# @param max [Numeric, nil] upper bound, or nil for none
# @return [Numeric] +num+, or the bound it violated
# @raise [RuntimeError] when +num+ is not a Numeric
def self.keep_between(num, min, max)
  raise "Argument must be numeric: #{num} (#{num.class})" unless num.is_a?(Numeric)

  bounded = num
  unless min.nil?
    bounded = min if bounded < min
  end
  unless max.nil?
    bounded = max if bounded > max
  end
  bounded
end

.levenshtein_distance(a, b) ⇒ Object

A simple rendition of the levenshtein distance algorithm



8
9
10
11
12
13
14
15
16
17
# File 'lib/string/matching.rb', line 8

# Computes the Levenshtein (edit) distance between +a+ and +b+ row by row,
# keeping only the previous and current row of the cost table.
# Only #length and #[] are called on the arguments.
#
# @return [Integer] number of single-element insertions, deletions or
#   substitutions needed to turn +a+ into +b+
def self.levenshtein_distance(a, b)
  width = b.length
  previous = (0..width).to_a
  1.upto(a.length) do |row|
    current = [row]
    1.upto(width) do |col|
      substitution = previous[col - 1] + (a[row - 1] == b[col - 1] ? 0 : 1)
      deletion = previous[col] + 1
      insertion = current[col - 1] + 1
      current << [deletion, insertion, substitution].min
    end
    previous = current
  end
  previous[width]
end

.levenshtein_similarity(a, b) ⇒ Object

Calculates a percentage based match using the levenshtein distance algorithm



20
21
22
23
24
# File 'lib/string/matching.rb', line 20

# Calculates a percentage match from the Levenshtein distance of two strings:
# (longer length - distance) / longer length * 100.
#
# @param a [String] first string
# @param b [String] second string
# @return [Float] percentage match; 100.0 when both strings are empty
def self.levenshtein_similarity(a, b)
  longest = [a.length, b.length].max.to_f
  # Two empty strings are identical; this also avoids 0.0 / 0.0 (NaN).
  return 100.0 if longest.zero?

  distance = BBLib.levenshtein_distance(a, b)
  ((longest - distance.to_f) / longest) * 100.0
end

.move_articles(str, position = :front, capitalize: true) ⇒ Object

Used to move the position of the articles ‘the’, ‘a’ and ‘an’ in strings for normalization.



33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/string/bbstring.rb', line 33

# Moves a leading or trailing article ("the", "a", "an") for normalization.
#
# The input string is never modified, so frozen strings are accepted.
#
# @param str [String] the text to normalize
# @param position [Symbol] :front moves a trailing article to the start,
#   :back moves a leading article to the end (after ", "), :none removes it;
#   any other value returns +str+ unchanged
# @param capitalize [Boolean] whether the moved article is capitalized
# @return [String] the rearranged text; trailing commas (and surrounding
#   whitespace) are removed when the result would end in a comma
def self.move_articles(str, position = :front, capitalize: true)
  return str unless [:front, :back, :none].include?(position)

  result = str
  %w[the a an].each do |article|
    lowered = result.downcase
    # Both flags describe the text before this article is processed.
    starts = lowered.start_with?("#{article} ")
    ends = lowered.end_with?(" #{article}")
    shown = capitalize ? article.capitalize : article

    if starts && position != :front
      result = result[(article.length + 1)..result.length]
      result += ", #{shown}" if position == :back && !ends
    end
    if ends && position != :back
      # Drop the article and the space in front of it.
      result = result[0..-(article.length + 2)]
      result = "#{shown} #{result}" if position == :front && !starts
    end
  end

  # "Beatles, The" leaves a dangling comma once the article is moved; strip it
  # without touching the caller's string.
  result = result.strip.chop while result.strip.end_with?(',')
  result
end

.numeric_similarity(a, b) ⇒ Object

Extracts all numbers from two strings and compares them and generates a percentage of match. Percentage calculations here need to be weighted better…TODO



54
55
56
57
58
59
60
61
62
# File 'lib/string/matching.rb', line 54

# Compares the numbers extracted from two strings position by position and
# returns the mean closeness as a percentage. Each pair scores
# 1 / (difference + 1); a number missing on one side is treated as 0.0.
#
# @return [Float] 100.0 when neither string contains a number
def self.numeric_similarity(a, b)
  left = a.extract_numbers
  right = b.extract_numbers
  return 100.0 if left.empty? && right.empty?

  pair_count = [left.size, right.size].max
  scores = Array.new(pair_count) do |idx|
    x = left[idx].to_f
    y = right[idx].to_f
    1.0 / ([x, y].max - [x, y].min + 1.0)
  end
  total = scores.inject { |sum, score| sum + score }
  (total / scores.size.to_f) * 100.0
end

.parse_duration(str, output: :sec) ⇒ Object

Parses known time based patterns out of a string to construct a numeric duration.



6
7
8
9
10
11
12
13
14
15
16
17
# File 'lib/time/bbtime.rb', line 6

# Parses known time expressions ("1.5 hours", "30m", "500 ms", ...) out of a
# string and sums them into one duration.
#
# @param str [String] text containing number + unit pairs; the unit spellings
#   come from TIME_EXPS
# @param output [Symbol] TIME_EXPS key of the unit to return the total in;
#   an unknown key returns seconds
# @return [Float] the summed duration expressed in +output+ units
def self.parse_duration(str, output: :sec)
  text = str.downcase
  secs = 0.0
  TIME_EXPS.each_value do |unit|
    unit[:exp].each do |e|
      # \d* lets multi-digit decimals such as "12.5" match in full. The
      # trailing lookahead stops "5 m" from matching inside "5 min".
      text.scan(/\d*\.?\d+[[:space:]]*#{e}(?=\W|\d|\z)/i).each do |match|
        # to_f keeps the fractional part; it stops reading at the unit text.
        secs += match.to_f * unit[:mult]
      end
    end
  end
  divisor = TIME_EXPS.fetch(output, {}).fetch(:mult, 1)
  secs / divisor.to_f
end

.parse_file_size(str, output: :byte) ⇒ Object

A file size parser for strings. Extracts any known patterns for file sizes.



34
35
36
37
38
39
40
41
42
43
44
# File 'lib/file/bbfile.rb', line 34

# Parses known file-size expressions ("12.5 MB", "512 bytes", ...) out of a
# string and sums them.
#
# @param str [String] text containing number + unit pairs; the unit spellings
#   come from FILE_SIZES, each optionally followed by "s"
# @param output [Symbol, String] FILE_SIZES key, or one of its spellings, of
#   the unit to return the total in; anything unrecognised means bytes
# @return [Float] the summed size expressed in +output+ units
def self.parse_file_size(str, output: :byte)
  spelling = output.to_s.downcase
  unit = FILE_SIZES.keys.find { |key| key == output || FILE_SIZES[key][:exp].include?(spelling) } || :byte

  bytes = 0.0
  FILE_SIZES.each_value do |size|
    size[:exp].each do |e|
      # \d* lets multi-digit decimals such as "12.5" match in full. The
      # trailing lookahead stops "5 k" from matching inside "5 kb".
      str.scan(/\d*\.?\d+[[:space:]]*#{e}s?(?=\W|\d|\z)/i).each do |match|
        bytes += match.to_f * size[:mult]
      end
    end
  end
  bytes / FILE_SIZES[unit][:mult]
end

.phrase_similarity(a, b) ⇒ Object

Calculates a percentage based match between two strings based on the similarity of word matches.



40
41
42
43
44
45
46
47
48
49
50
# File 'lib/string/matching.rb', line 40

# Calculates a percentage match between two strings from the words they have
# in common. Symbols are dropped before the strings are split on whitespace,
# and each word of +b+ can be matched only once.
#
# @param a [String] first phrase
# @param b [String] second phrase
# @return [Float] matched words divided by the larger word count, times 100;
#   100.0 when neither phrase contains a word
def self.phrase_similarity(a, b)
  words = a.drop_symbols.split(' ')
  unmatched = b.drop_symbols.split(' ')
  # Count words on the same symbol-free lists that are used for matching, so
  # a stand-alone symbol such as "&" is not counted as a word.
  longest = [words.size, unmatched.size].max
  # Avoids 0.0 / 0.0 (NaN) when there is nothing to compare.
  return 100.0 if longest.zero?

  matches = 0
  words.each do |word|
    idx = unmatched.index(word)
    next unless idx
    unmatched.delete_at(idx)
    matches += 1
  end
  (matches.to_f / longest) * 100.0
end

.qwerty_distance(a, b) ⇒ Object

A simple character distance calculator that uses qwerty key positions to determine how similar two strings are. May be useful for typo detection.



66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# File 'lib/string/matching.rb', line 66

# A simple character distance based on QWERTY key positions. Both strings are
# downcased and stripped, then compared character by character:
#
# * two mapped keys cost their row difference plus their column difference
#   within the rows listed below;
# * a character missing from the shorter string costs 10;
# * a character that is not on the mapped rows (space, punctuation, ...)
#   costs 0 when both characters are equal and 10 otherwise.
#
# @param a [String] first string
# @param b [String] second string
# @return [Integer] accumulated distance; 0 means identical after normalizing
def self.qwerty_distance(a, b)
  longer = a.downcase.strip
  shorter = b.downcase.strip
  longer, shorter = shorter, longer if longer.length < shorter.length

  # key => [row, column] for the digit row and the three letter rows
  positions = {}
  %w[1234567890 qwertyuiop asdfghjkl zxcvbnm].each_with_index do |row, row_idx|
    row.each_char.with_index { |key, col_idx| positions[key] = [row_idx, col_idx] }
  end

  other = shorter.chars
  offset = 0
  longer.each_char.with_index do |char, idx|
    if idx >= other.length
      offset += 10
      next
    end
    here = positions[char]
    there = positions[other[idx]]
    if here && there
      offset += (here[0] - there[0]).abs + (here[1] - there[1]).abs
    elsif char != other[idx]
      # At least one character is not on the mapped rows.
      offset += 10
    end
  end
  offset
end

.scan_dir(path = Dir.pwd, filter: nil, recursive: false) ⇒ Object

Scan for files and directories. Can be set to be recursive and can also have filters applied.



6
7
8
9
10
11
12
13
# File 'lib/file/bbfile.rb', line 6

# Lists files and directories under +path+ using Dir.glob.
#
# @param path [#to_s] directory to scan (defaults to the working directory)
# @param filter [nil, #to_s, Array] glob pattern(s) applied below +path+;
#   nil matches everything ("*")
# @param recursive [Boolean] when truthy, "**" is inserted so sub-directories
#   are searched as well
# @return [Array<String>] the matching paths
def self.scan_dir(path = Dir.pwd, filter: nil, recursive: false)
  prefix = path.to_s + (recursive ? '/**/' : '/')
  patterns =
    if filter.nil?
      prefix + '*'
    else
      [filter].flatten.map { |pattern| prefix + pattern.to_s }
    end
  Dir.glob(patterns)
end

.scan_dirs(path, filter: nil, recursive: false, mode: :path) ⇒ Object

Uses BBLib.scan_dir but returns only directories. Mode can be used to return strings (:path) or Dir objects (:dir)



21
22
23
# File 'lib/file/bbfile.rb', line 21

# Runs BBLib.scan_dir and keeps only the entries that are directories.
#
# @param mode [Symbol] :dir returns Dir objects; any other value returns the
#   path strings
# @return [Array<String>, Array<Dir>]
def self.scan_dirs(path, filter: nil, recursive: false, mode: :path)
  entries = BBLib.scan_dir(path, filter: filter, recursive: recursive)
  directories = entries.select { |entry| File.directory?(entry) }
  return directories unless mode == :dir

  directories.map { |directory| Dir.new(directory) }
end

.scan_files(path, filter: nil, recursive: false, mode: :path) ⇒ Object

Uses BBLib.scan_dir but returns only files. Mode can be used to return strings (:path) or File objects (:file)



16
17
18
# File 'lib/file/bbfile.rb', line 16

# Runs BBLib.scan_dir and keeps only the entries that are regular files.
#
# @param mode [Symbol] :file returns File objects opened with File.new (the
#   caller is responsible for closing them); any other value returns the
#   path strings
# @return [Array<String>, Array<File>]
def self.scan_files(path, filter: nil, recursive: false, mode: :path)
  entries = BBLib.scan_dir(path, filter: filter, recursive: recursive)
  regular = entries.select { |entry| File.file?(entry) }
  return regular unless mode == :file

  regular.map { |file| File.new(file) }
end

.string_to_file(path, str, mkpath = true) ⇒ Object

Shorthand method to write a string to disk. By default the path is created if it doesn’t exist.



26
27
28
29
30
31
# File 'lib/file/bbfile.rb', line 26

# Shorthand to write a string to disk.
#
# @param path [String] file to write
# @param str [#to_s] content; converted with #to_s before writing
# @param mkpath [Boolean] when truthy, the parent directory of +path+ is
#   created (including intermediate directories) if it does not exist
# @return [Integer] the number of bytes written, as returned by File.write
def self.string_to_file(path, str, mkpath = true)
  directory = File.dirname(path)
  # Check the parent directory rather than the file path itself.
  # Dir.exist? replaces Dir.exists?, which was removed in Ruby 3.2.
  FileUtils.mkpath(directory) if mkpath && !Dir.exist?(directory)
  File.write(path, str.to_s)
end

.string_to_roman(str) ⇒ Object



19
20
21
22
23
24
25
26
27
28
29
# File 'lib/string/roman.rb', line 19

# Replaces whitespace-separated tokens that are plain integers with Roman
# numerals produced by BBLib.to_roman. Symbols attached to the number, such as
# brackets or trailing punctuation, are kept.
#
# A token is converted only when its digits form one run without leading
# zeros. Decimals ("1.5"), grouped numbers ("1,000"), tokens containing
# letters ("12th") and zero are left unchanged.
#
# Tokens are split on whitespace and re-joined with single spaces.
#
# @param str [String] text that may contain integers
# @return [String] the text with eligible integers converted
def self.string_to_roman(str)
  converted = str.split(' ').map do |word|
    bare = word.drop_symbols
    digits = word[/\d+/]
    # The first digit run must be the whole symbol-free token.
    next word unless digits && digits == bare && bare.to_i.to_s == bare

    # Convert the digits that were found; word.to_i would be 0 for "(12)".
    value = digits.to_i
    # There is no numeral for zero; to_roman(0) would erase the number.
    value.zero? ? word : word.sub(digits, BBLib.to_roman(value))
  end
  converted.join(' ')
end

.to_duration(num, input: :sec, stop: :milli, style: :medium) ⇒ Object

Turns a numeric input into a time string.



20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/time/bbtime.rb', line 20

# Turns a numeric amount of time into a string such as "1 hr 30 mins".
#
# @param num [Numeric] the amount of time; must be positive
# @param input [Symbol, String] TIME_EXPS key of the unit +num+ is given in
# @param stop [Symbol] TIME_EXPS key of the smallest unit to include
# @param style [Symbol] :full, :medium or :short; anything else means :medium
# @return [String, nil] the units joined by spaces from largest to smallest,
#   or nil when +num+ is not a positive Numeric
def self.to_duration(num, input: :sec, stop: :milli, style: :medium)
  # Both conditions must hold; with || a non-numeric argument reached
  # `num > 0` and raised instead of returning nil.
  return nil unless Numeric === num && num > 0

  style = :medium unless [:full, :medium, :short].include?(style)
  remaining = num * TIME_EXPS[input.to_sym][:mult]
  expression = []
  # Hash has no #reverse; reverse_each walks the units from largest to smallest.
  TIME_EXPS.reverse_each do |unit, spec|
    count = (remaining / spec[:mult]).floor
    # >= 1 so that exactly one unit (e.g. 60 seconds) is reported as that unit.
    if count >= 1
      plural = count > 1 && style != :short ? 's' : ''
      expression << "#{count}#{spec[:styles][style]}#{plural}"
      remaining -= count * spec[:mult]
    end
    break if unit == stop
  end
  expression.join(' ')
end

.to_roman(num) ⇒ Object

Converts any integer up to 1000 to a Roman numeral string.



5
6
7
8
9
10
11
12
13
14
15
16
17
# File 'lib/string/roman.rb', line 5

# Converts a number up to 1000 into a Roman numeral string.
#
# @param num [Numeric] value to convert
# @return [String] the numeral; the decimal string of +num+ when it is above
#   1000, and an empty string when it is below 1
def self.to_roman(num)
  return num.to_s if num > 1000

  symbols = [
    [1000, 'M'], [900, 'CM'], [500, 'D'], [400, 'CD'], [100, 'C'], [90, 'XC'], [50, 'L'],
    [40, 'XL'], [10, 'X'], [9, 'IX'], [5, 'V'], [4, 'IV'], [3, 'III'], [2, 'II'], [1, 'I']
  ]
  remaining = num
  pieces = []
  # Largest value first: take each symbol for as long as it still fits.
  symbols.each do |value, glyph|
    while remaining >= value
      pieces << glyph
      remaining -= value
    end
  end
  pieces.join
end