Module: BBLib

Defined in:: lib/string/matching.rb,
lib/bblib.rb,
lib/file/bbfile.rb,
lib/math/bbmath.rb,
lib/time/bbtime.rb,
lib/string/roman.rb,
lib/bblib/version.rb,
lib/string/bbstring.rb,
lib/string/fuzzy_matcher.rb

Overview

String Comparison Algorithms

Defined Under Namespace

Constant Summary collapse

# Relative directory name for configuration files. None of the methods shown
# on this page reference it.
CONFIGS_PATH = 'config/'

# Lookup table used by parse_file_size. For each unit, :mult is the number of
# bytes in one unit (successive powers of 1024) and :exp lists the spellings
# recognised for that unit.
FILE_SIZES = {
  byte:      { mult: 1024**0, exp: %w[b byt byte] },
  kilobyte:  { mult: 1024**1, exp: %w[kb kilo k kbyte kilobyte] },
  megabyte:  { mult: 1024**2, exp: %w[mb mega m mib mbyte megabyte] },
  gigabyte:  { mult: 1024**3, exp: %w[gb giga g gbyte gigabyte] },
  terabyte:  { mult: 1024**4, exp: %w[tb tera t tbyte terabyte] },
  petabyte:  { mult: 1024**5, exp: %w[pb peta p pbyte petabyte] },
  exabyte:   { mult: 1024**6, exp: %w[eb exa e ebyte exabyte] },
  zettabyte: { mult: 1024**7, exp: %w[zb zetta z zbyte zettabyte] },
  yottabyte: { mult: 1024**8, exp: %w[yb yotta y ybyte yottabyte] }
}

# Lookup table used by parse_duration and to_duration. For each unit:
#   :mult   - length of one unit in seconds (a month is 30 days, a year 365 days)
#   :styles - suffix appended to a number by to_duration for each output style
#   :exp    - spellings recognised for the unit by parse_duration
# Units are listed from smallest to largest; to_duration walks them in reverse.
TIME_EXPS = {
  milli: {
    mult: 0.001,
    styles: {full: ' millisecond', medium: ' milli', short: 'ms'},
    exp: ['ms', 'mil', 'mils', 'milli', 'millis', 'millisecond', 'milliseconds', 'milsec', 'milsecs', 'msec', 'msecs', 'msecond', 'mseconds']},
  sec: {
    mult: 1,
    styles: {full: ' second', medium: ' sec', short: 's'},
    exp: ['s', 'sec', 'secs', 'second', 'seconds']},
  min: {
    mult: 60,
    styles: {full: ' minute', medium: ' min', short: 'm'},
    exp: ['m', 'mn', 'mns', 'min', 'mins', 'minute', 'minutes']},
  hour: {
    mult: 3600,
    styles: {full: ' hour', medium: ' hr', short: 'h'},
    exp: ['h', 'hr', 'hrs', 'hour', 'hours']},
  day: {
    mult: 86400,
    styles: {full: ' day', medium: ' day', short: 'd'},
    # The comma between 'day' and 'days' matters: adjacent string literals are
    # concatenated by Ruby, which would produce the single entry 'daydays'.
    exp: ['d', 'day', 'days']},
  week: {
    mult: 604800,
    styles: {full: ' week', medium: ' wk', short: 'w'},
    exp: ['w', 'wk', 'wks', 'week', 'weeks']},
  month: {
    mult: 2592000,
    styles: {full: ' month', medium: ' mo', short: 'mo'},
    exp: ['mo', 'mon', 'mons', 'month', 'months', 'mnth', 'mnths', 'mth', 'mths']},
  year: {
    mult: 31536000,
    styles: {full: ' year', medium: ' yr', short: 'y'},
    exp: ['y', 'yr', 'yrs', 'year', 'years']}
}

# Version string of the library.
VERSION = '0.1.1'

Class Method Summary collapse

.composition_similarity(a, b) ⇒ Object

Calculates a percentage based match of two strings based on their character composition.
.drop_symbols(str) ⇒ Object

Quickly remove any symbols from a string leaving only alpha-numeric characters and white space.
.extract_floats(str, convert: true) ⇒ Object

Extracts all integers or decimals from a string into an array.
.extract_integers(str, convert: true) ⇒ Object

Extract all integers from a string.
.extract_numbers(str, convert: true) ⇒ Object

Alias for extract_floats.
.from_roman(str) ⇒ Object
.keep_between(num, min, max) ⇒ Object

Used to keep any numeric number between a set of bounds.
.levenshtein_distance(a, b) ⇒ Object

A simple rendition of the levenshtein distance algorithm.
.levenshtein_similarity(a, b) ⇒ Object

Calculates a percentage based match using the levenshtein distance algorithm.
.move_articles(str, position = :front, capitalize: true) ⇒ Object

Used to move the position of the articles ‘the’, ‘a’ and ‘an’ in strings for normalization.
.numeric_similarity(a, b) ⇒ Object

Extracts all numbers from two strings and compares them and generates a percentage of match.
.parse_duration(str, output: :sec) ⇒ Object

Parses known time based patterns out of a string to construct a numeric duration.
.parse_file_size(str, output: :byte) ⇒ Object

A file size parser for strings.
.phrase_similarity(a, b) ⇒ Object

Calculates a percentage based match between two strings based on the similarity of word matches.
.qwerty_distance(a, b) ⇒ Object

A simple character distance calculator that uses qwerty key positions to determine how similar two strings are.
.scan_dir(path = Dir.pwd, filter: nil, recursive: false) ⇒ Object

Scan for files and directories.
.scan_dirs(path, filter: nil, recursive: false, mode: :path) ⇒ Object

Uses BBLib.scan_dir but returns only directories.
.scan_files(path, filter: nil, recursive: false, mode: :path) ⇒ Object

Uses BBLib.scan_dir but returns only files.
.string_to_file(path, str, mkpath = true) ⇒ Object

Shorthand method to write a string to disk.
.string_to_roman(str) ⇒ Object
.to_duration(num, input: :sec, stop: :milli, style: :medium) ⇒ Object

Turns a numeric input into a time string.
.to_roman(num) ⇒ Object

Converts any integer up to 1000 to a roman numeral string.

Class Method Details

.composition_similarity(a, b) ⇒ `Object`

Calculates a percentage based match of two strings based on their character composition.

# File 'lib/string/matching.rb', line 27

# Scores how similar two strings are by character composition, ignoring order.
#
# Each character of the longer string is matched against (and consumes) one
# equal character of the shorter string. The score is the number of matches
# divided by the length of the longer string, as a percentage.
#
# Neither argument is modified: matching works on a separate array of the
# shorter string's characters.
#
# @param a [String] first string
# @param b [String] second string
# @return [Float] percentage from 0.0 to 100.0 (NaN when both strings are empty)
def self.composition_similarity(a, b)
  # On equal lengths the second argument is treated as the "longer" one.
  longer, shorter = a.length > b.length ? [a, b] : [b, a]
  pool = shorter.chars
  matches = longer.each_char.count do |char|
    index = pool.index(char)
    # delete_at returns the removed character (truthy), so the match is counted.
    index ? pool.delete_at(index) : false
  end
  (matches / longer.length.to_f) * 100.0
end

.drop_symbols(str) ⇒ `Object`

Quickly remove any symbols from a string leaving only alpha-numeric characters and white space.



13
14
15

# File 'lib/string/bbstring.rb', line 13

# Returns a copy of +str+ with every character removed that is not a word
# character, digit or whitespace; underscores are removed as well.
#
# @param str [String] text to clean
# @return [String] a new string; +str+ itself is not modified
def self.drop_symbols(str)
  symbols = /[^\w\s\d]|_/
  str.gsub(symbols, '')
end

.extract_floats(str, convert: true) ⇒ `Object`

Extracts all integers or decimals from a string into an array.



23
24
25

# File 'lib/string/bbstring.rb', line 23

# Collects every run of digits in +str+, optionally containing one decimal
# point, in order of appearance.
#
# @param str [String] text to scan
# @param convert [Boolean] when truthy the matches are converted with to_f,
#   otherwise the matched substrings are returned unchanged
# @return [Array<Float>, Array<String>]
def self.extract_floats(str, convert: true)
  found = str.scan(/\d+\.?\d+|\d+/)
  return found unless convert
  found.map(&:to_f)
end

.extract_integers(str, convert: true) ⇒ `Object`

Extract all integers from a string. Use extract_floats if numbers may contain decimal places.



18
19
20

# File 'lib/string/bbstring.rb', line 18

# Collects every run of digits in +str+ in order of appearance. A decimal
# point splits a number into two separate matches.
#
# @param str [String] text to scan
# @param convert [Boolean] when truthy the matches are converted with to_i,
#   otherwise the matched substrings are returned unchanged
# @return [Array<Integer>, Array<String>]
def self.extract_integers(str, convert: true)
  digit_runs = str.scan(/\d+/)
  return digit_runs unless convert
  digit_runs.map(&:to_i)
end

.extract_numbers(str, convert: true) ⇒ `Object`

Alias for extract_floats



28
29
30

# File 'lib/string/bbstring.rb', line 28

# Alias for BBLib.extract_floats; both arguments are passed through as-is.
#
# @param str [String] text to scan
# @param convert [Boolean] forwarded to extract_floats
# @return [Array] whatever BBLib.extract_floats returns
def self.extract_numbers(str, convert: true)
  BBLib.extract_floats(str, convert: convert)
end

.from_roman(str) ⇒ `Object`

# File 'lib/string/roman.rb', line 32

# Replaces roman numerals (values 1 to 1000) in a string with decimal numbers.
#
# The string is split on whitespace. A word is converted when, with symbols
# removed and upcased, it equals a numeral produced by BBLib.to_roman. The
# substitution is case-sensitive, so only numerals written in upper case are
# replaced; surrounding punctuation is kept. Words are re-joined with a
# single space.
#
# Zero is excluded from the lookup on purpose: BBLib.to_roman(0) is an empty
# string, which would match every symbol-only word (e.g. "&") and prepend "0".
#
# @param str [String] text that may contain roman numerals
# @return [String] new string with the numerals replaced
def self.from_roman(str)
  numerals = (1..1000).each_with_object({}) do |value, table|
    table[BBLib.to_roman(value)] = value
  end
  converted = str.split(' ').map do |word|
    roman = BBLib.drop_symbols(word).upcase
    value = numerals[roman]
    value ? word.sub(roman, value.to_s) : word
  end
  converted.join(' ')
end

.keep_between(num, min, max) ⇒ `Object`

Used to keep any numeric number between a set of bounds. Passing nil as min or max represents no bounds in that direction. min and max are inclusive to the allowed bounds.

# File 'lib/math/bbmath.rb', line 5

# Clamps +num+ to the inclusive range min..max. A nil bound means "no limit"
# on that side. The lower bound is applied first, then the upper bound.
#
# @param num [Numeric] value to clamp
# @param min [Numeric, nil] lower bound, or nil for none
# @param max [Numeric, nil] upper bound, or nil for none
# @return [Numeric] +num+, or the bound it was clamped to
# @raise [RuntimeError] when +num+ is not a Numeric
def self.keep_between(num, min, max)
  unless num.is_a?(Numeric)
    raise "Argument must be numeric: #{num} (#{num.class})"
  end
  bounded = num
  bounded = min if !min.nil? && bounded < min
  bounded = max if !max.nil? && bounded > max
  bounded
end

.levenshtein_distance(a, b) ⇒ `Object`

A simple rendition of the levenshtein distance algorithm

# File 'lib/string/matching.rb', line 8

# Computes the Levenshtein (edit) distance between +a+ and +b+ using a single
# row of costs that is updated in place for each element of +a+.
#
# @param a [String] first sequence
# @param b [String] second sequence
# @return [Integer] number of insertions, deletions and substitutions
def self.levenshtein_distance(a, b)
  width = b.length
  row = (0..width).to_a
  1.upto(a.length) do |i|
    # diagonal holds the cost of the cell up-and-left of the one being filled.
    diagonal = i - 1
    row[0] = i
    1.upto(width) do |j|
      above = row[j]
      substitution = a[i - 1] == b[j - 1] ? diagonal : diagonal + 1
      row[j] = [above + 1, row[j - 1] + 1, substitution].min
      diagonal = above
    end
  end
  row[width]
end

.levenshtein_similarity(a, b) ⇒ `Object`

Calculates a percentage based match using the levenshtein distance algorithm

# File 'lib/string/matching.rb', line 20

# Expresses the Levenshtein distance between two strings as a similarity
# percentage relative to the length of the longer string.
#
# @param a [String] first string
# @param b [String] second string
# @return [Float] 100.0 for identical strings, lower the more edits are needed
#   (NaN when both strings are empty)
def self.levenshtein_similarity(a, b)
  edits = BBLib.levenshtein_distance(a, b).to_f
  longest = [a.length, b.length].max.to_f
  ((longest - edits) / longest) * 100.0
end

.move_articles(str, position = :front, capitalize: true) ⇒ `Object`

Used to move the position of the articles ‘the’, ‘a’ and ‘an’ in strings for normalization.

# File 'lib/string/bbstring.rb', line 33

# Moves a leading or trailing article ('the', 'a', 'an') to another position
# in the string, e.g. "Matrix, The" <-> "The Matrix".
#
# The caller's string is never modified; a new string is returned whenever a
# change is needed (the original code stripped trailing commas in place).
#
# @param str [String] text to normalise
# @param position [Symbol] :front puts a trailing article at the start,
#   :back puts a leading article at the end (after ", "), :none removes it.
#   Any other value returns +str+ unchanged.
# @param capitalize [Boolean] whether a moved article is capitalised
# @return [String] the normalised string
def self.move_articles(str, position = :front, capitalize: true)
  return str unless [:front, :back, :none].include?(position)
  %w[the a an].each do |article|
    lowered = str.downcase
    # Both flags describe the string as it was before this article is handled.
    starts = lowered.start_with?(article + ' ')
    ends = lowered.end_with?(' ' + article)
    shown = capitalize ? article.capitalize : article
    if starts && position != :front
      rest = str[(article.length + 1)..str.length]
      str = position == :back && !ends ? "#{rest}, #{shown}" : rest
    end
    if ends && position != :back
      rest = str[0..-(article.length + 2)]
      str = position == :front && !starts ? "#{shown} #{rest}" : rest
    end
  end
  # Moving a trailing ", The" to the front leaves a dangling comma; drop it
  # (and any surrounding whitespace) without touching the original object.
  str = str.strip.chop while str.strip.end_with?(',')
  str
end

.numeric_similarity(a, b) ⇒ `Object`

Extracts all numbers from two strings and compares them and generates a percentage of match. Percentage calculations here need to be weighted better…TODO

# File 'lib/string/matching.rb', line 54

# Compares the numbers found in two strings position by position.
#
# Each pair of numbers scores 1 / (difference + 1); a number missing on one
# side counts as 0. The result is the mean score as a percentage.
#
# @param a [String] first string (must respond to extract_numbers)
# @param b [String] second string (must respond to extract_numbers)
# @return [Float] 100.0 when neither string contains a number or all numbers
#   are equal, lower the further apart they are
def self.numeric_similarity(a, b)
  left, right = a.extract_numbers, b.extract_numbers
  return 100.0 if left.empty? && right.empty?
  slots = [left.size, right.size].max
  scores = Array.new(slots) do |idx|
    x, y = left[idx].to_f, right[idx].to_f
    1.0 / ([x, y].max - [x, y].min + 1.0)
  end
  (scores.inject { |sum, score| sum + score } / scores.size.to_f) * 100.0
end

.parse_duration(str, output: :sec) ⇒ `Object`

Parses known time based patterns out of a string to construct a numeric duration.

# File 'lib/time/bbtime.rb', line 6

# Parses time expressions such as "1 hr 30 min" or "2.5 days" out of a string
# and sums them into a single duration.
#
# Every spelling in TIME_EXPS is searched for, preceded by a number and
# optional whitespace and followed by a non-word character, a digit or the
# end of the string.
#
# Compared with the earlier version, the number pattern accepts any count of
# digits before a decimal point ("12.5 min" was read as "2.5 min") and the
# match is converted with to_f so that fractions are no longer truncated.
#
# @param str [String] text containing duration expressions
# @param output [Symbol] key of TIME_EXPS naming the unit of the result;
#   an unknown key leaves the result in seconds
# @return [Float] the total duration in the requested unit
def self.parse_duration(str, output: :sec)
  text = str.downcase
  secs = 0.0
  TIME_EXPS.each_value do |unit|
    unit[:exp].each do |e|
      text.scan(/\d*\.?\d+[[:space:]]*#{e}(?=\W|\d|\z)/i).each do |match|
        # to_f reads the leading number and ignores the unit suffix.
        secs += match.to_f * unit[:mult]
      end
    end
  end
  target = TIME_EXPS[output]
  secs / (target ? target[:mult].to_f : 1)
end

.parse_file_size(str, output: :byte) ⇒ `Object`

A file size parser for strings. Extracts any known patterns for file sizes.

# File 'lib/file/bbfile.rb', line 34

# Parses file-size expressions such as "1.5 GB" or "200kb" out of a string
# and sums them.
#
# Every spelling in FILE_SIZES (with an optional plural "s") is searched for,
# preceded by a number and optional whitespace and followed by a non-word
# character, a digit or the end of the string.
#
# The number pattern accepts any count of digits before a decimal point; the
# earlier pattern allowed only one, so "12.5 KB" was read as "2.5 KB".
#
# @param str [String] text containing file-size expressions
# @param output [Symbol, String] unit of the result, either a key of
#   FILE_SIZES or one of its spellings; unknown values fall back to :byte
# @return [Float] the total size in the requested unit
def self.parse_file_size(str, output: :byte)
  wanted = output.to_s.downcase
  output = FILE_SIZES.keys.find { |f| f == output || FILE_SIZES[f][:exp].include?(wanted) } || :byte
  bytes = 0.0
  FILE_SIZES.each_value do |unit|
    unit[:exp].each do |e|
      str.scan(/\d*\.?\d+[[:space:]]*#{e}s?(?=\W|\d|\z)/i).each do |match|
        bytes += match.to_f * unit[:mult]
      end
    end
  end
  bytes / FILE_SIZES[output][:mult]
end

.phrase_similarity(a, b) ⇒ `Object`

Calculates a percentage based match between two strings based on the similarity of word matches.

# File 'lib/string/matching.rb', line 40

# Scores how many words two strings share, ignoring symbols and word order.
#
# Every word of +a+ (symbols dropped) is matched against, and consumes, one
# equal word of +b+. The match count is divided by the larger word count of
# the two unmodified strings and returned as a percentage.
#
# @param a [String] first phrase (must respond to drop_symbols)
# @param b [String] second phrase (must respond to drop_symbols)
# @return [Float] percentage of matching words
def self.phrase_similarity(a, b)
  unmatched = b.drop_symbols.split(' ')
  hits = a.drop_symbols.split(' ').count do |word|
    position = unmatched.index(word)
    # delete_at returns the removed word (truthy), so the hit is counted.
    position && unmatched.delete_at(position)
  end
  most_words = [a.split(' ').size, b.split(' ').size].max
  (hits.to_f / most_words.to_f) * 100.0
end

.qwerty_distance(a, b) ⇒ `Object`

A simple character distance calculator that uses qwerty key positions to determine how similar two strings are. May be useful for typo detection.

# File 'lib/string/matching.rb', line 66

# Measures how far apart two strings are on a QWERTY keyboard.
#
# Both strings are downcased and stripped, then compared position by
# position. For each pair of characters the row difference and the column
# difference of their keys are added to the total. Every character of the
# longer string that has no counterpart adds 10.
#
# Characters that are not on the key map (spaces, punctuation, ...) used to
# raise NoMethodError. They now add 0 when both characters are identical and
# 10 (the same penalty as a missing character) otherwise.
#
# @param a [String] first string
# @param b [String] second string
# @return [Integer] accumulated distance; 0 means identical key positions
def self.qwerty_distance(a, b)
  a, b = a.downcase.strip, b.downcase.strip
  # Iterate over the longer string; on equal lengths the two are swapped.
  a, b = b, a if a.length <= b.length
  positions = {}
  %w[1234567890 qwertyuiop asdfghjkl zxcvbnm].each_with_index do |keys, row|
    keys.each_char.with_index { |key, column| positions[key] = [row, column] }
  end
  other = b.chars
  a.each_char.with_index.sum do |char, idx|
    counterpart = other[idx]
    next 10 if counterpart.nil?
    from, to = positions[char], positions[counterpart]
    if from && to
      (from[0] - to[0]).abs + (from[1] - to[1]).abs
    else
      char == counterpart ? 0 : 10
    end
  end
end

.scan_dir(path = Dir.pwd, filter: nil, recursive: false) ⇒ `Object`

Scan for files and directories. Can be set to be recursive and can also have filters applied.

# File 'lib/file/bbfile.rb', line 6

# Lists files and directories below +path+ using Dir.glob.
#
# @param path [String, #to_s] directory to scan (defaults to the working directory)
# @param filter [String, Array<String>, nil] glob pattern(s) relative to
#   +path+, e.g. '*.rb'; nil matches everything
# @param recursive [Boolean] when truthy, sub-directories are searched too
# @return [Array<String>] the matching paths
def self.scan_dir(path = Dir.pwd, filter: nil, recursive: false)
  prefix = path.to_s + (recursive ? '/**/' : '/')
  patterns =
    if filter.nil?
      prefix + '*'
    else
      [filter].flatten.map { |pattern| prefix + pattern.to_s }
    end
  Dir.glob(patterns)
end

.scan_dirs(path, filter: nil, recursive: false, mode: :path) ⇒ `Object`

Uses BBLib.scan_dir but returns only directories. Mode can be used to return strings (:path) or Dir objects (:dir)



21
22
23

# File 'lib/file/bbfile.rb', line 21

# Runs BBLib.scan_dir and keeps only the entries that are directories.
#
# @param path [String] directory to scan
# @param filter [String, Array<String>, nil] forwarded to BBLib.scan_dir
# @param recursive [Boolean] forwarded to BBLib.scan_dir
# @param mode [Symbol] :dir returns Dir objects, anything else returns paths
# @return [Array<String>, Array<Dir>]
def self.scan_dirs(path, filter: nil, recursive: false, mode: :path)
  entries = BBLib.scan_dir(path, filter: filter, recursive: recursive)
  directories = entries.select { |entry| File.directory?(entry) }
  return directories unless mode == :dir
  directories.map { |directory| Dir.new(directory) }
end

.scan_files(path, filter: nil, recursive: false, mode: :path) ⇒ `Object`

Uses BBLib.scan_dir but returns only files. Mode can be used to return strings (:path) or File objects (:file)



16
17
18

# File 'lib/file/bbfile.rb', line 16

# Runs BBLib.scan_dir and keeps only the entries that are regular files.
#
# @param path [String] directory to scan
# @param filter [String, Array<String>, nil] forwarded to BBLib.scan_dir
# @param recursive [Boolean] forwarded to BBLib.scan_dir
# @param mode [Symbol] :file returns File objects opened with File.new (the
#   caller has to close them), anything else returns paths
# @return [Array<String>, Array<File>]
def self.scan_files(path, filter: nil, recursive: false, mode: :path)
  entries = BBLib.scan_dir(path, filter: filter, recursive: recursive)
  files = entries.select { |entry| File.file?(entry) }
  return files unless mode == :file
  files.map { |file| File.new(file) }
end

.string_to_file(path, str, mkpath = true) ⇒ `Object`

Shorthand method to write a string to disk. By default the path is created if it doesn’t exist.

# File 'lib/file/bbfile.rb', line 26

# Writes +str+ to the file at +path+, replacing any existing content.
#
# Uses Dir.exist? because the Dir.exists? alias was removed in Ruby 3.2, and
# checks the file's parent directory rather than the file path itself.
#
# @param path [String] path of the file to write
# @param str [#to_s] content; converted with to_s before writing
# @param mkpath [Boolean] when truthy, missing parent directories are created
# @return [Integer] the value returned by File.write (number of bytes written)
def self.string_to_file(path, str, mkpath = true)
  directory = File.dirname(path)
  FileUtils.mkpath(directory) if mkpath && !Dir.exist?(directory)
  File.write(path, str.to_s)
end

.string_to_roman(str) ⇒ `Object`

# File 'lib/string/roman.rb', line 19

# Replaces whole numbers in a string with roman numerals.
#
# The string is split on whitespace. A word is converted when, with symbols
# removed, it is a plain integer and it does not look like a decimal
# ("1.5"). The first run of digits in the word is replaced by the numeral for
# that run, so surrounding punctuation is kept ("(5)" -> "(V)"). The earlier
# code took the value from the raw word with to_i, which is 0 whenever the
# word starts with a symbol and erased the number instead.
#
# Values are converted by BBLib.to_roman. Words are re-joined with a single
# space.
#
# @param str [String] text that may contain numbers
# @return [String] new string with the numbers replaced
def self.string_to_roman(str)
  converted = str.split(' ').map do |word|
    cleaned = BBLib.drop_symbols(word)
    whole_number = cleaned.to_i.to_s == cleaned && word !~ /\d+\.\d+/
    next word unless whole_number
    digits = word[/\d+/]
    word.sub(digits, BBLib.to_roman(digits.to_i))
  end
  converted.join(' ')
end

.to_duration(num, input: :sec, stop: :milli, style: :medium) ⇒ `Object`

Turns a numeric input into a time string.

# File 'lib/time/bbtime.rb', line 20

# Turns a number into a human-readable duration such as "1 hr 30 mins".
#
# The input is converted to seconds and then broken down from the largest
# unit in TIME_EXPS to the smallest, stopping after the unit named by +stop+.
# Units with a count of zero are left out.
#
# Changes from the earlier version:
# * non-numeric input returns nil instead of raising (the guard combined its
#   two checks with ||, so "num > 0" ran on non-numbers);
# * units are walked with reverse_each, because Hash has no core #reverse;
# * a unit is emitted when its count is at least 1, so 60 seconds renders as
#   "1 min" rather than "60 secs".
#
# @param num [Numeric] the amount of time
# @param input [Symbol, String] key of TIME_EXPS naming the unit of +num+
# @param stop [Symbol] key of TIME_EXPS naming the smallest unit to output
# @param style [Symbol] :full, :medium or :short; other values mean :medium
# @return [String, nil] the formatted duration (empty when nothing reaches a
#   count of 1), or nil when +num+ is not Numeric
def self.to_duration(num, input: :sec, stop: :milli, style: :medium)
  return nil unless Numeric === num
  style = :medium unless [:full, :medium, :short].include?(style)
  remaining = num * TIME_EXPS[input.to_sym][:mult]
  parts = []
  TIME_EXPS.reverse_each do |unit, spec|
    count = (remaining / spec[:mult]).floor
    if count >= 1
      # The :short style never gets a plural "s".
      plural = count > 1 && style != :short ? 's' : nil
      parts << "#{count}#{spec[:styles][style]}#{plural}"
      remaining -= count * spec[:mult]
    end
    break if unit == stop
  end
  parts.join(' ')
end

.to_roman(num) ⇒ `Object`

Converts any integer up to 1000 to a roman numeral string

# File 'lib/string/roman.rb', line 5

# Converts a number up to 1000 into a roman numeral.
#
# @param num [Integer] number to convert
# @return [String] the numeral; numbers above 1000 are returned as their
#   decimal string, numbers below 1 produce an empty string
def self.to_roman(num)
  return num.to_s if num > 1000
  symbols = [
    [1000, 'M'], [900, 'CM'], [500, 'D'], [400, 'CD'], [100, 'C'], [90, 'XC'], [50, 'L'],
    [40, 'XL'], [10, 'X'], [9, 'IX'], [5, 'V'], [4, 'IV'], [3, 'III'], [2, 'II'], [1, 'I']
  ]
  remaining = num
  # Largest values first: each symbol is appended for as long as it still fits.
  symbols.each_with_object(''.dup) do |(value, glyph), numeral|
    while remaining >= value
      remaining -= value
      numeral << glyph
    end
  end
end

Module: BBLib

Overview

Defined Under Namespace

Constant Summary collapse

Class Method Summary collapse

Class Method Details

.composition_similarity(a, b) ⇒ Object

.drop_symbols(str) ⇒ Object

.extract_floats(str, convert: true) ⇒ Object

.extract_integers(str, convert: true) ⇒ Object

.extract_numbers(str, convert: true) ⇒ Object

.from_roman(str) ⇒ Object

.keep_between(num, min, max) ⇒ Object

.levenshtein_distance(a, b) ⇒ Object

.levenshtein_similarity(a, b) ⇒ Object

.move_articles(str, position = :front, capitalize: true) ⇒ Object

.numeric_similarity(a, b) ⇒ Object

.parse_duration(str, output: :sec) ⇒ Object

.parse_file_size(str, output: :byte) ⇒ Object

.phrase_similarity(a, b) ⇒ Object

.qwerty_distance(a, b) ⇒ Object

.scan_dir(path = Dir.pwd, filter: nil, recursive: false) ⇒ Object

.scan_dirs(path, filter: nil, recursive: false, mode: :path) ⇒ Object

.scan_files(path, filter: nil, recursive: false, mode: :path) ⇒ Object

.string_to_file(path, str, mkpath = true) ⇒ Object

.string_to_roman(str) ⇒ Object

.to_duration(num, input: :sec, stop: :milli, style: :medium) ⇒ Object

.to_roman(num) ⇒ Object