Module: BBLib
- Defined in:
- lib/string/matching.rb,
lib/bblib.rb,
lib/file/bbfile.rb,
lib/math/bbmath.rb,
lib/time/bbtime.rb,
lib/string/roman.rb,
lib/bblib/version.rb,
lib/string/bbstring.rb,
lib/string/fuzzy_matcher.rb
Overview
String Comparison Algorithms
Defined Under Namespace
Classes: FuzzyMatcher
Constant Summary collapse
- CONFIGS_PATH =
'config/'
- FILE_SIZES =
{ byte: { mult: 1, exp: ['b', 'byt', 'byte'] }, kilobyte: { mult: 1024, exp: ['kb', 'kilo', 'k', 'kbyte', 'kilobyte'] }, megabyte: { mult: 1048576, exp: ['mb', 'mega', 'm', 'mib', 'mbyte', 'megabyte'] }, gigabyte: { mult: 1073741824, exp: ['gb', 'giga', 'g', 'gbyte', 'gigabyte'] }, terabyte: { mult: 1099511627776, exp: ['tb', 'tera', 't', 'tbyte', 'terabyte'] }, petabyte: { mult: 1125899906842624, exp: ['pb', 'peta', 'p', 'pbyte', 'petabyte'] }, exabyte: { mult: 1152921504606846976, exp: ['eb', 'exa', 'e', 'ebyte', 'exabyte'] }, zettabyte: { mult: 1180591620717411303424, exp: ['zb', 'zetta', 'z', 'zbyte', 'zettabyte'] }, yottabyte: { mult: 1208925819614629174706176, exp: ['yb', 'yotta', 'y', 'ybyte', 'yottabyte'] } }
# Time units used by the duration helpers. For every unit:
#   :mult   - length of the unit in seconds
#   :styles - labels appended to a count when rendering a duration
#   :exp    - spellings recognised when parsing a duration out of a string
TIME_EXPS = {
  milli: {
    mult: 0.001,
    styles: { full: ' millisecond', medium: ' milli', short: 'ms' },
    exp: %w[ms mil mils milli millis millisecond milliseconds milsec milsecs msec msecs msecond mseconds]
  },
  sec: {
    mult: 1,
    styles: { full: ' second', medium: ' sec', short: 's' },
    exp: %w[s sec secs second seconds]
  },
  min: {
    mult: 60,
    styles: { full: ' minute', medium: ' min', short: 'm' },
    exp: %w[m mn mns min mins minute minutes]
  },
  hour: {
    mult: 3600,
    styles: { full: ' hour', medium: ' hr', short: 'h' },
    exp: %w[h hr hrs hour hours]
  },
  day: {
    mult: 86400,
    styles: { full: ' day', medium: ' day', short: 'd' },
    # 'day' and 'days' must be separate entries; adjacent string literals
    # without a comma are concatenated by Ruby into 'daydays'.
    exp: %w[d day days]
  },
  week: {
    mult: 604800,
    styles: { full: ' week', medium: ' wk', short: 'w' },
    exp: %w[w wk wks week weeks]
  },
  month: {
    mult: 2592000,
    styles: { full: ' month', medium: ' mo', short: 'mo' },
    exp: %w[mo mon mons month months mnth mnths mth mths]
  },
  year: {
    mult: 31536000,
    styles: { full: ' year', medium: ' yr', short: 'y' },
    exp: %w[y yr yrs year years]
  }
}
- VERSION =
"0.1.1"
Class Method Summary collapse
-
.composition_similarity(a, b) ⇒ Object
Calculates a percentage based match of two strings based on their character composition.
-
.drop_symbols(str) ⇒ Object
Quickly remove any symbols from a string leaving only alpha-numeric characters and white space.
-
.extract_floats(str, convert: true) ⇒ Object
Extracts all integers or decimals from a string into an array.
-
.extract_integers(str, convert: true) ⇒ Object
Extract all integers from a string.
-
.extract_numbers(str, convert: true) ⇒ Object
Alias for extract_floats.
- .from_roman(str) ⇒ Object
-
.keep_between(num, min, max) ⇒ Object
Used to keep any numeric number between a set of bounds.
-
.levenshtein_distance(a, b) ⇒ Object
A simple rendition of the levenshtein distance algorithm.
-
.levenshtein_similarity(a, b) ⇒ Object
Calculates a percentage based match using the levenshtein distance algorithm.
-
.move_articles(str, position = :front, capitalize: true) ⇒ Object
Used to move the position of the articles ‘the’, ‘a’ and ‘an’ in strings for normalization.
-
.numeric_similarity(a, b) ⇒ Object
Extracts all numbers from two strings and compares them and generates a percentage of match.
-
.parse_duration(str, output: :sec) ⇒ Object
Parses known time based patterns out of a string to construct a numeric duration.
-
.parse_file_size(str, output: :byte) ⇒ Object
A file size parser for strings.
-
.phrase_similarity(a, b) ⇒ Object
Calculates a percentage based match between two strings based on the similarity of word matches.
-
.qwerty_distance(a, b) ⇒ Object
A simple character distance calculator that uses qwerty key positions to determine how similar two strings are.
-
.scan_dir(path = Dir.pwd, filter: nil, recursive: false) ⇒ Object
Scan for files and directories.
-
.scan_dirs(path, filter: nil, recursive: false, mode: :path) ⇒ Object
Uses BBLib.scan_dir but returns only directories.
-
.scan_files(path, filter: nil, recursive: false, mode: :path) ⇒ Object
Uses BBLib.scan_dir but returns only files.
-
.string_to_file(path, str, mkpath = true) ⇒ Object
Shorthand method to write a string to disk.
- .string_to_roman(str) ⇒ Object
-
.to_duration(num, input: :sec, stop: :milli, style: :medium) ⇒ Object
Turns a numeric input into a time string.
-
.to_roman(num) ⇒ Object
Converts any integer up to 1000 to a roman numeral string.
Class Method Details
.composition_similarity(a, b) ⇒ Object
Calculates a percentage based match of two strings based on their character composition.
27 28 29 30 31 32 33 34 35 36 37 |
# File 'lib/string/matching.rb', line 27
# Calculates a percentage based match of two strings based on their
# character composition: every character of the longer string is paired with
# at most one identical character of the shorter string, regardless of order.
#
# Neither argument is modified. Two empty strings are treated as identical.
#
# @param a [String]
# @param b [String]
# @return [Float] matched characters divided by the longer length, times 100
def self.composition_similarity(a, b)
  # On equal lengths b is treated as the "longer" string.
  longer, shorter = a.length > b.length ? [a, b] : [b, a]
  return 100.0 if longer.empty?

  # Work on a private list of characters so the caller's string is untouched.
  remaining = shorter.chars
  matches = longer.chars.count do |char|
    index = remaining.index(char)
    remaining.delete_at(index) if index
  end
  (matches / longer.length.to_f) * 100.0
end
.drop_symbols(str) ⇒ Object
Quickly remove any symbols from a string leaving only alpha-numeric characters and white space.
13 14 15 |
# File 'lib/string/bbstring.rb', line 13
# Returns a copy of str with underscores and every character that is not a
# word character or whitespace removed.
def self.drop_symbols(str)
  symbols = /[^\w\s\d]|_/
  str.gsub(symbols, '')
end
.extract_floats(str, convert: true) ⇒ Object
Extracts all integers or decimals from a string into an array.
23 24 25 |
# File 'lib/string/bbstring.rb', line 23
# Collects every integer or decimal number found in str, in order of
# appearance. With convert: true the matches are turned into Floats,
# otherwise the matched substrings are returned.
def self.extract_floats(str, convert: true)
  found = str.scan(/\d+\.?\d+|\d+/)
  return found unless convert

  found.map(&:to_f)
end
.extract_integers(str, convert: true) ⇒ Object
Extract all integers from a string. Use extract_floats if numbers may contain decimal places.
18 19 20 |
# File 'lib/string/bbstring.rb', line 18
# Collects every run of digits found in str. With convert: true each run is
# turned into an Integer, otherwise the digit strings are returned. Decimal
# points are not understood here; "1.5" yields two separate runs.
def self.extract_integers(str, convert: true)
  digit_runs = str.scan(/\d+/)
  return digit_runs unless convert

  digit_runs.map(&:to_i)
end
.extract_numbers(str, convert: true) ⇒ Object
Alias for extract_floats
28 29 30 |
# File 'lib/string/bbstring.rb', line 28
# Alias for extract_floats; forwards both arguments unchanged.
def self.extract_numbers(str, convert: true)
  BBLib.extract_floats(str, convert: convert)
end
.from_roman(str) ⇒ Object
32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
# File 'lib/string/roman.rb', line 32
# Replaces every whitespace-separated word of str that is a roman numeral
# for 1..1000 (ignoring case and surrounding symbols) with its decimal value.
# Surrounding punctuation is kept ("IV." becomes "4."). Words are re-joined
# with single spaces.
#
# @param str [String]
# @return [String] a new string; str itself is not modified
def self.from_roman(str)
  # Build the numeral => value table once instead of one regex per number.
  # Starts at 1: zero has no numeral (its rendering is the empty string,
  # which would otherwise match words made only of symbols).
  numerals = (1..1000).each_with_object({}) do |n, table|
    table[BBLib.to_roman(n)] = n
  end

  converted = str.split(' ').map do |word|
    numeral = BBLib.drop_symbols(word).upcase
    value = numerals[numeral]
    # Case-insensitive substitution so that "iv" is replaced as well as "IV".
    value ? word.sub(/#{numeral}/i, value.to_s) : word
  end
  converted.join(' ')
end
.keep_between(num, min, max) ⇒ Object
Used to keep any numeric number between a set of bounds. Passing nil as min or max represents no bounds in that direction. min and max are inclusive to the allowed bounds.
5 6 7 8 9 10 |
# File 'lib/math/bbmath.rb', line 5
# Clamps num into the inclusive range min..max. A nil bound means "no limit"
# on that side. Raises a RuntimeError when num is not Numeric.
def self.keep_between(num, min, max)
  unless num.is_a?(Numeric)
    raise "Argument must be numeric: #{num} (#{num.class})"
  end

  bounded = num
  bounded = min if !min.nil? && bounded < min
  bounded = max if !max.nil? && bounded > max
  bounded
end
.levenshtein_distance(a, b) ⇒ Object
A simple rendition of the levenshtein distance algorithm
8 9 10 11 12 13 14 15 16 17 |
# File 'lib/string/matching.rb', line 8 def self.levenshtein_distance a, b costs = (0..b.length).to_a (1..a.length).each do |i| costs[0], nw = i, i - 1 (1..b.length).each do |j| costs[j], nw = [costs[j] + 1, costs[j-1] + 1, a[i-1] == b[j-1] ? nw : nw + 1].min, costs[j] end end costs[b.length] end |
.levenshtein_similarity(a, b) ⇒ Object
Calculates a percentage based match using the levenshtein distance algorithm
20 21 22 23 24 |
# File 'lib/string/matching.rb', line 20
# Calculates a percentage based match using the levenshtein distance
# algorithm: (longer length - distance) / longer length * 100.
#
# Two empty strings are reported as a 100.0 match instead of dividing
# zero by zero.
#
# @param a [String]
# @param b [String]
# @return [Float]
def self.levenshtein_similarity(a, b)
  max = [a.length, b.length].max.to_f
  return 100.0 if max.zero?

  distance = BBLib.levenshtein_distance(a, b)
  ((max - distance.to_f) / max) * 100.0
end
.move_articles(str, position = :front, capitalize: true) ⇒ Object
Used to move the position of the articles ‘the’, ‘a’ and ‘an’ in strings for normalization.
33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
# File 'lib/string/bbstring.rb', line 33
# Used to move the position of the articles 'the', 'a' and 'an' in strings
# for normalization.
#
# @param str [String] the text to normalize; never modified
# @param position [Symbol] :front ("The Matrix"), :back ("Matrix, The") or
#   :none (article removed). Any other value returns str unchanged.
# @param capitalize [Boolean] whether a moved article is capitalized
# @return [String]
def self.move_articles(str, position = :front, capitalize: true)
  return str unless %i[front back none].include?(position)

  %w[the a an].each do |article|
    lowered = str.downcase
    starts = lowered.start_with?("#{article} ")
    ends = lowered.end_with?(" #{article}")
    label = capitalize ? article.capitalize : article

    if starts && position != :front
      rest = str[(article.length + 1)..str.length]
      # Only append the article when it is not already at the back.
      str = (position == :back && !ends) ? "#{rest}, #{label}" : rest
    end

    if ends && position != :back
      rest = str[0..-(article.length + 2)]
      # Only prepend the article when it is not already at the front.
      str = (position == :front && !starts) ? "#{label} #{rest}" : rest
    end
  end

  # Drop commas left dangling at the end. Done without strip!/chop! so that
  # the caller's string is never modified when no article was moved.
  str = str.strip.chop while str.strip.end_with?(',')
  str
end
.numeric_similarity(a, b) ⇒ Object
Extracts all numbers from two strings and compares them and generates a percentage of match. Percentage calculations here need to be weighted better…TODO
54 55 56 57 58 59 60 61 62 |
# File 'lib/string/matching.rb', line 54
# Extracts the numbers of both strings, compares them position by position
# and returns the average closeness as a percentage. Each pair scores
# 1 / (difference + 1); a missing number counts as 0.0. Returns 100.0 when
# neither string contains a number.
def self.numeric_similarity(a, b)
  left = a.extract_numbers
  right = b.extract_numbers
  return 100.0 if left.empty? && right.empty?

  pair_count = [left.size, right.size].max
  scores = (0...pair_count).map do |idx|
    x = left[idx].to_f
    y = right[idx].to_f
    1.0 / ([x, y].max - [x, y].min + 1.0)
  end
  (scores.inject { |sum, m| sum + m } / scores.size.to_f) * 100.0
end
.parse_duration(str, output: :sec) ⇒ Object
Parses known time based patterns out of a string to construct a numeric duration.
6 7 8 9 10 11 12 13 14 15 16 17 |
# File 'lib/time/bbtime.rb', line 6
# Parses known time based patterns out of a string to construct a numeric
# duration. Every "<number><optional spaces><unit>" occurrence, for every
# spelling listed in TIME_EXPS, is added up. Matching is case-insensitive
# and decimals are honoured ("1.5 hours" is 5400 seconds).
#
# @param str [String] text such as "1h30m" or "2.5 days"
# @param output [Symbol] key of TIME_EXPS naming the unit of the result;
#   an unknown key leaves the result in seconds
# @return [Float]
def self.parse_duration(str, output: :sec)
  text = str.downcase
  secs = 0.0
  TIME_EXPS.each_value do |unit|
    unit[:exp].each do |exp|
      # \d* (not \d?) so numbers with several digits before the decimal point
      # are captured whole. The unit must be followed by a non-word
      # character, a digit or the end of the string so that "m" does not
      # match the start of "ms" or "min".
      text.scan(/\d*\.?\d+[[:space:]]*#{exp}(?=\W|\d|\z)/i).each do |match|
        # String#to_f reads the leading number and ignores the unit suffix.
        secs += match.to_f * unit[:mult]
      end
    end
  end
  secs / (TIME_EXPS.dig(output, :mult) || 1).to_f
end
.parse_file_size(str, output: :byte) ⇒ Object
A file size parser for strings. Extracts any known patterns for file sizes.
34 35 36 37 38 39 40 41 42 43 44 |
# File 'lib/file/bbfile.rb', line 34
# A file size parser for strings. Extracts any known patterns for file sizes
# ("<number><optional spaces><unit>", optionally followed by "s") using the
# spellings in FILE_SIZES and adds them up.
#
# @param str [String] text such as "1.5 GB" or "512 bytes"
# @param output [Symbol, String] unit of the result; either a FILE_SIZES key
#   or one of its spellings. Unknown values fall back to :byte.
# @return [Float]
def self.parse_file_size(str, output: :byte)
  wanted = output.to_s.downcase
  unit = FILE_SIZES.keys.find { |key| key == output || FILE_SIZES[key][:exp].include?(wanted) } || :byte

  bytes = 0.0
  FILE_SIZES.each_value do |size|
    size[:exp].each do |exp|
      # \d* (not \d?) so numbers with several digits before the decimal point
      # are captured whole, e.g. "1024.5 kb".
      str.scan(/\d*\.?\d+[[:space:]]*#{exp}s?(?=\W|\d|\z)/i).each do |match|
        bytes += match.to_f * size[:mult]
      end
    end
  end
  bytes / FILE_SIZES[unit][:mult]
end
.phrase_similarity(a, b) ⇒ Object
Calculates a percentage based match between two strings based on the similarity of word matches.
40 41 42 43 44 45 46 47 48 49 50 |
# File 'lib/string/matching.rb', line 40
# Calculates a percentage based match between two strings based on how many
# words of a (symbols dropped) can be paired with an identical word of b.
# Each word of b can be used once. The result is relative to the larger
# word count of the two original strings.
def self.phrase_similarity(a, b)
  pool = b.drop_symbols.split(' ')
  hits = 0
  a.drop_symbols.split(' ').each do |word|
    position = pool.find_index(word)
    next if position.nil?

    hits += 1
    pool.delete_at(position)
  end
  word_total = [a.split(' ').size, b.split(' ').size].max.to_f
  (hits.to_f / word_total) * 100.0
end
.qwerty_distance(a, b) ⇒ Object
A simple character distance calculator that uses qwerty key positions to determine how similar two strings are. May be useful for typo detection.
66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
# File 'lib/string/matching.rb', line 66
# A simple character distance calculator that uses qwerty key positions to
# determine how similar two strings are. May be useful for typo detection.
#
# Both strings are downcased and stripped, then compared position by
# position. Each pair of keys costs its row difference plus its column
# difference. A position that exists in only one string costs 10, and so
# does a pair of differing characters when either is not on the key table
# (spaces, punctuation, ...); identical characters always cost 0.
#
# @param a [String]
# @param b [String]
# @return [Integer] 0 for identical input, growing with dissimilarity
def self.qwerty_distance(a, b)
  a = a.downcase.strip
  b = b.downcase.strip
  a, b = b, a if a.length <= b.length

  # Map every key to its [row, column] on the keyboard.
  positions = {}
  %w[1234567890 qwertyuiop asdfghjkl zxcvbnm].each_with_index do |row, row_index|
    row.each_char.with_index { |key, column| positions[key] = [row_index, column] }
  end

  other = b.chars
  a.chars.each_with_index.sum do |char, index|
    counterpart = other[index]
    next 10 if counterpart.nil?
    next 0 if char == counterpart

    from = positions[char]
    to = positions[counterpart]
    next 10 unless from && to

    (from[0] - to[0]).abs + (from[1] - to[1]).abs
  end
end
.scan_dir(path = Dir.pwd, filter: nil, recursive: false) ⇒ Object
Scan for files and directories. Can be set to be recursive and can also have filters applied.
6 7 8 9 10 11 12 13 |
# File 'lib/file/bbfile.rb', line 6
# Scan for files and directories below path using Dir.glob. filter may be a
# single glob pattern or an array of them; without a filter everything is
# matched. recursive: true inserts '/**/' between path and pattern.
def self.scan_dir(path = Dir.pwd, filter: nil, recursive: false)
  joiner = recursive ? '/**/' : '/'
  patterns =
    if filter.nil?
      "#{path}#{joiner}*"
    else
      [filter].flatten.map { |f| "#{path}#{joiner}#{f}" }
    end
  Dir.glob(patterns)
end
.scan_dirs(path, filter: nil, recursive: false, mode: :path) ⇒ Object
Uses BBLib.scan_dir but returns only directories. Mode can be used to return strings (:path) or Dir objects (:dir)
21 22 23 |
# File 'lib/file/bbfile.rb', line 21
# Uses BBLib.scan_dir but keeps only the entries that are directories.
# mode: :dir returns Dir objects; any other mode returns the path strings.
def self.scan_dirs(path, filter: nil, recursive: false, mode: :path)
  entries = BBLib.scan_dir(path, filter: filter, recursive: recursive)
  directories = entries.select { |entry| File.directory?(entry) }
  directories.map { |entry| mode == :dir ? Dir.new(entry) : entry }
end
.scan_files(path, filter: nil, recursive: false, mode: :path) ⇒ Object
Uses BBLib.scan_dir but returns only files. Mode can be used to return strings (:path) or File objects (:file)
16 17 18 |
# File 'lib/file/bbfile.rb', line 16
# Uses BBLib.scan_dir but keeps only the entries that are regular files.
# mode: :file returns opened File objects; any other mode returns the path
# strings.
def self.scan_files(path, filter: nil, recursive: false, mode: :path)
  entries = BBLib.scan_dir(path, filter: filter, recursive: recursive)
  files = entries.select { |entry| File.file?(entry) }
  files.map { |entry| mode == :file ? File.new(entry) : entry }
end
.string_to_file(path, str, mkpath = true) ⇒ Object
Shorthand method to write a string to disk. By default the path is created if it doesn’t exist.
26 27 28 29 30 31 |
# File 'lib/file/bbfile.rb', line 26
# Shorthand method to write a string to disk.
#
# @param path [String] file to write
# @param str [#to_s] content; converted with to_s before writing
# @param mkpath [Boolean] when true (default) the parent directory of path
#   is created first if it does not exist
# @return [Integer] number of bytes written, as returned by File.write
def self.string_to_file(path, str, mkpath = true)
  # FileUtils.mkpath does nothing for an existing directory, so no existence
  # check is needed (Dir.exists? is gone from current Ruby versions anyway).
  FileUtils.mkpath(File.dirname(path)) if mkpath
  File.write(path, str.to_s)
end
.string_to_roman(str) ⇒ Object
19 20 21 22 23 24 25 26 27 28 29 |
# File 'lib/string/roman.rb', line 19
# Replaces whole-number words of str with roman numerals. A word qualifies
# when, with symbols dropped, it is a plain integer and it is not a decimal
# such as "3.14". The first run of digits in the word is replaced by the
# numeral for that same run, so surrounding punctuation survives
# ("(5)" becomes "(V)"). Zero has no numeral and is left as it is. Words
# are re-joined with single spaces.
#
# @param str [String]
# @return [String] a new string; str itself is not modified
def self.string_to_roman(str)
  words = str.split(' ').map do |word|
    plain = BBLib.drop_symbols(word)
    digits = word[/\d+/]
    next word if digits.nil? || plain.to_i.to_s != plain || word =~ /\d+\.\d+/

    # Take the value from the digits being replaced; word.to_i would yield 0
    # for words that start with punctuation and erase the number.
    value = digits.to_i
    next word if value.zero?

    word.sub(digits, BBLib.to_roman(value))
  end
  words.join(' ')
end
.to_duration(num, input: :sec, stop: :milli, style: :medium) ⇒ Object
Turns a numeric input into a time string.
20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
# File 'lib/time/bbtime.rb', line 20
# Turns a numeric input into a time string such as "1 hr 30 mins".
#
# @param num [Numeric] the amount of time
# @param input [Symbol] TIME_EXPS key naming the unit num is expressed in
# @param stop [Symbol] smallest TIME_EXPS unit to render; whatever remains
#   below it is dropped
# @param style [Symbol] :full, :medium or :short label style; anything else
#   falls back to :medium
# @return [String, nil] nil when num is not Numeric; an empty string when
#   nothing reaches a whole unit (including zero or negative input)
def self.to_duration(num, input: :sec, stop: :milli, style: :medium)
  return nil unless num.is_a?(Numeric)

  style = :medium unless %i[full medium short].include?(style)
  remaining = num * TIME_EXPS[input.to_sym][:mult]
  parts = []

  # Largest unit first. Hash has no #reverse, so walk it with reverse_each.
  TIME_EXPS.reverse_each do |unit, spec|
    count = (remaining / spec[:mult]).floor
    # >= 1 so that exact amounts (3600 seconds) render as "1 hr" rather than
    # falling through to the next smaller unit.
    if count >= 1
      plural = count > 1 && style != :short ? 's' : ''
      parts << "#{count}#{spec[:styles][style]}#{plural}"
      remaining -= count * spec[:mult]
    end
    break if unit == stop
  end
  parts.join(' ')
end
.to_roman(num) ⇒ Object
Converts any integer up to 1000 to a roman numeral string
5 6 7 8 9 10 11 12 13 14 15 16 17 |
# File 'lib/string/roman.rb', line 5
# Builds the roman numeral for num by repeatedly taking the largest value
# from the table that still fits. Numbers above 1000 are returned as their
# decimal string; numbers below 1 produce an empty string.
def self.to_roman(num)
  return num.to_s if num > 1000

  table = { 1000 => 'M', 900 => 'CM', 500 => 'D', 400 => 'CD', 100 => 'C', 90 => 'XC',
            50 => 'L', 40 => 'XL', 10 => 'X', 9 => 'IX', 5 => 'V', 4 => 'IV',
            3 => 'III', 2 => 'II', 1 => 'I' }
  remaining = num
  symbols = []
  table.each_pair do |value, symbol|
    while remaining >= value
      remaining -= value
      symbols << symbol
    end
  end
  symbols.join
end