Class: VectorEmbed

Inherits:
Object
  • Object
show all
Defined in:
lib/vector_embed.rb,
lib/vector_embed/maker.rb,
lib/vector_embed/version.rb,
lib/vector_embed/stop_word.rb,
lib/vector_embed/maker/date.rb,
lib/vector_embed/maker/ngram.rb,
lib/vector_embed/maker/number.rb,
lib/vector_embed/maker/phrase.rb,
lib/vector_embed/maker/boolean.rb

Defined Under Namespace

Classes: Maker, StopWord

Constant Summary collapse

# Matches an optionally signed decimal number (integer part without leading
# zeros, optional fraction, optional exponent) surrounded by optional whitespace.
JUST_A_NUMBER =
/\A\s*[+\-]?(?:0|[1-9]\d*)(?:\.\d*)?(?:[eE][+\-]?\d+)?\s*\z/
# Matches a float written with no integer part, e.g. ".5".
UGLY_FLOAT =
/\A\.\d+\z/
# Matches an empty or whitespace-only string.
BLANK =
/\A\s*\z/
# Matches the word "null", case-insensitively.
NULL =
/\Anull\z/i
# The two-character string backslash + N (single-quoted, so not an escape).
# NOTE(review): presumably a database-style NULL marker - confirm against callers.
SLASH_N =
'\N'
# Matches the word "true", case-insensitively.
TRUE =
/\Atrue\z/i
# Matches the single letter "t", case-insensitively.
T =
/\At\z/i
# Matches the word "false", case-insensitively.
FALSE =
/\Afalse\z/i
# Matches the single letter "f", case-insensitively.
F =
/\Af\z/i
# Separator used when joining key parts into a single string (see #index and #line).
NULL_BYTE =
"\x00"
# Maker classes listed for labels.
# NOTE(review): how this list is consulted is not visible here - confirm in #label_maker.
LABEL_MAKERS =
[Maker::Boolean, Maker::Number]
# Maker classes listed for features.
# NOTE(review): how this list is consulted is not visible here - confirm in #feature_maker.
FEATURE_MAKERS =
[Maker::Boolean, Maker::Date, Maker::Number, Maker::Ngram, Maker::Phrase]
# Library version string.
VERSION =
'0.5.2'

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ VectorEmbed

Returns a new instance of VectorEmbed.



29
30
31
32
33
34
35
36
37
# File 'lib/vector_embed.rb', line 29

# Sets up a new VectorEmbed.
#
# @param options [Hash] configuration; a shallow copy is stored in @options.
#   :logger - logger to use. When absent (or falsy) a Logger writing to
#             $stderr is created, at DEBUG level if ENV['VERBOSE'] is
#             exactly 'true' and INFO otherwise.
#   :dict   - optional dictionary. It is removed from the stored options and
#             a dup of it is kept in @dict; @dict is left unset otherwise.
def initialize(options = {})
  @options = options.dup
  @mutex = Mutex.new
  @feature_makers = {}

  @logger = options[:logger]
  unless @logger
    @logger = Logger.new($stderr)
    @logger.level = ENV['VERBOSE'] == 'true' ? Logger::DEBUG : Logger::INFO
  end

  # :dict is taken out of the stored copy only; the caller's hash is untouched.
  supplied_dict = @options.delete(:dict)
  @dict = supplied_dict.dup if supplied_dict
end

Instance Attribute Details

#dict ⇒ Object (readonly)

Returns the value of attribute dict.



26
27
28
# File 'lib/vector_embed.rb', line 26

# Reader for the dictionary stored in @dict.
#
# @return [Object, nil] the value of @dict; nil when the constructor was not
#   given a :dict option (the constructor only assigns @dict in that case)
def dict
  @dict
end

#logger ⇒ Object

Returns the value of attribute logger.



25
26
27
# File 'lib/vector_embed.rb', line 25

# Reader for the logger stored in @logger.
#
# @return [Object] the value of @logger, which the constructor sets to the
#   :logger option or to a Logger writing to $stderr
def logger
  @logger
end

#options ⇒ Object (readonly)

Returns the value of attribute options.



27
28
29
# File 'lib/vector_embed.rb', line 27

# Reader for the options stored in @options.
#
# @return [Hash] the value of @options: the constructor's copy of the options
#   it was given, with the :dict entry removed
def options
  @options
end

Instance Method Details

#index(parts) ⇒ Object



62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# File 'lib/vector_embed.rb', line 62

# Maps a list of key parts to an integer feature index.
#
# The parts are joined with NULL_BYTE to form a signature. Without a dict,
# the index is the integer read from the first seven characters of the
# decimal string of the signature's MurmurHash3 32-bit hash. With a dict,
# the MD5 digest of the signature is looked up; an unseen digest is assigned
# dict.length + 1 and the cardinality of the feature maker registered under
# parts[0] is incremented.
#
# @param parts [Array] key parts; parts[0] is the feature maker key
# @return [Integer] the index for this combination of parts
def index(parts)
  signature = parts.join(NULL_BYTE)
  return MurmurHash3::V32.str_hash(signature).to_s[0..6].to_i unless dict

  digest = Digest::MD5.digest(signature)
  known = dict[digest]
  return known if known

  # Re-check under the mutex so that only one caller assigns a new index.
  @mutex.synchronize do
    dict[digest] ||= begin
      @feature_makers[parts[0]].cardinality += 1
      dict.length + 1
    end
  end
end

#line(label, features = {}) ⇒ Object



39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# File 'lib/vector_embed.rb', line 39

# Renders one output line: the label value followed by "index:value" pairs.
#
# Each feature is handed to its feature maker, whose #pairs result is
# collected. An Array value is expanded element by element, each element
# keyed by the feature name and its position joined with NULL_BYTE. Nil
# pairs are dropped, the rest are sorted by their first element, and each
# pair is joined with ':'.
#
# @param label [Object] passed to the label maker to produce the first token
# @param features [Hash] feature name => value (or Array of values)
# @return [String] the label value and the pairs, separated by single spaces
def line(label, features = {})
  collected = features.each_with_object([]) do |(name, value), acc|
    if value.is_a?(Array)
      value.each_with_index do |element, position|
        element_key = [name, position].join(NULL_BYTE)
        acc.concat(feature_maker(element_key, element).pairs(element))
      end
    else
      acc.concat(feature_maker(name, value).pairs(value))
    end
  end

  ordered = collected.compact.sort_by { |feature_index, _| feature_index }
  rendered = ordered.map { |pair| pair.join(':') }
  [label_maker(label).value(label), *rendered].join(' ')
end

#preprocess(v) ⇒ Object



58
59
60
# File 'lib/vector_embed.rb', line 58

# Passes a value through StopWord.remove using this instance's stop words.
#
# @param v [Object] the value to clean
# @return [Object] whatever StopWord.remove returns for (stop_words, v)
def preprocess(v)
  StopWord.remove(stop_words, v)
end

#stats_report ⇒ Object



78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# File 'lib/vector_embed.rb', line 78

# Builds a plain-text table of the registered feature makers.
#
# One row per entry in @feature_makers (feature key, maker class,
# cardinality) under a "Feature | Class | Cardinality" header and a dashed
# rule. The first two columns are left-justified and the cardinality column
# is right-justified. The summed cardinality follows, right-justified to the
# table width, and the string ends with a newline.
#
# @return [String] the formatted report
def stats_report
  rows = @feature_makers.map { |feature, maker| [feature, maker.class, maker.cardinality] }
  grand_total = rows.inject(0) { |acc, row| acc + row[2] }

  table = [%w{ Feature Class Cardinality }] + rows
  widths = (0..2).map do |column|
    table.map { |row| row[column].to_s.length }.max
  end

  lines = table.map do |feature, klass, cardinality|
    [
      feature.to_s.ljust(widths[0]),
      klass.to_s.ljust(widths[1]),
      cardinality.to_s.rjust(widths[2]),
    ].join(' | ')
  end

  header = lines.first
  full_width = header.length
  output = [header, '-' * full_width]
  output.concat(lines[1..-1])
  output << grand_total.to_s.rjust(full_width)
  output << ''   # yields the trailing newline after the join
  output.join("\n")
end