Class: VectorEmbed

Inherits:
Object
  • Object
show all
Defined in:
lib/vector_embed.rb,
lib/vector_embed/maker.rb,
lib/vector_embed/version.rb,
lib/vector_embed/stop_word.rb,
lib/vector_embed/maker/date.rb,
lib/vector_embed/maker/ngram.rb,
lib/vector_embed/maker/number.rb,
lib/vector_embed/maker/phrase.rb,
lib/vector_embed/maker/boolean.rb

Defined Under Namespace

Classes: Maker, StopWord

Constant Summary collapse

JUST_A_NUMBER =
/\A\s*[+\-]?(?:0|[1-9]\d*)(?:\.\d*)?(?:[eE][+\-]?\d+)?\s*\z/
BLANK =
/\A\s*\z/
NULL =
/\Anull\z/i
SLASH_N =
'\N'
TRUE =
/\Atrue\z/i
T =
/\At\z/i
FALSE =
/\Afalse\z/i
F =
/\Af\z/i
NULL_BYTE =
"\x00"
LABEL_MAKERS =
[Maker::Boolean, Maker::Number]
FEATURE_MAKERS =
[Maker::Boolean, Maker::Date, Maker::Number, Maker::Ngram, Maker::Phrase]
VERSION =
'0.3.3'

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ VectorEmbed

Returns a new instance of VectorEmbed.



28
29
30
31
32
33
34
35
36
# File 'lib/vector_embed.rb', line 28

# Sets up a new embedder from an options hash.
#
# The caller's hash is copied, so removing :dict below does not mutate it.
#
# @param options [Hash] configuration; the keys read here are
#   :logger (used as-is when truthy) and :dict (copied into @dict when truthy)
def initialize(options = {})
  @options = options.dup
  @mutex = Mutex.new
  @feature_makers = {}

  # Fall back to an INFO-level logger on $stderr when none was supplied.
  @logger = options[:logger] || begin
    fallback = Logger.new($stderr)
    fallback.level = Logger::INFO
    fallback
  end

  # :dict is taken out of the stored options; @dict stays unset without it.
  supplied_dict = @options.delete(:dict)
  @dict = supplied_dict.dup if supplied_dict
end

Instance Attribute Details

#dict ⇒ Object (readonly)

Returns the value of attribute dict.



26
27
28
# File 'lib/vector_embed.rb', line 26

# Reader for @dict.
#
# In the constructor shown on this page, @dict is assigned (a copy of the
# :dict option) only when that option is truthy, so this returns nil
# otherwise. #index checks this value to decide between its dictionary
# lookup and its hash-based path.
def dict
  @dict
end

#logger ⇒ Object

Returns the value of attribute logger.



25
26
27
# File 'lib/vector_embed.rb', line 25

# Reader for @logger.
#
# The constructor sets @logger to the :logger option when one is given,
# otherwise to a Logger writing to $stderr at INFO level.
def logger
  @logger
end

#options ⇒ Object (readonly)

Returns the value of attribute options.



24
25
26
# File 'lib/vector_embed.rb', line 24

# Reader for @options.
#
# @options is the constructor's copy of the options hash it was given, with
# the :dict entry removed (the dictionary is kept separately in @dict).
def options
  @options
end

Instance Method Details

#index(parts) ⇒ Object



61
62
63
64
65
66
67
68
69
70
71
# File 'lib/vector_embed.rb', line 61

# Maps a list of key parts to an integer feature index.
#
# The parts are joined with NULL_BYTE to form a single key. Without a
# dictionary, the index is derived from the key's MurmurHash3 32-bit string
# hash: its decimal string is cut to the first seven characters and turned
# back into an integer. With a dictionary, the key's MD5 digest is looked up
# and, when missing, stored with the value dict.length + 1.
#
# @param parts [Array] components of the feature key
# @return [Integer] the index for the joined key
def index(parts)
  joined = parts.join(NULL_BYTE)
  return MurmurHash3::V32.str_hash(joined).to_s[0..6].to_i unless dict

  digest = Digest::MD5.digest(joined)
  known = dict[digest]
  return known if known

  # The ||= re-checks the key inside the lock, so an entry stored by another
  # thread after the lookup above is reused rather than overwritten.
  @mutex.synchronize { dict[digest] ||= dict.length + 1 }
end

#line(label, features = {}) ⇒ Object



38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/vector_embed.rb', line 38

# Renders one output line: the label's value followed by the feature pairs.
#
# Each feature is handed to feature_maker(key, value).pairs(value). An Array
# value is expanded element by element, using the feature name joined with
# the element's position (separated by NULL_BYTE) as the key. Nil pairs are
# dropped, the rest are sorted by their first element and written as
# "first:second", and everything is joined with single spaces.
#
# @param label [Object] passed to label_maker(label).value(label)
# @param features [Hash] feature name => value (or Array of values)
# @return [String] the label value and the rendered pairs, space-separated
def line(label, features = {})
  raw_pairs = features.each_with_object([]) do |(name, value), collected|
    case value
    when Array
      # Every element gets its own maker, keyed by name and position.
      value.each_with_index do |element, position|
        element_key = [name, position].join(NULL_BYTE)
        collected.concat(feature_maker(element_key, element).pairs(element))
      end
    else
      collected.concat(feature_maker(name, value).pairs(value))
    end
  end

  ordered = raw_pairs.compact.sort_by { |pair_key, _| pair_key }
  rendered = ordered.map { |pair| pair.join(':') }
  [label_maker(label).value(label), *rendered].join(' ')
end

#preprocess(v) ⇒ Object



57
58
59
# File 'lib/vector_embed.rb', line 57

# Passes +v+ through StopWord.remove together with this object's stop_words
# and returns whatever that call returns.
#
# @param v [Object] the value to filter
# @return [Object] the result of StopWord.remove(stop_words, v)
def preprocess(v)
  StopWord.remove(stop_words, v)
end