Class: VectorEmbed::Maker::Ngram

Inherits:
VectorEmbed::Maker show all
Defined in:
lib/vector_embed/maker/ngram.rb

Constant Summary collapse

IM_AN_NGRAM =

TODO make sure you can’t collide with these

'ngram'

Instance Attribute Summary collapse

Attributes inherited from VectorEmbed::Maker

#cardinality, #k, #options, #parent

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from VectorEmbed::Maker

pick

Constructor Details

#initialize(k, parent, options = nil) ⇒ Ngram

Returns a new instance of Ngram.

Raises:

(ArgumentError) — if the parent's :ngram_len option, converted with to_i, is not greater than 0

18
19
20
21
22
23
# File 'lib/vector_embed/maker/ngram.rb', line 18

# Builds an n-gram maker, taking its configuration from the parent's options.
#
# @param k the feature key; forwarded unchanged to the superclass constructor
# @param parent object responding to #options (a Hash-like read with
#   :ngram_len and :ngram_delim keys)
# @param options forwarded unchanged to the superclass constructor
# @raise [ArgumentError] if :ngram_len, converted with to_i, is not positive
def initialize(k, parent, options = nil)
  # Bare super forwards k, parent and options exactly as received.
  super
  settings = parent.options
  # to_i turns a missing (nil) or non-numeric value into 0, which the
  # guard below then rejects.
  @len = settings[:ngram_len].to_i
  if @len <= 0
    raise ArgumentError, ":ngram_len must be > 0"
  end
  @delim = settings[:ngram_delim]
end

Instance Attribute Details

#delim ⇒ Object (readonly)

Returns the value of attribute delim.



16
17
18
# File 'lib/vector_embed/maker/ngram.rb', line 16

# Read-only accessor for the n-gram delimiter.
#
# @return the value stored in @delim (the constructor assigns it from the
#   parent's :ngram_delim option; it is nil if that option was not given)
def delim
  @delim
end

#len ⇒ Object (readonly)

Returns the value of attribute len.



15
16
17
# File 'lib/vector_embed/maker/ngram.rb', line 15

# Read-only accessor for the n-gram length.
#
# @return the value stored in @len (the constructor assigns it from the
#   parent's :ngram_len option via to_i and rejects values that are not > 0)
def len
  @len
end

Class Method Details

.want?(v, parent) ⇒ Boolean

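Returns:

(Boolean) — truthy when the parent's :ngram_len option is set (the raw option value is returned, not a strict true/false)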

7
8
9
# File 'lib/vector_embed/maker/ngram.rb', line 7

# Tells whether this maker should be used: it is wanted whenever the parent
# has an :ngram_len option.
#
# @param v the candidate value; not inspected here
# @param parent object responding to #options
# @return the raw :ngram_len option value (truthy when set, nil/false
#   otherwise), not a strict Boolean
def want?(v, parent)
  settings = parent.options
  settings[:ngram_len]
end

Instance Method Details

#pairs(v) ⇒ Object



25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/vector_embed/maker/ngram.rb', line 25

# Turns a single string into a list of [index, 1] pairs, one per n-gram.
#
# The string is first run through parent.preprocess, then tokenized:
# * len == 1      - split on delim (delim is handed straight to String#split)
# * delim == ''   - every window of len consecutive characters
# * anything else - not implemented; raises
#
# Each n-gram is mapped through parent.index with the key
# [k, IM_AN_NGRAM, ngram]; repeated n-grams produce repeated pairs.
#
# @param v [String] the text to tokenize
# @return [Array<Array>] one [parent.index(...), 1] pair per n-gram
# @raise [RuntimeError] if v is not a String, or if len > 1 with a
#   non-empty delimiter
def pairs(v)
  unless v.is_a?(::String)
    raise "Ngram can't handle #{v.inspect}, only a single string for now"
  end

  text = parent.preprocess(v.to_s)

  grams =
    if len == 1
      # word mode: one token per delimited piece
      text.split(delim)
    elsif delim == ''
      # sliding-window mode: String#length and String#[] count characters,
      # so windows are per character. A string shorter than len yields no
      # windows because the upto range is empty.
      0.upto(text.length - len).map { |start| text[start, len] }
    else
      raise "Word n-gram not supported yet"
    end

  grams.map do |gram|
    feature_index = parent.index([k, IM_AN_NGRAM, gram])
    [feature_index, 1]
  end
end