Class: Word2Vec::WordVectors

Inherits:
Object
  • Object
show all
Defined in:
lib/word2vec/utils.rb,
lib/word2vec/word_vectors.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(vocab:, vectors:, clusters: nil) ⇒ WordVectors

Returns a new instance of WordVectors.



9
10
11
12
13
14
15
16
17
18
# File 'lib/word2vec/word_vectors.rb', line 9

# Stores the vocabulary, its vectors and optional cluster data, and builds a
# word => position lookup table.
#
# @param vocab [#each_with_index] vocabulary entries, in vector order
# @param vectors [Object] vector data, stored as given
# @param clusters [Object, nil] optional cluster data, stored as given
def initialize(vocab:, vectors:, clusters: nil)
  @vocab, @vectors, @clusters = vocab, vectors, clusters

  # When an entry occurs more than once, its last position wins.
  @vocab_hash = vocab.each_with_index.to_h
end

Instance Attribute Details

#clusters ⇒ Object

Returns the value of attribute clusters.



7
8
9
# File 'lib/word2vec/word_vectors.rb', line 7

# Reader for @clusters, which the constructor sets from its +clusters:+
# keyword (nil when the keyword is omitted).
def clusters
  @clusters
end

#vectors ⇒ Object

Returns the value of attribute vectors.



7
8
9
# File 'lib/word2vec/word_vectors.rb', line 7

# Reader for @vectors, which the constructor sets from its +vectors:+ keyword.
def vectors
  @vectors
end

#vocab ⇒ Object

Returns the value of attribute vocab.



7
8
9
# File 'lib/word2vec/word_vectors.rb', line 7

# Reader for @vocab, which the constructor sets from its +vocab:+ keyword.
def vocab
  @vocab
end

#vocab_hash ⇒ Object

Returns the value of attribute vocab_hash.



7
8
9
# File 'lib/word2vec/word_vectors.rb', line 7

# Reader for @vocab_hash, the Hash built by the constructor that maps each
# vocabulary entry to its position in the vocabulary.
def vocab_hash
  @vocab_hash
end

Class Method Details

.from_binary(fname, vocab_unicode_size: 78, desired_vocab: nil, encoding: "utf-8") ⇒ Object



89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# File 'lib/word2vec/word_vectors.rb', line 89

# Loads word vectors from a file in the word2vec binary layout.
#
# The file starts with a text header line "<vocab_size> <vector_size>". Each
# of the vocab_size entries that follow consists of the word's bytes, one
# space, vector_size 4-byte little-endian floats and one trailing byte (the
# newline). Every kept vector is passed through unitvec.
#
# @param fname [String] path of the file to read
# @param vocab_unicode_size [Integer] unused; accepted so existing callers keep working
# @param desired_vocab [#include?, nil] when given, only words it includes are kept
# @param encoding [String] encoding applied to the raw word bytes
# @return [Object] a new instance built from the kept words and their vectors
# @raise [EOFError] if the file ends before all announced entries were read
def self.from_binary(fname, vocab_unicode_size: 78, desired_vocab: nil, encoding: "utf-8")
  vocab = []
  vectors = []

  File.open(fname, 'rb') do |fin|
    vocab_size, vector_size = fin.readline.split.map(&:to_i)
    binary_len = 4 * vector_size # every component is a 4-byte float

    vocab_size.times do |i|
      raw_word = fin.gets(' ')
      if raw_word.nil? || !raw_word.end_with?(' ')
        raise EOFError, "#{fname}: file ended while reading word #{i + 1} of #{vocab_size}"
      end
      # Tag the bytes with the target encoding *before* filtering, otherwise
      # non-ASCII words never compare equal to the entries of desired_vocab.
      word = raw_word.chomp(' ').force_encoding(encoding)

      packed = fin.read(binary_len)
      if packed.nil? || packed.bytesize < binary_len
        raise EOFError, "#{fname}: file ended while reading the vector of word #{i + 1}"
      end
      fin.read(1) # skip the newline that terminates each entry

      next unless desired_vocab.nil? || desired_vocab.include?(word)

      vocab << word
      # 'e' = single-precision float, little-endian, independent of the host CPU.
      vectors << unitvec(packed.unpack('e*'))
    end
  end

  new(vocab: vocab, vectors: vectors)
end

.from_mmap(fname) ⇒ Object

Raises:

  • (NotImplementedError)


178
179
180
# File 'lib/word2vec/word_vectors.rb', line 178

# Not implemented: always raises, +fname+ is ignored.
#
# @param fname [Object] unused
# @raise [NotImplementedError] on every call
def self.from_mmap(fname)
  raise NotImplementedError
end

.from_text(fname, vocab_unicode_size: 78, desired_vocab: nil, encoding: "utf-8") ⇒ Object



139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
# File 'lib/word2vec/word_vectors.rb', line 139

# Loads word vectors from a text file.
#
# The first line is a header ("<vocab_size> <vector_size>"); it is read and
# skipped, the result is sized by the data lines actually present. Every
# following non-blank line holds a word followed by its space-separated
# vector components. Every kept vector is passed through unitvec.
#
# @param fname [String] path of the file to read
# @param vocab_unicode_size [Integer] unused; accepted so existing callers keep working
# @param desired_vocab [#include?, nil] when given, only words it includes are kept
# @param encoding [String] encoding applied to each line
# @return [Object] a new instance built from the kept words and their vectors
# @raise [EOFError] if the file is empty (no header line)
def self.from_text(fname, vocab_unicode_size: 78, desired_vocab: nil, encoding: "utf-8")
  vocab = []
  vectors = []

  File.open(fname, 'rb') do |fin|
    fin.readline # header line; not needed because the arrays grow on demand

    fin.each_line do |line|
      word, *components = line.force_encoding(encoding).strip.split(" ")
      next if word.nil? # blank line, e.g. a trailing newline at end of file
      next unless desired_vocab.nil? || desired_vocab.include?(word)

      vocab << word
      vectors << unitvec(components.map(&:to_f))
    end
  end

  new(vocab: vocab, vectors: vectors)
end

.unitvec(vec) ⇒ Object



5
6
7
# File 'lib/word2vec/utils.rb', line 5

# Scales +vec+ by the reciprocal of its norm2 and returns the result as an Array.
#
# A vector whose norm is zero has no direction; it is returned unscaled
# instead of being multiplied by 1.0 / 0.0, which would fill it with NaNs.
#
# @param vec [Array<Numeric>] vector components
# @return [Array] the scaled components
def self.unitvec(vec)
  matrix = NMatrix[*vec]
  norm = matrix.norm2
  return matrix.to_a if norm.zero?

  (matrix * (1.0 / norm)).to_a
end

Instance Method Details

#[](word) ⇒ Object



28
29
30
# File 'lib/word2vec/word_vectors.rb', line 28

# Index-style shorthand that forwards to get_vector.
#
# @param word [Object] vocabulary entry to look up
# @return [Object] whatever get_vector returns for +word+
def [](word)
  get_vector(word)
end

#analogy(pos:, neg:, n: 10) ⇒ Object



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# File 'lib/word2vec/word_vectors.rb', line 48

# Ranks vocabulary entries for an analogy query.
#
# The vectors of the +pos+ words (weight 1.0) and of the +neg+ words
# (weight -1.0) are averaged, every vocabulary vector is scored by its dot
# product with that mean, and the best-scoring entries are returned with the
# query words themselves removed.
#
# @param pos [Array] words that pull the result towards them
# @param neg [Array] words that push the result away from them
# @param n [Integer] maximum number of results
# @return [Array(Array<Integer>, Array)] vocabulary indices of the best
#   matches, best first, and their metric values in the same order
def analogy(pos:, neg:, n: 10)
  exclude = pos + neg
  weighted = pos.map { |word| [word, 1.0] } + neg.map { |word| [word, -1.0] }

  directed = weighted.map do |word, direction|
    (NMatrix[*self[word], dtype: :float32] * direction).to_a
  end
  mean = NMatrix[*directed, dtype: :float32].mean

  metrics = NMatrix[*vectors, dtype: :float32].dot(mean.transpose)
  # Fetch exclude.size extra candidates so that n are left after filtering.
  ranked = metrics.sorted_indices.reverse[0...(n + exclude.size)]

  # Filter by vocabulary index rather than deleting by position: deleting
  # positions one after another shifts the remaining entries, which removed
  # the wrong candidates and could leave a query word in the result.
  excluded_ids = exclude.map { |word| ix(word) }
  best = ranked.reject { |index| excluded_ids.include?(index) }[0...n]

  [best, metrics.to_a.flatten.values_at(*best)]
end

#cosine(word, n: 10) ⇒ Object



41
42
43
44
45
46
# File 'lib/word2vec/word_vectors.rb', line 41

# Scores every vocabulary vector by its dot product with the vector of
# +word+ and returns the best-scoring entries, skipping the top-ranked one.
#
# @param word [Object] vocabulary entry to compare against
# @param n [Integer] number of entries to return
# @return [Array(Array<Integer>, Array)] vocabulary indices, best first, and
#   their metric values in the same order
def cosine(word, n: 10)
  all_vectors = NMatrix[*vectors, dtype: :float32]
  query_column = NMatrix[self[word], dtype: :float32].transpose
  metrics = all_vectors.dot(query_column)

  # Position 0 of the descending ranking is skipped (presumably the query
  # word itself), so the slice starts at 1.
  best = metrics.sorted_indices.reverse[1..n]
  [best, metrics.to_a.values_at(*best).flatten]
end

#generate_response(indices, metrics, clusters: true) ⇒ Object



76
77
78
79
80
81
82
83
# File 'lib/word2vec/word_vectors.rb', line 76

# Pairs the vocabulary entries at +indices+ with their +metrics+ and, when
# cluster data is present and requested, with their cluster values.
#
# @param indices [Array<Integer>] positions into the vocabulary
# @param metrics [Array] one metric per index, in the same order
# @param clusters [Boolean] whether to append cluster values when available
# @return [Array<Array>] [word, metric] or [word, metric, cluster] tuples
def generate_response(indices, metrics, clusters: true)
  words = vocab.values_at(*indices)

  # self.clusters is the attribute; the bare +clusters+ is the keyword flag.
  cluster_data = self.clusters
  return words.zip(metrics) unless cluster_data && clusters

  words.zip(metrics, cluster_data.clusters.values_at(*indices))
end

#get_vector(word) ⇒ Object



36
37
38
39
# File 'lib/word2vec/word_vectors.rb', line 36

# Looks up the position of +word+ via ix and returns the entry of vectors
# stored at that position.
#
# @param word [Object] vocabulary entry to look up
# @return [Object] the element of vectors at the word's position
def get_vector(word)
  vectors[ix(word)]
end

#include?(word) ⇒ Boolean

Returns:

  • (Boolean)

Raises:

  • (NotImplementedError)


32
33
34
# File 'lib/word2vec/word_vectors.rb', line 32

# Tells whether +word+ is part of the vocabulary.
#
# @param word [Object] entry to look for
# @return [Boolean] true when +word+ is a key of vocab_hash, false otherwise
def include?(word)
  vocab_hash.key?(word)
end

#ix(word) ⇒ Object



20
21
22
# File 'lib/word2vec/word_vectors.rb', line 20

# Returns the position recorded for +word+ in vocab_hash.
#
# @param word [Object] vocabulary entry to look up
# @return [Object] the value stored under +word+ in vocab_hash
def ix(word)
  vocab_hash[word]
end

#to_mmap(fname) ⇒ Object

Raises:

  • (NotImplementedError)


85
86
87
# File 'lib/word2vec/word_vectors.rb', line 85

# Not implemented: always raises, +fname+ is ignored.
#
# @param fname [Object] unused
# @raise [NotImplementedError] on every call
def to_mmap(fname)
  raise NotImplementedError
end

#word(ix) ⇒ Object



24
25
26
# File 'lib/word2vec/word_vectors.rb', line 24

# Returns the vocabulary entry stored at position +ix+.
#
# @param ix [Integer] position in vocab
# @return [Object] the element of vocab at that position
def word(ix)
  vocab[ix]
end