Module: Word2Vec

Defined in:
lib/word2vec/io.rb,
lib/word2vec/utils.rb,
lib/word2vec/version.rb,
lib/word2vec/word_vectors.rb,
lib/word2vec/word_clusters.rb,
lib/word2vec/scripts_interface.rb

Defined Under Namespace

Classes: WordClusters, WordVectors

Constant Summary collapse

VERSION =
"0.1.1"

Class Method Summary collapse

Class Method Details

.doc2vec(train, output, size: 100, window: 5, sample: '1e-3', hs: 0, negative: 5, threads: 12, iter_: 5, min_count: 5, alpha: 0.025, debug: 2, binary: 1, cbow: 1, save_vocab: nil, read_vocab: nil, verbose: nil) ⇒ Object

Raises:

  • (NotImplementedError)


75
76
77
78
79
80
# File 'lib/word2vec/scripts_interface.rb', line 75

# Placeholder for a doc2vec training wrapper; it is not implemented yet.
#
# The parameter list mirrors the other training wrappers, but none of the
# arguments are read: every call raises immediately.
#
# @raise [NotImplementedError] always
def self.doc2vec(
  train, output,
  size: 100, window: 5, sample: '1e-3', hs: 0, negative: 5, threads: 12,
  iter_: 5, min_count: 5, alpha: 0.025, debug: 2, binary: 1, cbow: 1,
  save_vocab: nil, read_vocab: nil, verbose: nil
)
  raise NotImplementedError
end

.load(fname, *args, kind: 'auto', **kwargs) ⇒ Object



2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# File 'lib/word2vec/io.rb', line 2

# Loads word vectors from +fname+ by delegating to a Word2Vec::WordVectors
# loader.
#
# With kind 'auto' (the default) the format is chosen from the file name:
# a '.bin' suffix selects 'bin' and a '.txt' suffix selects 'txt'. The
# 'mmap' format is never auto-detected and has to be requested explicitly.
#
# @param fname [String] file name; must respond to +end_with?+ when kind is 'auto'
# @param args extra positional arguments forwarded to the loader
# @param kind [String] one of 'auto', 'bin', 'txt' or 'mmap'
# @param kwargs extra keyword arguments forwarded to the loader
# @return [Object] whatever the selected loader returns
# @raise [RuntimeError] 'Could not identify kind' when auto-detection fails,
#   'Unknown kind' when +kind+ is not a supported value
def self.load(fname, *args, kind: 'auto', **kwargs)
  format =
    if kind != 'auto'
      kind
    elsif fname.end_with?('.bin')
      'bin'
    elsif fname.end_with?('.txt')
      'txt'
    else
      raise 'Could not identify kind'
    end

  case format
  when 'bin'  then Word2Vec::WordVectors.from_binary(fname, *args, **kwargs)
  when 'txt'  then Word2Vec::WordVectors.from_text(fname, *args, **kwargs)
  when 'mmap' then Word2Vec::WordVectors.from_mmap(fname, *args, **kwargs)
  else raise 'Unknown kind'
  end
end

.load_clusters(fname) ⇒ Object



24
25
26
# File 'lib/word2vec/io.rb', line 24

# Loads word clusters from a text file by delegating to
# Word2Vec::WordClusters.from_text.
#
# @param fname file name handed unchanged to the loader
# @return [Object] whatever Word2Vec::WordClusters.from_text returns
def self.load_clusters(fname)
  clusters_loader = Word2Vec::WordClusters
  clusters_loader.from_text(fname)
end

.run_cmd(command, verbose: false) ⇒ Object



82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# File 'lib/word2vec/scripts_interface.rb', line 82

# Echoes and runs an external command.
#
# The command is handed to Kernel#system as an argument vector instead of
# being joined into one shell string. Joining made the shell re-split every
# argument, so a path containing a space (or a shell metacharacter) was
# mangled before the program saw it. A single-element command is still
# passed as one string, so Kernel#system treats it exactly as before.
#
# @param command [Array] program followed by its arguments; nested arrays
#   are flattened and every element is converted with +to_s+
# @param verbose [Boolean] accepted for interface compatibility; currently
#   unused (see the TODO below)
# @return [true, false, nil] the result of Kernel#system: true on a zero
#   exit status, false on a non-zero one, nil when the command could not be
#   executed (or when +command+ is empty)
def self.run_cmd(command, verbose: false)
  argv = command.flatten.map(&:to_s)

  # Echo the command line, as the previous implementation did.
  p argv.join(' ')

  # TODO: implement it later
  # if verbose
  #   while line = stdout.readline
  #     $stdout.write(line)
  #     if line.include?('ERROR:')
  #       raise Exception(line)
  #     end
  #     $stdout.flush
  #   end
  # end

  # Kernel#system requires at least one argument; nothing to run here.
  return nil if argv.empty?

  system(*argv)
end

.word2clusters(train, output, classes, size: 100, window: 5, sample: '1e-3', hs: 0, negative: 5, threads: 12, iter_: 5, min_count: 5, alpha: 0.025, debug: 2, binary: 1, cbow: 1, save_vocab: nil, read_vocab: nil, verbose: false) ⇒ Object



30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/word2vec/scripts_interface.rb', line 30

# Builds the command line for the bundled +word2vec+ executable with the
# '-classes' option and runs it through +run_cmd+.
#
# Every value is converted with +to_s+ and placed after its flag, in this
# order: -train, -output, -size, -window, -sample, -hs, -negative, -threads,
# -iter, -min-count, -alpha, -debug, -binary, -cbow, -classes. The
# '-save-vocab' and '-read-vocab' options are appended only when the
# corresponding argument is not nil.
#
# @param train value for '-train'
# @param output value for '-output'
# @param classes value for '-classes'
# @param iter_ value for '-iter'
# @param save_vocab value for '-save-vocab', omitted when nil
# @param read_vocab value for '-read-vocab', omitted when nil
# @param verbose forwarded to +run_cmd+
# @return [Object] whatever +run_cmd+ returns
def self.word2clusters(train, output, classes, size: 100, window: 5, sample: '1e-3',
                       hs: 0, negative: 5, threads: 12, iter_: 5, min_count: 5,
                       alpha: 0.025, debug: 2, binary: 1, cbow: 1,
                       save_vocab: nil, read_vocab: nil, verbose: false)
  # The executable sits in ext/word2vec, resolved relative to this file.
  executable = File.join(File.expand_path('../../../ext/word2vec', __FILE__), 'word2vec')

  flags = %w[-train -output -size -window -sample -hs -negative -threads
             -iter -min-count -alpha -debug -binary -cbow -classes]
  settings = [train, output, size, window, sample, hs, negative, threads,
              iter_, min_count, alpha, debug, binary, cbow, classes]

  command = [executable] + flags.zip(settings).flat_map { |flag, setting| [flag, setting.to_s] }
  command.push('-save-vocab', save_vocab.to_s) unless save_vocab.nil?
  command.push('-read-vocab', read_vocab.to_s) unless read_vocab.nil?

  run_cmd(command, verbose: verbose)
end

.word2phrase(train, output, min_count: 5, threshold: 100, debug: 2, verbose: false) ⇒ Object



60
61
62
63
64
65
66
67
68
69
70
71
72
73
# File 'lib/word2vec/scripts_interface.rb', line 60

# Builds the command line for the bundled +word2phrase+ executable and runs
# it through +run_cmd+.
#
# Each value is converted with +to_s+ and placed after its flag, in this
# order: -train, -output, -min-count, -threshold, -debug.
#
# @param train value for '-train'
# @param output value for '-output'
# @param min_count value for '-min-count'
# @param threshold value for '-threshold'
# @param debug value for '-debug'
# @param verbose forwarded to +run_cmd+
# @return [Object] whatever +run_cmd+ returns
def self.word2phrase(train, output, min_count: 5, threshold: 100, debug: 2,
                     verbose: false)
  # The executable sits in ext/word2vec, resolved relative to this file.
  executable = File.join(File.expand_path('../../../ext/word2vec', __FILE__), 'word2phrase')

  # Hash insertion order fixes the order of the flags on the command line.
  options = {
    '-train' => train,
    '-output' => output,
    '-min-count' => min_count,
    '-threshold' => threshold,
    '-debug' => debug
  }
  command = [executable]
  options.each_pair { |flag, setting| command.push(flag, setting.to_s) }

  run_cmd(command, verbose: verbose)
end

.word2vec(train, output, size: 100, window: 5, sample: '1e-3', hs: 0, negative: 5, threads: 12, iter_: 5, min_count: 5, alpha: 0.025, debug: 2, binary: 1, cbow: 1, save_vocab: nil, read_vocab: nil, verbose: false) ⇒ Object



2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# File 'lib/word2vec/scripts_interface.rb', line 2

# Builds the command line for the bundled +word2vec+ executable and runs it
# through +run_cmd+.
#
# Every value is converted with +to_s+ and placed after its flag, in this
# order: -train, -output, -size, -window, -sample, -hs, -negative, -threads,
# -iter, -min-count, -alpha, -debug, -binary, -cbow. The '-save-vocab' and
# '-read-vocab' options are appended only when the corresponding argument is
# not nil.
#
# @param train value for '-train'
# @param output value for '-output'
# @param iter_ value for '-iter'
# @param save_vocab value for '-save-vocab', omitted when nil
# @param read_vocab value for '-read-vocab', omitted when nil
# @param verbose forwarded to +run_cmd+
# @return [Object] whatever +run_cmd+ returns
def self.word2vec(train, output, size: 100, window: 5, sample: '1e-3', hs: 0,
                  negative: 5, threads: 12, iter_: 5, min_count: 5, alpha: 0.025,
                  debug: 2, binary: 1, cbow: 1, save_vocab: nil, read_vocab: nil,
                  verbose: false)
  # The executable sits in ext/word2vec, resolved relative to this file.
  executable = File.join(File.expand_path('../../../ext/word2vec', __FILE__), 'word2vec')

  # Hash insertion order fixes the order of the flags on the command line.
  required = {
    '-train' => train, '-output' => output, '-size' => size,
    '-window' => window, '-sample' => sample, '-hs' => hs,
    '-negative' => negative, '-threads' => threads, '-iter' => iter_,
    '-min-count' => min_count, '-alpha' => alpha, '-debug' => debug,
    '-binary' => binary, '-cbow' => cbow
  }
  command = required.each_with_object([executable]) do |(flag, setting), argv|
    argv << flag << setting.to_s
  end

  optional = { '-save-vocab' => save_vocab, '-read-vocab' => read_vocab }
  optional.each do |flag, setting|
    next if setting.nil?

    command << flag << setting.to_s
  end

  run_cmd(command, verbose: verbose)
end