Top Level Namespace

Defined Under Namespace

Modules: BLEU, DAG, HG, TFIDF

Classes: Array, BooleanSemiring, CountingSemiring, InsideSemiring, Integer, PriorityQueue, ReadFile, RealSemiring, RealxSemiring, Semiring, SparseVector, String, Translation, ViterbiLogSemiring, ViterbiSemiring, WriteFile

Instance Method Summary collapse

Instance Method Details

#bag_of_words(s, stopwords = []) ⇒ Object



15
16
17
# File 'lib/zipf/stringutil.rb', line 15

# Returns the distinct whitespace-separated words of +s+ in sorted order,
# leaving out every word for which +stopwords.include?+ is true.
#
# s         - String to split into words.
# stopwords - collection responding to +include?+ (default: empty Array).
def bag_of_words s, stopwords=[]
  words = s.strip.split
  words.uniq!
  words.sort!
  words.delete_if { |word| stopwords.include? word }
end

#cdec_kbest(cdec_bin, input, ini, weights, k, unique = true) ⇒ Object



89
90
91
92
93
94
95
96
97
# File 'lib/zipf/misc.rb', line 89

# Runs the cdec binary on a single input and collects its output lines as
# Translation objects.
#
# cdec_bin - command used to start cdec (interpolated into a shell command).
# input    - text sent to the command on standard input, followed by a newline.
# ini      - value passed after -c.
# weights  - value passed after -w.
# k        - value passed after -k.
# unique   - when true, -r is appended to the command line.
#
# Returns an Array with one Translation per output line; each is filled via
# Translation#from_s(line, false, index) where index is the 0-based line number.
def cdec_kbest cdec_bin, input, ini, weights, k, unique=true
  require 'open3'
  cmd = "#{cdec_bin} -c #{ini} -w #{weights} -k #{k}"
  cmd += " -r" if unique
  # The input goes through stdin instead of `echo "..."` so that quotes, `$`
  # and backticks in the text reach the command verbatim and are never
  # interpreted by the shell. stderr of the command is discarded.
  o, _ = Open3.capture2("#{cmd}  2>/dev/null", :stdin_data => "#{input}\n")
  o.split("\n").each_with_index.map { |line, rank|
    t = Translation.new
    t.from_s(line, false, rank)
    t
  }
end

#ngrams(s, n, fix = false) ⇒ Object



5
6
7
8
9
10
11
12
13
# File 'lib/zipf/stringutil.rb', line 5

# Yields the n-grams of +s+ as Arrays of tokens, position by position.
#
# s   - String, split into tokens with +tokenize+.
# n   - maximum n-gram length.
# fix - when false, every n-gram of length 1..n starting at each position is
#       yielded; when true, only n-grams of exactly length n are yielded.
#
# Returns the Array of tokens when a block is given, or an Enumerator over the
# n-grams when called without a block.
def ngrams(s, n, fix=false)
  return enum_for(:ngrams, s, n, fix) unless block_given?
  # Strip every token up front so that all tokens inside a yielded slice are
  # already stripped, not only the ones at or before the current position.
  tokens = tokenize(s).map(&:strip)
  tokens.each_index { |i|
    # Longest n-gram that still fits between position i and the end.
    longest = [n, tokens.size - i].min
    if fix
      yield tokens[i, n] if n >= 1 && longest == n
    else
      1.upto(longest) { |len| yield tokens[i, len] }
    end
  }
  tokens
end

#read_config(fn) ⇒ Object



99
100
101
102
103
104
105
106
107
108
109
110
111
112
# File 'lib/zipf/misc.rb', line 99

# Reads a simple "key = value" configuration file into a Hash.
#
# Blank lines and lines whose first non-blank character is '#' are skipped.
# Everything from the first '#' on a line onwards is dropped, and keys and
# values are stripped of surrounding whitespace. Lines that contain no '='
# are skipped instead of raising NoMethodError.
#
# fn - file name handed to ReadFile.new.
#
# Returns a Hash mapping String keys to String values.
def read_config fn
  cfg = {}
  f = ReadFile.new fn
  begin
    while line = f.gets
      line.strip!
      next if line.empty? || line[0] == '#'
      content = line.split('#', 2).first
      k, v = content.split(/\s*=\s*/, 2)
      next if v.nil? # no '=' on this line
      cfg[k.strip] = v.strip
    end
  ensure
    # Release the handle even when reading fails part-way through.
    f.close
  end
  cfg
end

#read_kbest_lists(fn, translation_type = Translation) ⇒ Object



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# File 'lib/zipf/Translation.rb', line 45

# Reads a file of k-best entries and groups consecutive lines that share the
# same leading id into separate lists.
#
# Each line is parsed with translation_type#from_s; the grouping id is the
# first field returned by splitpipe(line), converted with to_i. Within a list
# every entry's +id+ is set to its 0-based position in that list.
#
# fn               - file name handed to ReadFile.new.
# translation_type - class instantiated for every line (default: Translation).
#
# Returns an Array of Arrays of translation objects. An empty file yields an
# empty Array (no phantom empty list is appended).
def read_kbest_lists fn, translation_type=Translation
  kbest_lists = []
  cur = []
  prev = nil
  f = ReadFile.new fn
  begin
    while line = f.gets
      t = translation_type.new
      t.from_s line
      c = splitpipe(line)[0].to_i
      if c != prev
        # A new id starts a new list; flush the one collected so far.
        kbest_lists << cur unless cur.empty?
        cur = []
        prev = c
      end
      t.id = cur.size
      cur << t
    end
  ensure
    # Release the handle even when parsing a line raises.
    f.close
  end
  kbest_lists << cur unless cur.empty? # last one
  kbest_lists
end

#read_phrase_table(fn) ⇒ Object



73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# File 'lib/zipf/misc.rb', line 73

# Loads a phrase table file into a Hash keyed by the first field of each line.
#
# Every line is split with splitpipe into three fields; the third is turned
# into a feature map with SparseVector.from_kv. Each key maps to an Array of
# [second_field, feature_map] pairs, in file order.
#
# fn - file name handed to ReadFile.new.
def read_phrase_table fn
  rules_by_source = {}
  reader = ReadFile.new fn
  while (entry = reader.gets)
    source, target, raw_features = splitpipe(entry)
    vector = SparseVector.from_kv raw_features
    (rules_by_source[source] ||= []) << [target, vector]
  end
  reader.close
  rules_by_source
end

#spawn_with_timeout(cmd, t = 4, ignore_fail = false, debug = false) ⇒ Object



60
61
62
63
64
65
66
67
68
69
70
71
# File 'lib/zipf/misc.rb', line 60

# Runs +cmd+ as a child process, waits at most +t+ seconds for it to exit and
# returns what it wrote to standard output.
#
# cmd         - command line handed to Process.spawn.
# t           - timeout in seconds (default: 4).
# ignore_fail - when true, the child is NOT sent TERM after a timeout.
# debug       - when true, the command line is echoed to STDERR first.
#
# Returns the child's standard output as a String.
def spawn_with_timeout cmd, t=4, ignore_fail=false, debug=false
  STDERR.write cmd+"\n" if debug
  pipe_in, pipe_out = IO.pipe
  pid = Process.spawn(cmd, :out => pipe_out)
  # Drop the parent's copy of the write end right away so that reading sees
  # EOF as soon as the child side is closed.
  pipe_out.close
  # Drain the pipe while waiting: a child that writes more than the pipe
  # buffer holds would otherwise block until the timeout and get killed.
  reader = Thread.new { pipe_in.read }
  begin
    Timeout.timeout(t) { Process.wait pid }
  rescue Timeout::Error
    unless ignore_fail
      begin
        Process.kill('TERM', pid)
      rescue Errno::ESRCH
        # child already gone
      end
    end
    # Reap the child in the background so it does not linger as a zombie.
    Process.detach pid
  end
  reader.value
ensure
  pipe_out.close if pipe_out && !pipe_out.closed?
  pipe_in.close if pipe_in && !pipe_in.closed?
end

#splitpipe(s, n = 3) ⇒ Object



19
20
21
# File 'lib/zipf/stringutil.rb', line 19

# Splits +s+ (after stripping surrounding whitespace) on a delimiter made of
# +n+ consecutive pipe characters. Whitespace inside the fields is kept.
#
# s - String to split.
# n - number of '|' characters forming the delimiter (default: 3).
def splitpipe s, n=3
  delimiter = "|" * n
  trimmed = s.strip
  trimmed.split(delimiter)
end

#tokenize(s) ⇒ Object



1
2
3
# File 'lib/zipf/stringutil.rb', line 1

# Breaks +s+ into whitespace-separated tokens after stripping surrounding
# whitespace.
#
# s - String to tokenize.
#
# Returns an Array of Strings.
def tokenize s
  trimmed = s.strip
  trimmed.split
end