Class: Maxixe::Segmenter

Inherits:
Object
  • Object
show all
Defined in:
lib/maxixe.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(index, t = 0.5) ⇒ Segmenter

Returns a new instance of Segmenter.



7
8
9
10
11
# File 'lib/maxixe.rb', line 7

# Builds a segmenter around a pre-computed n-gram index.
#
# @param index [Hash] maps an n-gram length (as a key that responds to
#   +to_i+) to a per-n-gram count table; see #token_count for how it is read
# @param t [Numeric] default vote threshold used by #split_with_votes
def initialize(index, t = 0.5)
  @index = index
  # The n-gram orders to vote with are exactly the lengths present in the index.
  @n = index.keys.map { |order_key| order_key.to_i }
  @t = t
end

Instance Attribute Details

#t ⇒ Object

Returns the value of attribute t.



5
6
7
# File 'lib/maxixe.rb', line 5

# Reader for @t, the value stored by #initialize (0.5 unless overridden) and
# used by #split_with_votes as the fallback threshold when none is passed.
def t
  @t
end

Instance Method Details

#all_n_grams(str) ⇒ Object



47
48
49
# File 'lib/maxixe.rb', line 47

# Lists every character n-gram of +str+ for each order held in @n.
#
# @param str [String] text to slice
# @return [Array<Array<Array<String>>>] one entry per order in @n; each entry
#   is the list of consecutive character windows of that size
def all_n_grams(str)
  @n.map { |order| str.chars.each_cons(order).to_a }
end

#average_votes(votes) ⇒ Object



92
93
94
95
96
# File 'lib/maxixe.rb', line 92

# Averages the votes position by position.
#
# @param votes [Array<Array<Numeric>>] one equally long row of votes per
#   n-gram order
# @return [Array<Float>] the mean vote for each position
def average_votes(votes)
  per_position = votes.transpose
  per_position.map do |ballots|
    total = ballots.reduce(:+)
    total.to_f / ballots.length
  end
end

#compute_vote(non_strad, strad, n) ⇒ Object



83
84
85
86
87
88
89
90
# File 'lib/maxixe.rb', line 83

# Computes the vote that n-grams of order +n+ cast for one candidate boundary.
#
# Counts the (non-straddling, straddling) pairs in which the non-straddling
# n-gram has a strictly higher #token_count than the straddling one, then
# divides that count by 2 * (n - 1).
#
# @param non_strad [Array<String>] n-grams that do not cross the boundary
# @param strad [Array<String>] n-grams that cross the boundary
# @param n [Integer] n-gram order
# @return [Float] the normalised vote; 0.0 when +n+ is 1 or less
def compute_vote(non_strad, strad, n)
  # For n <= 1 the normaliser 2 * (n - 1) is zero, so the division used to
  # yield NaN (or Infinity). A NaN vote turns every averaged vote into NaN and
  # silently disables segmentation, so such orders abstain instead.
  return 0.0 if n <= 1

  # Look every straddling count up once rather than once per pair.
  strad_counts = strad.map { |gram| token_count(gram) }

  wins = non_strad.inject(0) do |total, gram|
    own_count = token_count(gram)
    total + strad_counts.count { |other_count| own_count > other_count }
  end

  wins / (2.0 * (n - 1))
end

#compute_votes(positions_with_ngrams, n) ⇒ Object



77
78
79
80
81
# File 'lib/maxixe.rb', line 77

# Computes one vote per candidate boundary for a single n-gram order.
#
# @param positions_with_ngrams [Array<Array>] one
#   [non_straddling, straddling] pair per position
# @param n [Integer] n-gram order handed through to #compute_vote
# @return [Array] the #compute_vote result for each position, in order
def compute_votes(positions_with_ngrams, n)
  positions_with_ngrams.each_with_object([]) do |(outside, across), ballots|
    ballots << compute_vote(outside, across, n)
  end
end

#non_straddling(n_grams, pos) ⇒ Object



61
62
63
64
65
66
67
# File 'lib/maxixe.rb', line 61

# Picks the n-grams that sit on either side of the boundary after character
# index +pos+ without crossing it: the one starting at pos + 1 and the one
# starting at pos - (size - 1), which ends exactly at +pos+.
#
# @param n_grams [Array<Array<String>>] character windows, indexed by start
# @param pos [Integer] index of the character just before the boundary
# @return [Array<String>] the selected n-grams joined into strings
def non_straddling(n_grams, pos)
  neighbours = n_grams.each_with_index.select do |gram, start|
    left_start = pos - (gram.size - 1)
    start == left_start || start == pos + 1
  end
  neighbours.map { |gram, _start| gram.join }
end

#segment(str, t = nil) ⇒ Object



13
14
15
16
17
18
19
20
21
22
23
# File 'lib/maxixe.rb', line 13

# Segments +str+ by inserting spaces at the boundaries chosen by voting.
#
# Builds the n-grams of every order, computes a vote per boundary and order,
# averages the votes across orders and lets #split_with_votes place the spaces.
#
# @param str [String] text to segment
# @param t [Numeric, nil] threshold passed on to #split_with_votes unchanged
# @return [String] the result of #split_with_votes
def segment(str, t = nil)
  # An order longer than +str+ produces no n-grams. Reading the order from
  # such an empty list (`first.size`) raised NoMethodError, so those orders
  # are left out of the vote.
  usable_n_grams = all_n_grams(str).reject(&:empty?)

  votes_for_all = usable_n_grams.map do |grams|
    compute_votes(straddling_and_non_straddling(grams, str), grams.first.size)
  end

  split_with_votes(average_votes(votes_for_all), str, t)
end

#split_with_votes(votes, str, t = nil) ⇒ Object



25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# File 'lib/maxixe.rb', line 25

# Inserts a space into a copy of +str+ after every position whose vote wins.
#
# A position wins when its vote is strictly above the threshold, or when it
# is an interior position whose vote is strictly greater than both neighbours.
#
# @param votes [Array<Numeric>] one vote per position; position i is the gap
#   after character i
# @param str [String] text to split; it is not modified
# @param t [Numeric, nil] threshold; falls back to @t when nil
# @return [String] copy of +str+ with " " inserted at each winning position
def split_with_votes(votes, str, t = nil)
  last_index = votes.size - 1

  boundaries = votes.each_index.select do |idx|
    current = votes[idx]
    over_cutoff = current > (t || @t)
    interior = idx > 0 && idx < last_index
    local_peak = interior && current > votes[idx - 1] && current > votes[idx + 1]
    over_cutoff || local_peak
  end

  # Every space already inserted shifts the remaining boundaries right by one.
  boundaries.each_with_index.inject(str.dup) do |spaced, (boundary, inserted_so_far)|
    spaced.insert(boundary + inserted_so_far + 1, " ")
  end
end

#straddling(n_grams, pos) ⇒ Object



69
70
71
72
73
74
75
# File 'lib/maxixe.rb', line 69

# Picks the n-grams that cross the boundary after character index +pos+:
# those whose start index is at most +pos+ and greater than pos - (size - 1).
#
# @param n_grams [Array<Array<String>>] character windows, indexed by start
# @param pos [Integer] index of the character just before the boundary
# @return [Array<String>] the selected n-grams joined into strings
def straddling(n_grams, pos)
  n_grams.each_with_index.each_with_object([]) do |(gram, start), crossing|
    earliest_excluded = pos - (gram.size - 1)
    crossing << gram.join if start > earliest_excluded && start <= pos
  end
end

#straddling_and_non_straddling(n_grams, str) ⇒ Object



55
56
57
58
59
# File 'lib/maxixe.rb', line 55

# Pairs up, for every boundary position in +str+, the n-grams beside the
# boundary with the n-grams crossing it.
#
# @param n_grams [Array<Array<String>>] n-grams of a single order
# @param str [String] the text the n-grams came from; positions run from 0
#   to str.length - 2
# @return [Array<Array>] one [non_straddling, straddling] pair per position
def straddling_and_non_straddling(n_grams, str)
  last_boundary = str.length - 2
  0.upto(last_boundary).map do |boundary|
    beside = non_straddling(n_grams, boundary)
    across = straddling(n_grams, boundary)
    [beside, across]
  end
end

#token_count(n_gram) ⇒ Object



51
52
53
# File 'lib/maxixe.rb', line 51

# Looks up how often +n_gram+ occurs according to the index.
#
# The index is keyed first by the n-gram length rendered as a string, then by
# the n-gram itself.
#
# @param n_gram [String] the n-gram to look up
# @return [Object] the stored count, or 0 when the n-gram has no entry
def token_count(n_gram)
  length_key = n_gram.length.to_s
  stored = @index[length_key][n_gram]
  stored ? stored : 0
end