Module: TextAlignment

Defined in:
lib/text_alignment/constants.rb,
lib/text_alignment/lcs_cdiff.rb,
lib/text_alignment/char_mapping.rb,
lib/text_alignment/anchor_finder.rb,
lib/text_alignment/glcs_required.rb,
lib/text_alignment/find_divisions.rb,
lib/text_alignment/glcs_alignment.rb,
lib/text_alignment/lcs_comparison.rb,
lib/text_alignment/text_alignment.rb,
lib/text_alignment/approximate_fit.rb,
lib/text_alignment/cultivation_map.rb,
lib/text_alignment/mixed_alignment.rb,
lib/text_alignment/glcs_alignment_fast.rb

Defined Under Namespace

Classes: AnchorFinder, CharMapping, CultivationMap, GLCSAlignment, GLCSTextAlignment, LCSAlignment, LCSComparison, LCSMin, MixedAlignment, TextAlignment

Constant Summary collapse

SIZE_NGRAM =
8
SIZE_WINDOW =
10
BUFFER_RATE =
0.1
BUFFER_MIN =
20
TEXT_SIMILARITY_THRESHOLD =
0.9
NIL_CHARACTER =
'_'
CHAR_MAPPING =
[
  ["©", "(c)"],      #U+00A9 (Copyright Sign)

  ["α", "alpha"],    #U+03B1 (greek small letter alpha)
  ["β", "beta"],   #U+03B2 (greek small letter beta)
  ["γ", "gamma"],    #U+03B3 (greek small letter gamma)
  ["δ", "delta"],    #U+03B4 (greek small letter delta)
  ["ε", "epsilon"],  #U+03B5 (greek small letter epsilon)
  ["ζ", "zeta"],   #U+03B6 (greek small letter zeta)
  ["η", "eta"],      #U+03B7 (greek small letter eta)
  ["θ", "theta"],    #U+03B7 (greek small letter eta)
  ["ι", "iota"],   #U+03B7 (greek small letter eta)
  ["κ", "kappa"],    #U+03BA (greek small letter kappa)
  ["λ", "lambda"], #U+03BB (greek small letter lambda)
  ["λ", "lamda"],    #U+03BB (greek small letter lambda)
  ["μ", "mu"],     #U+03BC (greek small letter mu)
  ["ν", "nu"],     #U+03BD (greek small letter nu)
  ["ξ", "xi"],     #U+03BE (greek small letter xi)
  ["ο", "omicron"],  #U+03BF (greek small letter omicron)
  ["π", "pi"],     #U+03C0 (greek small letter pi)
  ["ρ", "rho"],      #U+03C1 (greek small letter rho)
  ["σ", "sigma"],    #U+03C3 (greek small letter sigma)
  ["τ", "tau"],      #U+03C4 (greek small letter tau)
  ["υ", "upsilon"],  #U+03C5 (greek small letter upsilon)
  ["φ", "phi"],      #U+03C6 (greek small letter phi)
  ["χ", "chi"],      #U+03C7 (greek small letter chi)
  ["ψ", "psi"],      #U+03C8 (greek small letter psi)
  ["ω", "omega"],    #U+03C9 (greek small letter omega)

  ["Α", "Alpha"],    #U+0391 (greek capital letter alpha)
  ["Β", "Beta"],   #U+0392 (greek capital letter beta)
  ["Γ", "Gamma"],    #U+0393 (greek capital letter gamma)
  ["Δ", "Delta"],    #U+0394 (greek capital letter delta)
  ["Ε", "Epsilon"],  #U+0395 (greek capital letter epsilon)
  ["Ζ", "Zeta"],   #U+0396 (greek capital letter zeta)
  ["Η", "Eta"],      #U+0397 (greek capital letter eta)
  ["Θ", "Theta"],    #U+0398 (greek capital letter theta)
  ["Ι", "Iota"],   #U+0399 (greek capital letter iota)
  ["Κ", "Kappa"],    #U+039A (greek capital letter kappa)
  ["Λ", "Lambda"], #U+039B (greek capital letter lambda)
  ["Λ", "Lamda"],    #U+039B (greek capital letter lambda)
  ["Μ", "Mu"],     #U+039C (greek capital letter mu)
  ["Ν", "Nu"],     #U+039D (greek capital letter nu)
  ["Ξ", "Xi"],     #U+039E (greek capital letter xi)
  ["Ο", "Omicron"],  #U+039F (greek capital letter omicron)
  ["Π", "Pi"],     #U+03A0 (greek capital letter pi)
  ["Ρ", "Rho"],      #U+03A1 (greek capital letter rho)
  ["Σ", "Sigma"],    #U+03A3 (greek capital letter sigma)
  ["Τ", "Tau"],      #U+03A4 (greek capital letter tau)
  ["Υ", "Upsilon"],  #U+03A5 (greek capital letter upsilon)
  ["Φ", "Phi"],      #U+03A6 (greek capital letter phi)
  ["Χ", "Chi"],      #U+03A7 (greek capital letter chi)
  ["Ψ", "Psi"],      #U+03A8 (greek capital letter Psi)
  ["Ω", "Omega"],    #U+03A9 (greek capital letter omega)

  ["ϕ", "phi"],      #U+03D5 (greek phi symbol)

  ["×", "x"],        #U+00D7 (multiplication sign)
  ["•", "*"],        #U+2022 (bullet)
  [" ", " "],        #U+2009 (thin space)
  [" ", " "],        #U+200A (hair space)
  [" ", " "],        #U+202F (narrow no-break space)
  [" ", " "],        #U+00A0 (Non-Breaking space)
  [" ", " "],        #U+3000 (ideographic space)
  ["‐", "-"],        #U+2010 (Hyphen)
  ["‑", "-"],        #U+2011 (Non-Breaking Hyphen)
  ["−", "-"],        #U+2212 (minus sign)
  ["–", "-"],        #U+2013 (en dash)
  ["′", "'"],        #U+2032 (prime)
  ["‘", "'"],        #U+2018 (left single quotation mark)
  ["’", "'"],        #U+2019 (right single quotation mark)
  ["“", '"'],        #U+201C (left double quotation mark)
  ["”", '"'],        #U+201D (right double quotation mark)
  ['"', "''"]
]
SIMILARITY_THRESHOLD =

to work on the hash representation of denotations to assume that there is no bag representation to this method

0.7
MIN_LENGTH_FOR_APPROXIMATION =

approximate the location of str1 in str2

50

Class Method Summary collapse

Class Method Details

._find_divisions(_source, _targets) ⇒ Object



35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# File 'lib/text_alignment/find_divisions.rb', line 35

def _find_divisions(_source, _targets)
  indice = []
  history = []
  cache = {}
  source = _source.dup
  targets = _targets.dup
  until source.strip.empty? || targets.empty?
    mode, cmp = nil, nil
    candidates = []
    targets.each_with_index do |target, i|
      if source.size < target[:text].size
        mode = :t_in_s
        str1 = source
        str2 = target[:text]
      else
        mode = :s_in_t
        str1 = target[:text]
        str2 = source
      end

      len1 = str1.length
      len2 = str2.length

      offset_begin, offset_end = if (len2 - len1) > len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD)
        approximate_fit(str1, str2)
      else
        # the whole source
        [0, -1]
      end

      unless offset_begin.nil?
        key = str1 + ' _:_ ' + str2[offset_begin .. offset_end]
        cmp = if cache.has_key? key
          cache[key]
        else
          cmp = TextAlignment::LCSComparison.new(str1, str2[offset_begin .. offset_end])
        end
        cache[key] = cmp

        if (cmp.similarity > TextAlignment::SIMILARITY_THRESHOLD) && ((len1 - (cmp.str1_match_final - cmp.str1_match_initial + 1)) < len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD))
          candidates << {idx:i, offset:offset_begin, mode:mode, cmp:cmp}
        end
      end
    end

    # return remaining source and targets if m.nil?
    break if candidates.empty?

    choice = candidates.max{|a, b| a[:cmp].similarity <=> a[:cmp].similarity}
    m = choice[:idx]
    mode = choice[:mode]

    index = if mode == :t_in_s
      {divid:targets[m][:divid], region:[0, source.size]}
    else # :s_in_t
      cmp = choice[:cmp]
      offset = choice[:offset]
      {divid:targets[m][:divid], region:[cmp.str2_match_initial + offset, cmp.str2_match_final + offset + 1]}
    end

    source = source[0 ... index[:region][0]] + source[index[:region][1] .. -1]
    history << index[:region].dup

    before_begin = index[:region][0]
    before_end = index[:region][1]

    rhistory = history.reverse
    rhistory.shift
    rhistory.each do |h|
      gap = h[1] - h[0]
      index[:region][0] += gap if index[:region][0] >= h[0]
      index[:region][1] += gap if index[:region][1] >  h[0]
    end

    indice << index

    targets.delete_at(m)
  end

  unless source.strip.empty? && targets.empty?
    index = {divid:nil}
    index[:remaining_source] = source unless source.strip.empty?
    index[:remaining_targets] = targets.collect{|s| s[:divid]} unless targets.empty?
    indice << index
  end

  indice
end

._find_divisions_old(source, targets) ⇒ Object



124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# File 'lib/text_alignment/find_divisions.rb', line 124

def _find_divisions_old(source, targets)
  mode, m, c, offset_begin = nil, nil, nil, nil

  targets.each_with_index do |target, i|
    if source.size < target[:text].size
      mode = :t_in_s
      str1 = source
      str2 = target[:text]
    else
      mode = :s_in_t
      str1 = target[:text]
      str2 = source
    end

    len1 = str1.length
    len2 = str2.length

    offset_begin, offset_end = 0, -1
    offset_begin, offset_end = approximate_fit(str1, str2) if (len2 - len1) > len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD)

    unless offset_begin.nil?
      c = TextAlignment::LCSComparison.new(str1, str2[offset_begin .. offset_end])
      if (c.similarity > TextAlignment::SIMILARITY_THRESHOLD) && ((len1 - (c.str1_match_final - c.str1_match_initial + 1)) < len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD))
        m = i
        break
      end
    end
  end

  # return remaining source and targets if m.nil?
  return [[-1, [source, targets.collect{|s| s[:divid]}]]] if m.nil?

  index = if mode == :t_in_s
    [targets[m][:divid], [0, source.size]]
  else # :s_in_t
    [targets[m][:divid], [c.str2_match_initial + offset_begin, c.str2_match_final + offset_begin + 1]]
  end

  next_source = source[0 ... index[1][0]] + source[index[1][1] .. -1]
  targets.delete_at(m)

  if next_source.strip.empty? || targets.empty?
    return [index]
  else
    more_index = _find_divisions(next_source, targets)
    gap = index[1][1] - index[1][0]
    more_index.each do |i|
      if (i[0] > -1)
        i[1][0] += gap if i[1][0] >= index[1][0]
        i[1][1] += gap if i[1][1] >  index[1][0]
      end
    end
    return [index] + more_index
  end
end

.approximate_fit(str1, str2) ⇒ Object

If finds an approximate region of str2 that contains str1

Raises:

  • (ArgumentError)


13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/text_alignment/approximate_fit.rb', line 13

def approximate_fit(str1, str2)
  raise ArgumentError, 'nil string' if str1.nil? || str2.nil?
  return 0, str2.length if str2.length < TextAlignment::MIN_LENGTH_FOR_APPROXIMATION

  ngram1 = (0 .. str1.length - TextAlignment::SIZE_NGRAM).collect{|i| str1[i, TextAlignment::SIZE_NGRAM]}
  ngram2 = (0 .. str2.length - TextAlignment::SIZE_NGRAM).collect{|i| str2[i, TextAlignment::SIZE_NGRAM]}
  ngram_shared = ngram1 & ngram2

  # If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
  return nil, nil if ngram_shared.empty?

  signature_ngrams = ngram_shared.select{|g| ngram2.count(g) == 1}
  return nil, nil if signature_ngrams.empty? #raise "no signature ngram"

  cache = {}
  fit_begin, fit_end = nil, nil
  signature_ngrams.each do |signature_ngram|
    loc_signature_ngram_in_str1 = str1.index(signature_ngram)
    loc_signature_ngram_in_str2 = str2.index(signature_ngram)

    # approximate the beginning of the fit
    fit_begin = loc_signature_ngram_in_str2 - loc_signature_ngram_in_str1 - (loc_signature_ngram_in_str1 * TextAlignment::BUFFER_RATE).to_i
    fit_begin = 0 if fit_begin < 0

    # approximate the end of the fit
    offset_end = str1.length - loc_signature_ngram_in_str1
    fit_end = loc_signature_ngram_in_str2 + offset_end + (offset_end * TextAlignment::BUFFER_RATE).to_i
    fit_end = str2.length if fit_end > str2.length

    next if cache.has_key?("#{fit_begin}-#{fit_end}")
    text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
    cache["#{fit_begin}-#{fit_end}"] = text_similarity

    break if text_similarity > TextAlignment::TEXT_SIMILARITY_THRESHOLD
    fit_begin, fit_end = nil, nil
  end
  return fit_begin, fit_end if fit_begin && fit_end && fit_begin < fit_end
  return nil, nil
end

.cdiff(str1, str2) ⇒ Object

Raises:

  • (ArgumentError)


10
11
12
13
14
# File 'lib/text_alignment/lcs_cdiff.rb', line 10

def cdiff(str1, str2)
  raise ArgumentError, "nil string" if str1.nil? || str2.nil?
  raise "a nil character appears in the input string" if str1.index(TextAlignment::NIL_CHARACTER) || str2.index(TextAlignment::NIL_CHARACTER)
  sdiff2cdiff(Diff::LCS.sdiff(str1, str2))
end

.find_divisions(source, targets, mappings = []) ⇒ Object

It finds, among the targets, the right divisions for the taraget text to fit in.

Raises:

  • (ArgumentError)


15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# File 'lib/text_alignment/find_divisions.rb', line 15

def find_divisions(source, targets, mappings = [])
  raise ArgumentError, "nil source"           if source == nil
  raise ArgumentError, "nil or empty targets" if targets == nil || targets.empty?
  raise ArgumentError, "nil mappings"         if mappings == nil

  character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
  mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
  characters_from = character_mappings.collect{|m| m[0]}.join
  characters_to   = character_mappings.collect{|m| m[1]}.join
  characters_to.gsub!(/-/, '\-')

  source.tr!(characters_from, characters_to)
  targets.each{|target| target[:text].tr!(characters_from, characters_to)}

  # to process smaller ones first
  targets.sort!{|s1, s2| s1[:text].size <=> s2[:text].size}

  TextAlignment._find_divisions(source, targets)
end

.glcs_required?(str1, mappings = []) ⇒ Boolean

Returns:

  • (Boolean)

Raises:

  • (ArgumentError)


5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# File 'lib/text_alignment/glcs_required.rb', line 5

def glcs_required?(str1, mappings = [])
  raise ArgumentError, "nil string" if str1.nil?
  raise ArgumentError, "nil mappings" if mappings.nil?

  # character mappings can be safely applied to the strings withoug changing the position of other characters
  character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
  characters_from = character_mappings.collect{|m| m[0]}.join
  characters_to   = character_mappings.collect{|m| m[1]}.join
  characters_to.gsub!(/-/, '\-')

  str1.tr!(characters_from, characters_to)

  str1 =~/([^\p{ASCII}][^\p{ASCII}])/
  $1
end

.sdiff2cdiff(sdiff) ⇒ Object

Raises:

  • (ArgumentError)


16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/text_alignment/lcs_cdiff.rb', line 16

def sdiff2cdiff (sdiff)
  raise ArgumentError, "nil sdiff" if sdiff.nil?

  cdiff_str1, cdiff_str2 = '', ''

  sdiff.each do |h|
    case h.action
    when '='
      cdiff_str1 += h.old_element
      cdiff_str2 += h.new_element
    when '!'
      cdiff_str1 += h.old_element + TextAlignment::NIL_CHARACTER
      cdiff_str2 += TextAlignment::NIL_CHARACTER + h.new_element
    when '-'
      cdiff_str1 += h.old_element
      cdiff_str2 += TextAlignment::NIL_CHARACTER
    when '+'
      cdiff_str1 += TextAlignment::NIL_CHARACTER
      cdiff_str2 += h.new_element
    end
  end

  cdiff_str1.gsub(/\n/, ' ') + "\n>>>>><<<<<\n" + cdiff_str2.gsub(/\n/, ' ')
end