Module: TextAlignment

Defined in:
lib/text_alignment/find_divisions.rb,
lib/text_alignment/mappings.rb,
lib/text_alignment/lcs_cdiff.rb,
lib/text_alignment/lcs_cdiff.rb,
lib/text_alignment/glcs_required.rb,
lib/text_alignment/find_divisions.rb,
lib/text_alignment/glcs_alignment.rb,
lib/text_alignment/lcs_comparison.rb,
lib/text_alignment/text_alignment.rb,
lib/text_alignment/approximate_fit.rb,
lib/text_alignment/approximate_fit.rb,
lib/text_alignment/glcs_alignment_fast.rb

Overview

approximate the location of str1 in str2

Defined Under Namespace

Classes: GLCSAlignment, GLCSTextAlignment, LCSAlignment, LCSComparison, LCSMin, TextAlignment

Constant Summary collapse

MAPPINGS =
[
  ["©", "(c)"],   #U+00A9 (Copyright Sign)

  ["α", "alpha"],   #U+03B1 (greek small letter alpha)
  ["β", "beta"],    #U+03B2 (greek small letter beta)
  ["γ", "gamma"],   #U+03B3 (greek small letter gamma)
  ["δ", "delta"],   #U+03B4 (greek small letter delta)
  ["ε", "epsilon"], #U+03B5 (greek small letter epsilon)
  ["ζ", "zeta"],    #U+03B6 (greek small letter zeta)
  ["η", "eta"],     #U+03B7 (greek small letter eta)
  ["θ", "theta"],   #U+03B7 (greek small letter eta)
  ["ι", "iota"],    #U+03B7 (greek small letter eta)
  ["κ", "kappa"],   #U+03BA (greek small letter kappa)
  ["λ", "lambda"],  #U+03BB (greek small letter lambda)
  ["λ", "lamda"],  #U+03BB (greek small letter lambda)
  ["μ", "mu"],      #U+03BC (greek small letter mu)
  ["ν", "nu"],      #U+03BD (greek small letter nu)
  ["ξ", "xi"],      #U+03BE (greek small letter xi)
  ["ο", "omicron"], #U+03BF (greek small letter omicron)
  ["π", "pi"],      #U+03C0 (greek small letter pi)
  ["ρ", "rho"],     #U+03C1 (greek small letter rho)
  ["σ", "sigma"],   #U+03C3 (greek small letter sigma)
  ["τ", "tau"],     #U+03C4 (greek small letter tau)
  ["υ", "upsilon"], #U+03C5 (greek small letter upsilon)
  ["φ", "phi"],     #U+03C6 (greek small letter phi)
  ["χ", "chi"],     #U+03C7 (greek small letter chi)
  ["ψ", "psi"],     #U+03C8 (greek small letter psi)
  ["ω", "omega"],   #U+03C9 (greek small letter omega)

  ["Α", "Alpha"],   #U+0391 (greek capital letter alpha)
  ["Β", "Beta"],    #U+0392 (greek capital letter beta)
  ["Γ", "Gamma"],   #U+0393 (greek capital letter gamma)
  ["Δ", "Delta"],   #U+0394 (greek capital letter delta)
  ["Ε", "Epsilon"], #U+0395 (greek capital letter epsilon)
  ["Ζ", "Zeta"],    #U+0396 (greek capital letter zeta)
  ["Η", "Eta"],     #U+0397 (greek capital letter eta)
  ["Θ", "Theta"],   #U+0398 (greek capital letter theta)
  ["Ι", "Iota"],    #U+0399 (greek capital letter iota)
  ["Κ", "Kappa"],   #U+039A (greek capital letter kappa)
  ["Λ", "Lambda"],  #U+039B (greek capital letter lambda)
  ["Λ", "Lamda"],  #U+039B (greek capital letter lambda)
  ["Μ", "Mu"],      #U+039C (greek capital letter mu)
  ["Ν", "Nu"],      #U+039D (greek capital letter nu)
  ["Ξ", "Xi"],      #U+039E (greek capital letter xi)
  ["Ο", "Omicron"], #U+039F (greek capital letter omicron)
  ["Π", "Pi"],      #U+03A0 (greek capital letter pi)
  ["Ρ", "Rho"],     #U+03A1 (greek capital letter rho)
  ["Σ", "Sigma"],   #U+03A3 (greek capital letter sigma)
  ["Τ", "Tau"],     #U+03A4 (greek capital letter tau)
  ["Υ", "Upsilon"], #U+03A5 (greek capital letter upsilon)
  ["Φ", "Phi"],     #U+03A6 (greek capital letter phi)
  ["Χ", "Chi"],     #U+03A7 (greek capital letter chi)
  ["Ψ", "Psi"],     #U+03A8 (greek capital letter Psi)
  ["Ω", "Omega"],   #U+03A9 (greek capital letter omega)

  ["ϕ", "phi"],     #U+03D5 (greek phi symbol)

  ["×", "x"],       #U+00D7 (multiplication sign)
  ["", "*"],       #U+2022 (bullet)
  ["", " "],       #U+2009 (thin space)
  ["", " "],       #U+200A (hair space)
  [" ", " "],       #U+00A0 (no-break space)
  [" ", " "],       #U+3000 (ideographic space)
  ["", "-"],       #U+2212 (minus sign)
  ["", "-"],       #U+2013 (en dash)
  ["", "'"],       #U+2032 (prime)
  ["", "'"],       #U+2018 (left single quotation mark)
  ["", "'"],       #U+2019 (right single quotation mark)
  ["", '"'],       #U+201C (left double quotation mark)
  ["", '"'],        #U+201D (right double quotation mark)
  ['"', "''"]
]
NIL_CHARACTER =
'_'
SIMILARITY_THRESHOLD =
0.8
SIGNATURE_NGRAM =
5
NOMATCH_CHARS =
"@^|#$%&_"
MIN_LENGTH_FOR_APPROXIMATION =
50
BUFFER_RATE =
0.2

Class Method Summary collapse

Class Method Details

._find_divisions(target, sources) ⇒ Object



36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# File 'lib/text_alignment/find_divisions.rb', line 36

def _find_divisions(target, sources)
  mode, m, c, offset_begin = nil, nil, nil, nil

  sources.each_with_index do |source, i|
    if target.size < source[:text].size
      mode = :t_in_s
      str1 = target
      str2 = source[:text]
    else
      mode = :s_in_t
      str1 = source[:text]
      str2 = target
    end

    len1 = str1.length
    len2 = str2.length

    offset_begin, offset_end = 0, -1
    offset_begin, offset_end = approximate_fit(str1, str2) if (len2 - len1) > len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD)

    unless offset_begin.nil?
      c = TextAlignment::LCSComparison.new(str1, str2[offset_begin .. offset_end])
      if (c.similarity > TextAlignment::SIMILARITY_THRESHOLD) && ((len1 - (c.str1_match_final - c.str1_match_initial + 1)) < len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD))
        m = i
        break
      end
    end
  end

  # return remaining target and sources if m.nil?
  return [[-1, [target, sources.collect{|s| s[:divid]}]]] if m.nil?

  index = if mode == :t_in_s
    [sources[m][:divid], [0, target.size]]
  else # :s_in_t
    [sources[m][:divid], [c.str2_match_initial + offset_begin, c.str2_match_final + offset_begin + 1]]
  end

  next_target = target[0 ... index[1][0]] + target[index[1][1] .. -1]
  sources.delete_at(m)

  if next_target.strip.empty? || sources.empty?
    return [index]
  else
    more_index = _find_divisions(next_target, sources)
    gap = index[1][1] - index[1][0]
    more_index.each do |i|
      if (i[0] > -1)
        i[1][0] += gap if i[1][0] >= index[1][0]
        i[1][1] += gap if i[1][1] >  index[1][0]
      end
    end
    return [index] + more_index
  end
end

.approximate_fit(str1, str2) ⇒ Object

If finds an approximate region of str2 that contains str1

Raises:

  • (ArgumentError)


14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# File 'lib/text_alignment/approximate_fit.rb', line 14

def approximate_fit(str1, str2)
  raise ArgumentError, 'nil string' if str1.nil? || str2.nil?
  return 0, str2.length if str2.length < TextAlignment::MIN_LENGTH_FOR_APPROXIMATION

  ngram1 = (0 .. str1.length - TextAlignment::SIGNATURE_NGRAM).collect{|i| str1[i, TextAlignment::SIGNATURE_NGRAM]}
  ngram2 = (0 .. str2.length - TextAlignment::SIGNATURE_NGRAM).collect{|i| str2[i, TextAlignment::SIGNATURE_NGRAM]}
  ngram_shared = ngram1 & ngram2

  # If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
  return nil, nil if ngram_shared.empty?

  # approximate the beginning of the fit
  signature_ngram = ngram_shared.detect{|g| ngram2.count(g) == 1}

  return nil, nil if signature_ngram.nil? #raise "no signature ngram"
  offset = str1.index(signature_ngram)
  fit_begin = str2.index(signature_ngram) - offset - (offset * TextAlignment::BUFFER_RATE).to_i
  fit_begin = 0 if fit_begin < 0    

  # to change the order according to ngram2
  ngram_shared = ngram2 & ngram1

  # approximate the end of the fit
  ngram_shared_reverse = ngram_shared.reverse
  ngram2_reverse = ngram2.reverse
  signature_ngram = ngram_shared_reverse.detect{|g| ngram2_reverse.count(g) == 1}
  return nil, nil if signature_ngram.nil? # raise "no signature ngram" 
  offset = str1.length - str1.rindex(signature_ngram)
  fit_end = str2.rindex(signature_ngram) + offset + (offset * TextAlignment::BUFFER_RATE).to_i
  fit_end = str2.length if fit_end > str2.length

  return nil, nil if fit_begin >= fit_end
  return fit_begin, fit_end
end

.cdiff(str1, str2) ⇒ Object

Raises:

  • (ArgumentError)


12
13
14
15
16
# File 'lib/text_alignment/lcs_cdiff.rb', line 12

def cdiff(str1, str2)
  raise ArgumentError, "nil string" if str1.nil? || str2.nil?
  raise "a nil character appears in the input string" if str1.index(TextAlignment::NIL_CHARACTER) || str2.index(TextAlignment::NIL_CHARACTER)
  sdiff2cdiff(Diff::LCS.sdiff(str1, str2))
end

.find_divisions(target, sources, mappings = []) ⇒ Object

It finds, among the sources, the right divisions for the taraget text to fit in.

Raises:

  • (ArgumentError)


17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/text_alignment/find_divisions.rb', line 17

def find_divisions(target, sources, mappings = [])
  raise ArgumentError, "nil target"           if target == nil
  raise ArgumentError, "nil or empty sources" if sources == nil || sources.empty?
  raise ArgumentError, "nil mappings"         if mappings == nil

  character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
  mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
  characters_from = character_mappings.collect{|m| m[0]}.join
  characters_to   = character_mappings.collect{|m| m[1]}.join
  characters_to.gsub!(/-/, '\-')

  target.tr!(characters_from, characters_to)
  sources.each{|source| source[:text].tr!(characters_from, characters_to)}

  sources.sort!{|s1, s2| s1[:text].size <=> s2[:text].size}

  TextAlignment._find_divisions(target, sources)
end

.glcs_required?(str1, mappings = []) ⇒ Boolean

Returns:

  • (Boolean)

Raises:

  • (ArgumentError)


5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# File 'lib/text_alignment/glcs_required.rb', line 5

def glcs_required?(str1, mappings = [])
  raise ArgumentError, "nil string" if str1.nil?
  raise ArgumentError, "nil mappings" if mappings.nil?

  # character mappings can be safely applied to the strings withoug changing the position of other characters
  character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
  characters_from = character_mappings.collect{|m| m[0]}.join
  characters_to   = character_mappings.collect{|m| m[1]}.join
  characters_to.gsub!(/-/, '\-')

  str1.tr!(characters_from, characters_to)

  str1 =~/([^\p{ASCII}][^\p{ASCII}])/
  $1
end

.sdiff2cdiff(sdiff) ⇒ Object

Raises:

  • (ArgumentError)


18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/text_alignment/lcs_cdiff.rb', line 18

def sdiff2cdiff (sdiff)
  raise ArgumentError, "nil sdiff" if sdiff.nil?

  cdiff_str1, cdiff_str2 = '', ''

  sdiff.each do |h|
    case h.action
    when '='
      cdiff_str1 += h.old_element
      cdiff_str2 += h.new_element
    when '!'
      cdiff_str1 += h.old_element + TextAlignment::NIL_CHARACTER
      cdiff_str2 += TextAlignment::NIL_CHARACTER + h.new_element
    when '-'
      cdiff_str1 += h.old_element
      cdiff_str2 += TextAlignment::NIL_CHARACTER
    when '+'
      cdiff_str1 += TextAlignment::NIL_CHARACTER
      cdiff_str2 += h.new_element
    end
  end

  cdiff_str1.gsub(/\n/, ' ') + "\n" + cdiff_str2.gsub(/\n/, ' ')
end