Class: TextAlignment::LCSMin

Inherits:
Object
  • Object
show all
Defined in:
lib/text_alignment/lcs_min.rb

Overview

It finds minimal lcs and sdiff of the given strings, str1 and str2. It relies on the diff-lcs gem for the computation of lcs table.

Constant Summary collapse

WHITESPACE =
["\t", "\n", "\v", "\f", "\r", " "].join.freeze
PLACEHOLDER_CHAR =
'_'

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(str1, str2) ⇒ LCSMin

Returns a new instance of LCSMin.

Raises:

  • (ArgumentError)


21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# File 'lib/text_alignment/lcs_min.rb', line 21

def initialize (str1, str2)
  raise ArgumentError, "nil string" if str1.nil? || str2.nil?
  raise ArgumentError, "empty string" if str1.empty? || str2.empty?

  # str1 is copied as it is.
  # str2 is copied with w/s characters replaced with the placeholder characters,
  # to avoid overfitting to w/s characters during LCS computation.
  @str1 = str1
  @str2 = str2.tr(WHITESPACE, PLACEHOLDER_CHAR)

  # find the corresponding minimal range of the two strings
  r = _find_min_range(0, @str1.length - 1, 0, @str2.length - 1)
  if r.nil?
    @sdiff = nil
    @lcs = 0
    return
  end

  @m1_initial, @m1_final, @m2_initial, @m2_final = r[:m1_initial], r[:m1_final], r[:m2_initial], r[:m2_final]

  if @m1_initial.nil?
    @sdiff = nil
    @lcs = 0
  else
    # compute sdiff and lcs
    # here the original str2 is used with all the w/s characters preserved.
    @sdiff = Diff::LCS.sdiff(@str1[@m1_initial..@m1_final], str2[@m2_initial..@m2_final])
    @lcs = @sdiff.count{|d| d.action == '='}

    # adjust the position values of sdiff
    @sdiff.each do |h|
      h.old_position += @m1_initial unless h.old_position.nil?
      h.new_position += @m2_initial unless h.new_position.nil?
    end

    (0 ... @m2_initial).reverse_each{|i| @sdiff.unshift(Diff::LCS::ContextChange.new('+', nil, nil, i, @str2[i]))}
    (0 ... @m1_initial).reverse_each{|i| @sdiff.unshift(Diff::LCS::ContextChange.new('-', i, @str1[i], nil, nil))}
    (@m1_final + 1 ... @str1.length).each{|i| @sdiff.push(Diff::LCS::ContextChange.new('-', i, @str1[i], nil, nil))}
    (@m2_final + 1 ... @str2.length).each{|i| @sdiff.push(Diff::LCS::ContextChange.new('+', nil, nil, i, @str2[i]))}
  end
end

Instance Attribute Details

#lcsObject (readonly)

Returns the value of attribute lcs.



14
15
16
# File 'lib/text_alignment/lcs_min.rb', line 14

def lcs
  @lcs
end

#m1_finalObject (readonly)

Returns the value of attribute m1_final.



14
15
16
# File 'lib/text_alignment/lcs_min.rb', line 14

def m1_final
  @m1_final
end

#m1_initialObject (readonly)

Returns the value of attribute m1_initial.



14
15
16
# File 'lib/text_alignment/lcs_min.rb', line 14

def m1_initial
  @m1_initial
end

#m2_finalObject (readonly)

Returns the value of attribute m2_final.



14
15
16
# File 'lib/text_alignment/lcs_min.rb', line 14

def m2_final
  @m2_final
end

#m2_initialObject (readonly)

Returns the value of attribute m2_initial.



14
15
16
# File 'lib/text_alignment/lcs_min.rb', line 14

def m2_initial
  @m2_initial
end

#sdiffObject (readonly)

Returns the value of attribute sdiff.



14
15
16
# File 'lib/text_alignment/lcs_min.rb', line 14

def sdiff
  @sdiff
end

Instance Method Details

#_find_min_range(m1_initial, m1_final, m2_initial, m2_final, clcs = 0) ⇒ Object



63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# File 'lib/text_alignment/lcs_min.rb', line 63

def _find_min_range (m1_initial, m1_final, m2_initial, m2_final, clcs = 0)
  return nil if (m1_final - m1_initial < 0) || (m2_final - m2_initial < 0)
  sdiff = Diff::LCS.sdiff(@str1[m1_initial..m1_final], @str2[m2_initial..m2_final])
  lcs = sdiff.count{|d| d.action == '='}

  return nil if lcs == 0
  return nil if lcs < clcs

  match_last  = sdiff.rindex{|d| d.action == '='}
  m1_final    = sdiff[match_last].old_position + m1_initial
  m2_final    = sdiff[match_last].new_position + m2_initial

  match_first = sdiff.index{|d| d.action == '='}
  m1_initial  = sdiff[match_first].old_position + m1_initial
  m2_initial  = sdiff[match_first].new_position + m2_initial

  # attempt for shorter match
  if ((m1_final - m1_initial) > (m2_final - m2_initial))
    r = _find_min_range(m1_initial + 1, m1_final, m2_initial, m2_final, lcs)
    return r unless r.nil?
    r = _find_min_range(m1_initial, m1_final - 1, m2_initial, m2_final, lcs)
    return r unless r.nil?
  else
    r = _find_min_range(m1_initial, m1_final, m2_initial + 1, m2_final, lcs)
    return r unless r.nil?
    r = _find_min_range(m1_initial, m1_final, m2_initial, m2_final - 1, lcs)
    return r unless r.nil?
  end

  return {
    m1_initial: m1_initial,
    m1_final: m1_final,
    m2_initial: m2_initial,
    m2_final: m2_final
  }
end

#num_big_gaps(sdiff, initial, last) ⇒ Object

Raises:

  • (ArgumentError)


100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# File 'lib/text_alignment/lcs_min.rb', line 100

def num_big_gaps (sdiff, initial, last)
  raise ArgumentError, "nil sdiff" if sdiff.nil?
  raise ArgumentError, "invalid indice: #{initial}, #{last}" unless last >= initial

  state1 = :initial
  state2 = :initial
  gaps1 = []
  gaps2 = []

  (initial .. last).each do |i|
    case sdiff[i].action
    when '='
      state1 = :continue
      state2 = :continue
    when '!'
      gaps1 << 1
      state1 = :break

      if state2 == :break
        gaps2[-1] += 1
      else
        gaps2 << 1
      end
      state2 = :continue
    when '+'
      if state1 == :break
        gaps1[-1] += 1
      else
        gaps1 << 1
      end
      state1 = :break
    when '-'
      if state2 == :break
        gaps2[-1] += 1
      else
        gaps2 << 1
      end
      state2 = :break
    end
  end

  num_big_gaps1 = gaps1.select{|g| g > MAX_LEN_BIG_GAP}.length
  num_big_gaps2 = gaps2.select{|g| g > MAX_LEN_BIG_GAP}.length
  num_big_gaps1 + num_big_gaps2
end