Module: TextAlignment

Defined in:
lib/text_alignment/constants.rb,
lib/text_alignment/lcs_cdiff.rb,
lib/text_alignment/char_mapping.rb,
lib/text_alignment/anchor_finder.rb,
lib/text_alignment/glcs_required.rb,
lib/text_alignment/find_divisions.rb,
lib/text_alignment/glcs_alignment.rb,
lib/text_alignment/lcs_comparison.rb,
lib/text_alignment/text_alignment.rb,
lib/text_alignment/approximate_fit.rb,
lib/text_alignment/cultivation_map.rb,
lib/text_alignment/mixed_alignment.rb,
lib/text_alignment/glcs_alignment_fast.rb

Defined Under Namespace

Classes: AnchorFinder, CharMapping, CultivationMap, GLCSAlignment, GLCSTextAlignment, LCSAlignment, LCSComparison, LCSMin, MixedAlignment, TextAlignment

Constant Summary collapse

SIZE_NGRAM =
8
SIZE_WINDOW =
10
BUFFER_RATE =
0.1
BUFFER_MIN =
20
TEXT_SIMILARITY_THRESHOLD =
0.9
NIL_CHARACTER =
'_'
CHAR_MAPPING =
[
	["©", "(c)"],			#U+00A9 (Copyright Sign)

	["α", "alpha"],		#U+03B1 (greek small letter alpha)
	["β", "beta"],		#U+03B2 (greek small letter beta)
	["γ", "gamma"],		#U+03B3 (greek small letter gamma)
	["δ", "delta"],		#U+03B4 (greek small letter delta)
	["ε", "epsilon"],	#U+03B5 (greek small letter epsilon)
	["ζ", "zeta"],		#U+03B6 (greek small letter zeta)
	["η", "eta"],			#U+03B7 (greek small letter eta)
	["θ", "theta"],		#U+03B7 (greek small letter eta)
	["ι", "iota"],		#U+03B7 (greek small letter eta)
	["κ", "kappa"],		#U+03BA (greek small letter kappa)
	["λ", "lambda"],	#U+03BB (greek small letter lambda)
	["λ", "lamda"],		#U+03BB (greek small letter lambda)
	["μ", "mu"],			#U+03BC (greek small letter mu)
	["ν", "nu"],			#U+03BD (greek small letter nu)
	["ξ", "xi"],			#U+03BE (greek small letter xi)
	["ο", "omicron"],	#U+03BF (greek small letter omicron)
	["π", "pi"],			#U+03C0 (greek small letter pi)
	["ρ", "rho"],			#U+03C1 (greek small letter rho)
	["σ", "sigma"],		#U+03C3 (greek small letter sigma)
	["τ", "tau"],			#U+03C4 (greek small letter tau)
	["υ", "upsilon"],	#U+03C5 (greek small letter upsilon)
	["φ", "phi"],			#U+03C6 (greek small letter phi)
	["χ", "chi"],			#U+03C7 (greek small letter chi)
	["ψ", "psi"],			#U+03C8 (greek small letter psi)
	["ω", "omega"],		#U+03C9 (greek small letter omega)

	["Α", "Alpha"],		#U+0391 (greek capital letter alpha)
	["Β", "Beta"],		#U+0392 (greek capital letter beta)
	["Γ", "Gamma"],		#U+0393 (greek capital letter gamma)
	["Δ", "Delta"],		#U+0394 (greek capital letter delta)
	["Ε", "Epsilon"],	#U+0395 (greek capital letter epsilon)
	["Ζ", "Zeta"],		#U+0396 (greek capital letter zeta)
	["Η", "Eta"],			#U+0397 (greek capital letter eta)
	["Θ", "Theta"],		#U+0398 (greek capital letter theta)
	["Ι", "Iota"],		#U+0399 (greek capital letter iota)
	["Κ", "Kappa"],		#U+039A (greek capital letter kappa)
	["Λ", "Lambda"],	#U+039B (greek capital letter lambda)
	["Λ", "Lamda"],		#U+039B (greek capital letter lambda)
	["Μ", "Mu"],			#U+039C (greek capital letter mu)
	["Ν", "Nu"],			#U+039D (greek capital letter nu)
	["Ξ", "Xi"],			#U+039E (greek capital letter xi)
	["Ο", "Omicron"],	#U+039F (greek capital letter omicron)
	["Π", "Pi"],			#U+03A0 (greek capital letter pi)
	["Ρ", "Rho"],			#U+03A1 (greek capital letter rho)
	["Σ", "Sigma"],		#U+03A3 (greek capital letter sigma)
	["Τ", "Tau"],			#U+03A4 (greek capital letter tau)
	["Υ", "Upsilon"],	#U+03A5 (greek capital letter upsilon)
	["Φ", "Phi"],			#U+03A6 (greek capital letter phi)
	["Χ", "Chi"],			#U+03A7 (greek capital letter chi)
	["Ψ", "Psi"],			#U+03A8 (greek capital letter Psi)
	["Ω", "Omega"],		#U+03A9 (greek capital letter omega)

	["ϕ", "phi"],			#U+03D5 (greek phi symbol)

	["×", "x"],				#U+00D7 (multiplication sign)
	["", "*"],				#U+2022 (bullet)
	["", " "],				#U+2009 (thin space)
	["", " "],				#U+200A (hair space)
	["", " "],				#U+202F (narrow no-break space)
	[" ", " "],				#U+00A0 (Non-Breaking space)
	[" ", " "],				#U+3000 (ideographic space)
	["", "-"],				#U+2010 (Hyphen)
	["", "-"],				#U+2011 (Non-Breaking Hyphen)
	["", "-"],				#U+2212 (minus sign)
	["", "-"],				#U+2013 (en dash)
	["", "'"],				#U+2032 (prime)
	["", "'"],				#U+2018 (left single quotation mark)
	["", "'"],				#U+2019 (right single quotation mark)
	["", '"'],				#U+201C (left double quotation mark)
	["", '"'],				#U+201D (right double quotation mark)
	['"', "''"]
]
SIMILARITY_THRESHOLD =

to work on the hash representation of denotations to assume that there is no bag representation to this method

0.7
MIN_LENGTH_FOR_APPROXIMATION =

approximate the location of str1 in str2

50

Class Method Summary collapse

Class Method Details

._find_divisions(_source, _targets) ⇒ Object



35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# File 'lib/text_alignment/find_divisions.rb', line 35

def _find_divisions(_source, _targets)
	indice = []
	history = []
	cache = {}
	source = _source.dup
	targets = _targets.dup
	until source.strip.empty? || targets.empty?
		mode, cmp = nil, nil
		candidates = []
		targets.each_with_index do |target, i|
			if source.size < target[:text].size
				mode = :t_in_s
				str1 = source
				str2 = target[:text]
			else
				mode = :s_in_t
				str1 = target[:text]
				str2 = source
			end

			len1 = str1.length
			len2 = str2.length

			offset_begin, offset_end = if (len2 - len1) > len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD)
				approximate_fit(str1, str2)
			else
				# the whole source
				[0, -1]
			end

			unless offset_begin.nil?
				key = str1 + ' _:_ ' + str2[offset_begin .. offset_end]
				cmp = if cache.has_key? key
					cache[key]
				else
					cmp = TextAlignment::LCSComparison.new(str1, str2[offset_begin .. offset_end])
				end
				cache[key] = cmp

				if (cmp.similarity > TextAlignment::SIMILARITY_THRESHOLD) && ((len1 - (cmp.str1_match_final - cmp.str1_match_initial + 1)) < len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD))
					candidates << {idx:i, offset:offset_begin, mode:mode, cmp:cmp}
				end
			end
		end

		# return remaining source and targets if m.nil?
		break if candidates.empty?

		choice = candidates.max{|a, b| a[:cmp].similarity <=> a[:cmp].similarity}
		m = choice[:idx]
		mode = choice[:mode]

		index = if mode == :t_in_s
			{divid:targets[m][:divid], region:[0, source.size]}
		else # :s_in_t
			cmp = choice[:cmp]
			offset = choice[:offset]
			{divid:targets[m][:divid], region:[cmp.str2_match_initial + offset, cmp.str2_match_final + offset + 1]}
		end

		source = source[0 ... index[:region][0]] + source[index[:region][1] .. -1]
		history << index[:region].dup

		before_begin = index[:region][0]
		before_end = index[:region][1]

		rhistory = history.reverse
		rhistory.shift
		rhistory.each do |h|
			gap = h[1] - h[0]
			index[:region][0] += gap if index[:region][0] >= h[0]
			index[:region][1] += gap if index[:region][1] >  h[0]
		end

		indice << index

		targets.delete_at(m)
	end

	unless source.strip.empty? && targets.empty?
		index = {divid:nil}
		index[:remaining_source] = source unless source.strip.empty?
		index[:remaining_targets] = targets.collect{|s| s[:divid]} unless targets.empty?
		indice << index
	end

	indice
end

._find_divisions_old(source, targets) ⇒ Object



124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# File 'lib/text_alignment/find_divisions.rb', line 124

def _find_divisions_old(source, targets)
	mode, m, c, offset_begin = nil, nil, nil, nil

	targets.each_with_index do |target, i|
		if source.size < target[:text].size
			mode = :t_in_s
			str1 = source
			str2 = target[:text]
		else
			mode = :s_in_t
			str1 = target[:text]
			str2 = source
		end

		len1 = str1.length
		len2 = str2.length

		offset_begin, offset_end = 0, -1
		offset_begin, offset_end = approximate_fit(str1, str2) if (len2 - len1) > len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD)

		unless offset_begin.nil?
			c = TextAlignment::LCSComparison.new(str1, str2[offset_begin .. offset_end])
			if (c.similarity > TextAlignment::SIMILARITY_THRESHOLD) && ((len1 - (c.str1_match_final - c.str1_match_initial + 1)) < len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD))
				m = i
				break
			end
		end
	end

	# return remaining source and targets if m.nil?
	return [[-1, [source, targets.collect{|s| s[:divid]}]]] if m.nil?

	index = if mode == :t_in_s
		[targets[m][:divid], [0, source.size]]
	else # :s_in_t
		[targets[m][:divid], [c.str2_match_initial + offset_begin, c.str2_match_final + offset_begin + 1]]
	end

	next_source = source[0 ... index[1][0]] + source[index[1][1] .. -1]
	targets.delete_at(m)

	if next_source.strip.empty? || targets.empty?
		return [index]
	else
		more_index = _find_divisions(next_source, targets)
		gap = index[1][1] - index[1][0]
		more_index.each do |i|
			if (i[0] > -1)
				i[1][0] += gap if i[1][0] >= index[1][0]
				i[1][1] += gap if i[1][1] >  index[1][0]
			end
		end
		return [index] + more_index
	end
end

.approximate_fit(str1, str2) ⇒ Object

If finds an approximate region of str2 that contains str1

Raises:

  • (ArgumentError)


13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/text_alignment/approximate_fit.rb', line 13

def approximate_fit(str1, str2)
	raise ArgumentError, 'nil string' if str1.nil? || str2.nil?
	return 0, str2.length if str2.length < TextAlignment::MIN_LENGTH_FOR_APPROXIMATION

	ngram1 = (0 .. str1.length - TextAlignment::SIZE_NGRAM).collect{|i| str1[i, TextAlignment::SIZE_NGRAM]}
	ngram2 = (0 .. str2.length - TextAlignment::SIZE_NGRAM).collect{|i| str2[i, TextAlignment::SIZE_NGRAM]}
	ngram_shared = ngram1 & ngram2

	# If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
	return nil, nil if ngram_shared.empty?

	signature_ngrams = ngram_shared.select{|g| ngram2.count(g) == 1}
	return nil, nil if signature_ngrams.empty? #raise "no signature ngram"

	cache = {}
	fit_begin, fit_end = nil, nil
	signature_ngrams.each do |signature_ngram|
		loc_signature_ngram_in_str1 = str1.index(signature_ngram)
		loc_signature_ngram_in_str2 = str2.index(signature_ngram)

		# approximate the beginning of the fit
		fit_begin = loc_signature_ngram_in_str2 - loc_signature_ngram_in_str1 - (loc_signature_ngram_in_str1 * TextAlignment::BUFFER_RATE).to_i
		fit_begin = 0 if fit_begin < 0

		# approximate the end of the fit
		offset_end = str1.length - loc_signature_ngram_in_str1
		fit_end = loc_signature_ngram_in_str2 + offset_end + (offset_end * TextAlignment::BUFFER_RATE).to_i
		fit_end = str2.length if fit_end > str2.length

		next if cache.has_key?("#{fit_begin}-#{fit_end}")
		text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
		cache["#{fit_begin}-#{fit_end}"] = text_similarity

		break if text_similarity > TextAlignment::TEXT_SIMILARITY_THRESHOLD
		fit_begin, fit_end = nil, nil
	end
	return fit_begin, fit_end if fit_begin && fit_end && fit_begin < fit_end
	return nil, nil
end

.cdiff(str1, str2) ⇒ Object

Raises:

  • (ArgumentError)


10
11
12
13
14
# File 'lib/text_alignment/lcs_cdiff.rb', line 10

def cdiff(str1, str2)
	raise ArgumentError, "nil string" if str1.nil? || str2.nil?
	raise "a nil character appears in the input string" if str1.index(TextAlignment::NIL_CHARACTER) || str2.index(TextAlignment::NIL_CHARACTER)
	sdiff2cdiff(Diff::LCS.sdiff(str1, str2))
end

.find_divisions(source, targets, mappings = []) ⇒ Object

It finds, among the targets, the right divisions for the taraget text to fit in.

Raises:

  • (ArgumentError)


15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# File 'lib/text_alignment/find_divisions.rb', line 15

def find_divisions(source, targets, mappings = [])
	raise ArgumentError, "nil source"           if source == nil
	raise ArgumentError, "nil or empty targets" if targets == nil || targets.empty?
	raise ArgumentError, "nil mappings"         if mappings == nil

	character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
	mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
	characters_from = character_mappings.collect{|m| m[0]}.join
	characters_to   = character_mappings.collect{|m| m[1]}.join
	characters_to.gsub!(/-/, '\-')

	source.tr!(characters_from, characters_to)
	targets.each{|target| target[:text].tr!(characters_from, characters_to)}

	# to process smaller ones first
	targets.sort!{|s1, s2| s1[:text].size <=> s2[:text].size}

	TextAlignment._find_divisions(source, targets)
end

.glcs_required?(str1, mappings = []) ⇒ Boolean

Returns:

  • (Boolean)

Raises:

  • (ArgumentError)


5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# File 'lib/text_alignment/glcs_required.rb', line 5

def glcs_required?(str1, mappings = [])
	raise ArgumentError, "nil string" if str1.nil?
	raise ArgumentError, "nil mappings" if mappings.nil?

	# character mappings can be safely applied to the strings withoug changing the position of other characters
	character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
	characters_from = character_mappings.collect{|m| m[0]}.join
	characters_to   = character_mappings.collect{|m| m[1]}.join
	characters_to.gsub!(/-/, '\-')

	str1.tr!(characters_from, characters_to)

	str1 =~/([^\p{ASCII}][^\p{ASCII}])/
	$1
end

.sdiff2cdiff(sdiff) ⇒ Object

Raises:

  • (ArgumentError)


16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/text_alignment/lcs_cdiff.rb', line 16

def sdiff2cdiff (sdiff)
	raise ArgumentError, "nil sdiff" if sdiff.nil?

	cdiff_str1, cdiff_str2 = '', ''

	sdiff.each do |h|
		case h.action
		when '='
			cdiff_str1 += h.old_element
			cdiff_str2 += h.new_element
		when '!'
			cdiff_str1 += h.old_element + TextAlignment::NIL_CHARACTER
			cdiff_str2 += TextAlignment::NIL_CHARACTER + h.new_element
		when '-'
			cdiff_str1 += h.old_element
			cdiff_str2 += TextAlignment::NIL_CHARACTER
		when '+'
			cdiff_str1 += TextAlignment::NIL_CHARACTER
			cdiff_str2 += h.new_element
		end
	end

	cdiff_str1.gsub(/\n/, ' ') + "\n>>>>><<<<<\n" + cdiff_str2.gsub(/\n/, ' ')
end