Module: TextAlignment

Defined in:: lib/text_alignment/constants.rb,
lib/text_alignment/lcs_cdiff.rb,
lib/text_alignment/char_mapping.rb,
lib/text_alignment/anchor_finder.rb,
lib/text_alignment/glcs_required.rb,
lib/text_alignment/find_divisions.rb,
lib/text_alignment/glcs_alignment.rb,
lib/text_alignment/lcs_comparison.rb,
lib/text_alignment/text_alignment.rb,
lib/text_alignment/approximate_fit.rb,
lib/text_alignment/cultivation_map.rb,
lib/text_alignment/mixed_alignment.rb,
lib/text_alignment/glcs_alignment_fast.rb

Defined Under Namespace

Classes: AnchorFinder, CharMapping, CultivationMap, GLCSAlignment, GLCSTextAlignment, LCSAlignment, LCSComparison, LCSMin, MixedAlignment, TextAlignment

Constant Summary collapse

SIZE_NGRAM =

SIZE_WINDOW =

BUFFER_RATE =

0.1

BUFFER_MIN =

TEXT_SIMILARITY_THRESHOLD =

0.9

NIL_CHARACTER =

'_'

CHAR_MAPPING =

[
	["©", "(c)"],			#U+00A9 (Copyright Sign)

	["α", "alpha"],		#U+03B1 (greek small letter alpha)
	["β", "beta"],		#U+03B2 (greek small letter beta)
	["γ", "gamma"],		#U+03B3 (greek small letter gamma)
	["δ", "delta"],		#U+03B4 (greek small letter delta)
	["ε", "epsilon"],	#U+03B5 (greek small letter epsilon)
	["ζ", "zeta"],		#U+03B6 (greek small letter zeta)
	["η", "eta"],			#U+03B7 (greek small letter eta)
	["θ", "theta"],		#U+03B8 (greek small letter theta)
	["ι", "iota"],		#U+03B9 (greek small letter iota)
	["κ", "kappa"],		#U+03BA (greek small letter kappa)
	["λ", "lambda"],	#U+03BB (greek small letter lambda)
	["λ", "lamda"],		#U+03BB (greek small letter lambda)
	["μ", "mu"],			#U+03BC (greek small letter mu)
	["ν", "nu"],			#U+03BD (greek small letter nu)
	["ξ", "xi"],			#U+03BE (greek small letter xi)
	["ο", "omicron"],	#U+03BF (greek small letter omicron)
	["π", "pi"],			#U+03C0 (greek small letter pi)
	["ρ", "rho"],			#U+03C1 (greek small letter rho)
	["σ", "sigma"],		#U+03C3 (greek small letter sigma)
	["τ", "tau"],			#U+03C4 (greek small letter tau)
	["υ", "upsilon"],	#U+03C5 (greek small letter upsilon)
	["φ", "phi"],			#U+03C6 (greek small letter phi)
	["χ", "chi"],			#U+03C7 (greek small letter chi)
	["ψ", "psi"],			#U+03C8 (greek small letter psi)
	["ω", "omega"],		#U+03C9 (greek small letter omega)

	["Α", "Alpha"],		#U+0391 (greek capital letter alpha)
	["Β", "Beta"],		#U+0392 (greek capital letter beta)
	["Γ", "Gamma"],		#U+0393 (greek capital letter gamma)
	["Δ", "Delta"],		#U+0394 (greek capital letter delta)
	["Ε", "Epsilon"],	#U+0395 (greek capital letter epsilon)
	["Ζ", "Zeta"],		#U+0396 (greek capital letter zeta)
	["Η", "Eta"],			#U+0397 (greek capital letter eta)
	["Θ", "Theta"],		#U+0398 (greek capital letter theta)
	["Ι", "Iota"],		#U+0399 (greek capital letter iota)
	["Κ", "Kappa"],		#U+039A (greek capital letter kappa)
	["Λ", "Lambda"],	#U+039B (greek capital letter lambda)
	["Λ", "Lamda"],		#U+039B (greek capital letter lambda)
	["Μ", "Mu"],			#U+039C (greek capital letter mu)
	["Ν", "Nu"],			#U+039D (greek capital letter nu)
	["Ξ", "Xi"],			#U+039E (greek capital letter xi)
	["Ο", "Omicron"],	#U+039F (greek capital letter omicron)
	["Π", "Pi"],			#U+03A0 (greek capital letter pi)
	["Ρ", "Rho"],			#U+03A1 (greek capital letter rho)
	["Σ", "Sigma"],		#U+03A3 (greek capital letter sigma)
	["Τ", "Tau"],			#U+03A4 (greek capital letter tau)
	["Υ", "Upsilon"],	#U+03A5 (greek capital letter upsilon)
	["Φ", "Phi"],			#U+03A6 (greek capital letter phi)
	["Χ", "Chi"],			#U+03A7 (greek capital letter chi)
	["Ψ", "Psi"],			#U+03A8 (greek capital letter Psi)
	["Ω", "Omega"],		#U+03A9 (greek capital letter omega)

	["ϕ", "phi"],			#U+03D5 (greek phi symbol)

	["×", "x"],				#U+00D7 (multiplication sign)
	["•", "*"],				#U+2022 (bullet)
	[" ", " "],				#U+2009 (thin space)
	[" ", " "],				#U+200A (hair space)
	[" ", " "],				#U+202F (narrow no-break space)
	[" ", " "],				#U+00A0 (Non-Breaking space)
	["　", " "],				#U+3000 (ideographic space)
	["‐", "-"],				#U+2010 (Hyphen)
	["‑", "-"],				#U+2011 (Non-Breaking Hyphen)
	["−", "-"],				#U+2212 (minus sign)
	["–", "-"],				#U+2013 (en dash)
	["′", "'"],				#U+2032 (prime)
	["‘", "'"],				#U+2018 (left single quotation mark)
	["’", "'"],				#U+2019 (right single quotation mark)
	["“", '"'],				#U+201C (left double quotation mark)
	["”", '"'],				#U+201D (right double quotation mark)
	['"', "''"]
]

SIMILARITY_THRESHOLD = to work on the hash representation of denotations to assume that there is no bag representation to this method

0.7

MIN_LENGTH_FOR_APPROXIMATION = approximate the location of str1 in str2

Class Method Summary collapse

._find_divisions(_source, _targets) ⇒ Object
._find_divisions_old(source, targets) ⇒ Object
.approximate_fit(str1, str2) ⇒ Object

It finds an approximate region of str2 that contains str1.
.cdiff(str1, str2) ⇒ Object
.find_divisions(source, targets, mappings = []) ⇒ Object

It finds, among the targets, the right divisions for the target text to fit in.
.glcs_required?(str1, mappings = []) ⇒ Boolean
.sdiff2cdiff(sdiff) ⇒ Object

Class Method Details

._find_divisions(_source, _targets) ⇒ `Object`

# File 'lib/text_alignment/find_divisions.rb', line 35

# Repeatedly matches the remaining targets against the remaining source text,
# each time committing the most similar candidate and cutting its region out of
# the source, until the source is blank, no target is left, or nothing matches.
#
# Works on shallow copies: the +_source+ string and the +_targets+ array
# themselves are not modified.
#
# @param _source [String] the text to be divided
# @param _targets [Array<Hash>] each entry is read through +:divid+ and +:text+
# @return [Array<Hash>] one <tt>{divid:, region: [begin, end]}</tt> per matched
#   target, with regions expressed in coordinates of the original source.
#   When source text and/or targets are left over, a final
#   <tt>{divid: nil}</tt> entry carries +:remaining_source+ and/or
#   +:remaining_targets+ (the latter as a list of divids).
def _find_divisions(_source, _targets)
	indice = []
	history = []	# regions already cut out, each in the coordinates valid when it was cut
	cache = {}		# comparison results keyed by the compared string pair
	source = _source.dup
	targets = _targets.dup

	until source.strip.empty? || targets.empty?
		candidates = []
		targets.each_with_index do |target, i|
			# str1 is always the shorter string and is looked for inside str2
			if source.size < target[:text].size
				mode = :t_in_s
				str1 = source
				str2 = target[:text]
			else
				mode = :s_in_t
				str1 = target[:text]
				str2 = source
			end

			len1 = str1.length
			len2 = str2.length

			# narrow down str2 only when it is considerably longer than str1
			offset_begin, offset_end = if (len2 - len1) > len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD)
				approximate_fit(str1, str2)
			else
				# the whole source
				[0, -1]
			end

			next if offset_begin.nil?

			segment = str2[offset_begin .. offset_end]
			key = str1 + ' _:_ ' + segment
			cmp = (cache[key] ||= TextAlignment::LCSComparison.new(str1, segment))

			matched_len = cmp.str1_match_final - cmp.str1_match_initial + 1
			if (cmp.similarity > TextAlignment::SIMILARITY_THRESHOLD) && ((len1 - matched_len) < len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD))
				candidates << {idx:i, offset:offset_begin, mode:mode, cmp:cmp}
			end
		end

		# nothing fits any more: the rest is reported as remaining
		break if candidates.empty?

		# commit the candidate with the highest similarity
		choice = candidates.max_by{|c| c[:cmp].similarity}
		m = choice[:idx]

		index = if choice[:mode] == :t_in_s
			{divid:targets[m][:divid], region:[0, source.size]}
		else # :s_in_t
			cmp = choice[:cmp]
			offset = choice[:offset]
			{divid:targets[m][:divid], region:[cmp.str2_match_initial + offset, cmp.str2_match_final + offset + 1]}
		end

		# cut the matched region out of the remaining source
		removed = index[:region].dup
		source = source[0 ... removed[0]] + source[removed[1] .. -1]

		# translate the region back to original coordinates by re-inserting,
		# most recent first, the lengths of the regions cut out before it
		history.reverse_each do |h|
			gap = h[1] - h[0]
			index[:region][0] += gap if index[:region][0] >= h[0]
			index[:region][1] += gap if index[:region][1] >  h[0]
		end
		history << removed

		indice << index

		targets.delete_at(m)
	end

	unless source.strip.empty? && targets.empty?
		index = {divid:nil}
		index[:remaining_source] = source unless source.strip.empty?
		index[:remaining_targets] = targets.collect{|s| s[:divid]} unless targets.empty?
		indice << index
	end

	indice
end

._find_divisions_old(source, targets) ⇒ `Object`

# File 'lib/text_alignment/find_divisions.rb', line 124

# Earlier, recursive variant of the division finder: commits the FIRST target
# that passes the similarity checks, cuts its region out of the source and
# recurses on what is left.
#
# NOTE: the matched entry is removed from +targets+ in place, so the caller's
# array shrinks.
#
# @param source [String] the text to be divided
# @param targets [Array<Hash>] each entry is read through +:divid+ and +:text+
# @return [Array<Array>] entries of the form <tt>[divid, [begin, end]]</tt>;
#   when no target fits, the single entry
#   <tt>[-1, [remaining_source, remaining_divids]]</tt>
def _find_divisions_old(source, targets)
	mode, m, c, offset_begin = nil, nil, nil, nil

	targets.each_with_index do |target, i|
		# str1 is always the shorter string and is looked for inside str2
		if source.size < target[:text].size
			mode = :t_in_s
			str1 = source
			str2 = target[:text]
		else
			mode = :s_in_t
			str1 = target[:text]
			str2 = source
		end

		len1 = str1.length
		len2 = str2.length

		# compare against the whole of str2 unless it is considerably longer than str1
		offset_begin, offset_end = 0, -1
		offset_begin, offset_end = approximate_fit(str1, str2) if (len2 - len1) > len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD)

		unless offset_begin.nil?
			c = TextAlignment::LCSComparison.new(str1, str2[offset_begin .. offset_end])
			if (c.similarity > TextAlignment::SIMILARITY_THRESHOLD) && ((len1 - (c.str1_match_final - c.str1_match_initial + 1)) < len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD))
				# mode, c and offset_begin keep the values of this matching target
				m = i
				break
			end
		end
	end

	# return remaining source and targets if m.nil?
	return [[-1, [source, targets.collect{|s| s[:divid]}]]] if m.nil?

	index = if mode == :t_in_s
		[targets[m][:divid], [0, source.size]]
	else # :s_in_t
		[targets[m][:divid], [c.str2_match_initial + offset_begin, c.str2_match_final + offset_begin + 1]]
	end

	next_source = source[0 ... index[1][0]] + source[index[1][1] .. -1]
	targets.delete_at(m)

	if next_source.strip.empty? || targets.empty?
		return [index]
	else
		# Recurse into this same variant: the entries are indexed below as
		# [divid, [begin, end]] arrays, which is the shape this method returns.
		more_index = _find_divisions_old(next_source, targets)
		# shift the regions found in the shortened source back by the length just cut out
		gap = index[1][1] - index[1][0]
		more_index.each do |i|
			if (i[0] > -1)
				i[1][0] += gap if i[1][0] >= index[1][0]
				i[1][1] += gap if i[1][1] >  index[1][0]
			end
		end
		return [index] + more_index
	end
end

.approximate_fit(str1, str2) ⇒ `Object`

It finds an approximate region of str2 that contains str1

Raises:

(ArgumentError)

# File 'lib/text_alignment/approximate_fit.rb', line 13

# Finds an approximate region of str2 that contains str1.
#
# N-grams shared by both strings that occur exactly once in str2 ("signature"
# n-grams) are used as anchors. For each anchor a candidate region of str2 is
# projected from the anchor's position in str1, widened by
# TextAlignment::BUFFER_RATE, and accepted when +text_similarity+ of str1 and
# that region exceeds TextAlignment::TEXT_SIMILARITY_THRESHOLD.
#
# @param str1 [String] the string to locate
# @param str2 [String] the string to search in
# @return [Array] +[fit_begin, fit_end]+ (the region that was compared is
#   <tt>str2[fit_begin ... fit_end]</tt>), +[0, str2.length]+ when str2 is
#   shorter than TextAlignment::MIN_LENGTH_FOR_APPROXIMATION, or +[nil, nil]+
#   when no region passes
# @raise [ArgumentError] if either string is nil
def approximate_fit(str1, str2)
	raise ArgumentError, 'nil string' if str1.nil? || str2.nil?
	return 0, str2.length if str2.length < TextAlignment::MIN_LENGTH_FOR_APPROXIMATION

	size_ngram = TextAlignment::SIZE_NGRAM
	ngram1 = (0 .. str1.length - size_ngram).collect{|i| str1[i, size_ngram]}
	ngram2 = (0 .. str2.length - size_ngram).collect{|i| str2[i, size_ngram]}
	ngram_shared = ngram1 & ngram2

	# If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
	return nil, nil if ngram_shared.empty?

	# count the n-grams of str2 in one pass instead of rescanning it for every shared n-gram
	occurrences = Hash.new(0)
	ngram2.each{|g| occurrences[g] += 1}
	signature_ngrams = ngram_shared.select{|g| occurrences[g] == 1}
	return nil, nil if signature_ngrams.empty? #raise "no signature ngram"

	tried = {}	# regions already compared; reaching one again means it was rejected
	signature_ngrams.each do |signature_ngram|
		loc_signature_ngram_in_str1 = str1.index(signature_ngram)
		loc_signature_ngram_in_str2 = str2.index(signature_ngram)

		# approximate the beginning of the fit
		fit_begin = loc_signature_ngram_in_str2 - loc_signature_ngram_in_str1 - (loc_signature_ngram_in_str1 * TextAlignment::BUFFER_RATE).to_i
		fit_begin = 0 if fit_begin < 0

		# approximate the end of the fit
		offset_end = str1.length - loc_signature_ngram_in_str1
		fit_end = loc_signature_ngram_in_str2 + offset_end + (offset_end * TextAlignment::BUFFER_RATE).to_i
		fit_end = str2.length if fit_end > str2.length

		# A region seen before has already failed the threshold. Skipping it must
		# not leave it behind as the result, so the result is only ever produced
		# by the acceptance branch below.
		region = [fit_begin, fit_end]
		next if tried.has_key?(region)
		tried[region] = true

		if text_similarity(str1, str2[fit_begin ... fit_end]) > TextAlignment::TEXT_SIMILARITY_THRESHOLD
			return fit_begin, fit_end if fit_begin < fit_end
			return nil, nil
		end
	end

	return nil, nil
end

.cdiff(str1, str2) ⇒ `Object`

Raises:

(ArgumentError)

# File 'lib/text_alignment/lcs_cdiff.rb', line 10

# Produces the character-level diff rendering of two strings by running
# Diff::LCS.sdiff on them and passing the result to +sdiff2cdiff+.
#
# @param str1 [String]
# @param str2 [String]
# @raise [ArgumentError] if either string is nil
# @raise [RuntimeError] if either string contains TextAlignment::NIL_CHARACTER
def cdiff(str1, str2)
	raise ArgumentError, "nil string" if [str1, str2].any?(&:nil?)

	nil_character = TextAlignment::NIL_CHARACTER
	if str1.include?(nil_character) || str2.include?(nil_character)
		raise "a nil character appears in the input string"
	end

	sdiff = Diff::LCS.sdiff(str1, str2)
	sdiff2cdiff(sdiff)
end

.find_divisions(source, targets, mappings = []) ⇒ `Object`

It finds, among the targets, the right divisions for the target text to fit in.

Raises:

(ArgumentError)

# File 'lib/text_alignment/find_divisions.rb', line 15

# Normalizes the texts with the single-character mappings, orders the targets
# by text length and delegates to TextAlignment._find_divisions.
#
# NOTE: the arguments are modified in place. +source+ and every target's
# +:text+ are rewritten with String#tr!, +targets+ is re-sorted, and the
# single-character pairs are removed from +mappings+.
#
# @param source [String] the text to be divided
# @param targets [Array<Hash>] each entry is read through +:text+ here
# @param mappings [Array<Array<String>>] +[from, to]+ pairs; only pairs whose
#   both sides are exactly one character long are applied by this method
# @return [Object] whatever TextAlignment._find_divisions returns
# @raise [ArgumentError] if source or mappings is nil, or targets is nil or empty
def find_divisions(source, targets, mappings = [])
	raise ArgumentError, "nil source"           if source.nil?
	raise ArgumentError, "nil or empty targets" if targets.nil? || targets.empty?
	raise ArgumentError, "nil mappings"         if mappings.nil?

	character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
	mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}

	# '\', '-' and '^' are syntax inside String#tr arguments; escape them on BOTH
	# sides so that every mapped character is taken literally and the two
	# character lists stay aligned one-to-one.
	escape_for_tr = lambda{|chars| chars.gsub(/[\\\-\^]/){|c| '\\' + c}}
	characters_from = escape_for_tr.call(character_mappings.collect{|m| m[0]}.join)
	characters_to   = escape_for_tr.call(character_mappings.collect{|m| m[1]}.join)

	source.tr!(characters_from, characters_to)
	targets.each{|target| target[:text].tr!(characters_from, characters_to)}

	# to process smaller ones first
	targets.sort!{|s1, s2| s1[:text].size <=> s2[:text].size}

	TextAlignment._find_divisions(source, targets)
end

.glcs_required?(str1, mappings = []) ⇒ `Boolean`

Returns:

(Boolean)

Raises:

(ArgumentError)

# File 'lib/text_alignment/glcs_required.rb', line 5

# Tells whether, after applying the single-character mappings, the string still
# contains two consecutive non-ASCII characters.
#
# NOTE: +str1+ is rewritten in place by String#tr!.
#
# @param str1 [String] the string to inspect
# @param mappings [Array<Array<String>>] +[from, to]+ pairs; only pairs whose
#   both sides are exactly one character long are applied
# @return [String, nil] the first pair of consecutive non-ASCII characters
#   (truthy) or nil when there is none
# @raise [ArgumentError] if str1 or mappings is nil
def glcs_required?(str1, mappings = [])
	raise ArgumentError, "nil string" if str1.nil?
	raise ArgumentError, "nil mappings" if mappings.nil?

	# character mappings can be safely applied to the strings without changing the position of other characters
	character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}

	# '\', '-' and '^' are syntax inside String#tr arguments; escape them on BOTH
	# sides so that every mapped character is taken literally and the two
	# character lists stay aligned one-to-one.
	escape_for_tr = lambda{|chars| chars.gsub(/[\\\-\^]/){|c| '\\' + c}}
	characters_from = escape_for_tr.call(character_mappings.collect{|m| m[0]}.join)
	characters_to   = escape_for_tr.call(character_mappings.collect{|m| m[1]}.join)

	str1.tr!(characters_from, characters_to)

	str1[/([^\p{ASCII}][^\p{ASCII}])/, 1]
end

.sdiff2cdiff(sdiff) ⇒ `Object`

Raises:

(ArgumentError)

# File 'lib/text_alignment/lcs_cdiff.rb', line 16

# Renders an sdiff as two aligned character rows separated by a
# ">>>>><<<<<" line. Each entry of +sdiff+ is read through +action+,
# +old_element+ and +new_element+; TextAlignment::NIL_CHARACTER fills the
# side that has no counterpart. Newlines inside the rows are shown as spaces.
#
# @param sdiff [Enumerable] the diff entries
# @return [String] old row, separator line, new row
# @raise [ArgumentError] if sdiff is nil
def sdiff2cdiff(sdiff)
	raise ArgumentError, "nil sdiff" if sdiff.nil?

	filler = TextAlignment::NIL_CHARACTER
	row_old = ''.dup
	row_new = ''.dup

	sdiff.each do |change|
		case change.action
		when '='	# unchanged
			row_old << change.old_element
			row_new << change.new_element
		when '!'	# replaced: the two elements are staggered against fillers
			row_old << (change.old_element + filler)
			row_new << (filler + change.new_element)
		when '-'	# only in the old sequence
			row_old << change.old_element
			row_new << filler
		when '+'	# only in the new sequence
			row_old << filler
			row_new << change.new_element
		end
	end

	[row_old, row_new].collect{|row| row.gsub(/\n/, ' ')}.join("\n>>>>><<<<<\n")
end

Module: TextAlignment

Defined Under Namespace

Constant Summary collapse

Class Method Summary collapse

Class Method Details

._find_divisions(_source, _targets) ⇒ Object

._find_divisions_old(source, targets) ⇒ Object

.approximate_fit(str1, str2) ⇒ Object

.cdiff(str1, str2) ⇒ Object

.find_divisions(source, targets, mappings = []) ⇒ Object

.glcs_required?(str1, mappings = []) ⇒ Boolean

.sdiff2cdiff(sdiff) ⇒ Object

._find_divisions(_source, _targets) ⇒ `Object`

._find_divisions_old(source, targets) ⇒ `Object`

.approximate_fit(str1, str2) ⇒ `Object`

.cdiff(str1, str2) ⇒ `Object`

.find_divisions(source, targets, mappings = []) ⇒ `Object`

.glcs_required?(str1, mappings = []) ⇒ `Boolean`

.sdiff2cdiff(sdiff) ⇒ `Object`