Module: TextAlignment
- Defined in:
- lib/text_alignment/constants.rb,
lib/text_alignment/lcs_cdiff.rb,
lib/text_alignment/char_mapping.rb,
lib/text_alignment/anchor_finder.rb,
lib/text_alignment/glcs_required.rb,
lib/text_alignment/find_divisions.rb,
lib/text_alignment/glcs_alignment.rb,
lib/text_alignment/lcs_comparison.rb,
lib/text_alignment/text_alignment.rb,
lib/text_alignment/approximate_fit.rb,
lib/text_alignment/cultivation_map.rb,
lib/text_alignment/mixed_alignment.rb,
lib/text_alignment/glcs_alignment_fast.rb
Defined Under Namespace
Classes: AnchorFinder, CharMapping, CultivationMap, GLCSAlignment, GLCSTextAlignment, LCSAlignment, LCSComparison, LCSMin, MixedAlignment, TextAlignment
Constant Summary collapse
- SIZE_NGRAM =
8
- SIZE_WINDOW =
10
- BUFFER_RATE =
0.1
- BUFFER_MIN =
20
- TEXT_SIMILARITY_THRESHOLD =
0.9
- NIL_CHARACTER =
'_'
- CHAR_MAPPING =
[ ["©", "(c)"], #U+00A9 (Copyright Sign) ["α", "alpha"], #U+03B1 (greek small letter alpha) ["β", "beta"], #U+03B2 (greek small letter beta) ["γ", "gamma"], #U+03B3 (greek small letter gamma) ["δ", "delta"], #U+03B4 (greek small letter delta) ["ε", "epsilon"], #U+03B5 (greek small letter epsilon) ["ζ", "zeta"], #U+03B6 (greek small letter zeta) ["η", "eta"], #U+03B7 (greek small letter eta) ["θ", "theta"], #U+03B7 (greek small letter eta) ["ι", "iota"], #U+03B7 (greek small letter eta) ["κ", "kappa"], #U+03BA (greek small letter kappa) ["λ", "lambda"], #U+03BB (greek small letter lambda) ["λ", "lamda"], #U+03BB (greek small letter lambda) ["μ", "mu"], #U+03BC (greek small letter mu) ["ν", "nu"], #U+03BD (greek small letter nu) ["ξ", "xi"], #U+03BE (greek small letter xi) ["ο", "omicron"], #U+03BF (greek small letter omicron) ["π", "pi"], #U+03C0 (greek small letter pi) ["ρ", "rho"], #U+03C1 (greek small letter rho) ["σ", "sigma"], #U+03C3 (greek small letter sigma) ["τ", "tau"], #U+03C4 (greek small letter tau) ["υ", "upsilon"], #U+03C5 (greek small letter upsilon) ["φ", "phi"], #U+03C6 (greek small letter phi) ["χ", "chi"], #U+03C7 (greek small letter chi) ["ψ", "psi"], #U+03C8 (greek small letter psi) ["ω", "omega"], #U+03C9 (greek small letter omega) ["Α", "Alpha"], #U+0391 (greek capital letter alpha) ["Β", "Beta"], #U+0392 (greek capital letter beta) ["Γ", "Gamma"], #U+0393 (greek capital letter gamma) ["Δ", "Delta"], #U+0394 (greek capital letter delta) ["Ε", "Epsilon"], #U+0395 (greek capital letter epsilon) ["Ζ", "Zeta"], #U+0396 (greek capital letter zeta) ["Η", "Eta"], #U+0397 (greek capital letter eta) ["Θ", "Theta"], #U+0398 (greek capital letter theta) ["Ι", "Iota"], #U+0399 (greek capital letter iota) ["Κ", "Kappa"], #U+039A (greek capital letter kappa) ["Λ", "Lambda"], #U+039B (greek capital letter lambda) ["Λ", "Lamda"], #U+039B (greek capital letter lambda) ["Μ", "Mu"], #U+039C (greek capital letter mu) ["Ν", "Nu"], #U+039D (greek capital letter nu) ["Ξ", "Xi"], #U+039E (greek capital letter xi) ["Ο", "Omicron"], #U+039F (greek capital letter omicron) ["Π", "Pi"], #U+03A0 (greek capital letter pi) ["Ρ", "Rho"], #U+03A1 (greek capital letter rho) ["Σ", "Sigma"], #U+03A3 (greek capital letter sigma) ["Τ", "Tau"], #U+03A4 (greek capital letter tau) ["Υ", "Upsilon"], #U+03A5 (greek capital letter upsilon) ["Φ", "Phi"], #U+03A6 (greek capital letter phi) ["Χ", "Chi"], #U+03A7 (greek capital letter chi) ["Ψ", "Psi"], #U+03A8 (greek capital letter Psi) ["Ω", "Omega"], #U+03A9 (greek capital letter omega) ["ϕ", "phi"], #U+03D5 (greek phi symbol) ["×", "x"], #U+00D7 (multiplication sign) ["•", "*"], #U+2022 (bullet) [" ", " "], #U+2009 (thin space) [" ", " "], #U+200A (hair space) [" ", " "], #U+202F (narrow no-break space) [" ", " "], #U+00A0 (Non-Breaking space) [" ", " "], #U+3000 (ideographic space) ["‐", "-"], #U+2010 (Hyphen) ["‑", "-"], #U+2011 (Non-Breaking Hyphen) ["−", "-"], #U+2212 (minus sign) ["–", "-"], #U+2013 (en dash) ["′", "'"], #U+2032 (prime) ["‘", "'"], #U+2018 (left single quotation mark) ["’", "'"], #U+2019 (right single quotation mark) ["“", '"'], #U+201C (left double quotation mark) ["”", '"'], #U+201D (right double quotation mark) ['"', "''"] ]
- SIMILARITY_THRESHOLD =
to work on the hash representation of denotations to assume that there is no bag representation to this method
0.7
- MIN_LENGTH_FOR_APPROXIMATION =
approximate the location of str1 in str2
50
Class Method Summary collapse
- ._find_divisions(_source, _targets) ⇒ Object
- ._find_divisions_old(source, targets) ⇒ Object
-
.approximate_fit(str1, str2) ⇒ Object
If finds an approximate region of str2 that contains str1.
- .cdiff(str1, str2) ⇒ Object
-
.find_divisions(source, targets, mappings = []) ⇒ Object
It finds, among the targets, the right divisions for the taraget text to fit in.
- .glcs_required?(str1, mappings = []) ⇒ Boolean
- .sdiff2cdiff(sdiff) ⇒ Object
Class Method Details
._find_divisions(_source, _targets) ⇒ Object
35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
# File 'lib/text_alignment/find_divisions.rb', line 35 def _find_divisions(_source, _targets) indice = [] history = [] cache = {} source = _source.dup targets = _targets.dup until source.strip.empty? || targets.empty? mode, cmp = nil, nil candidates = [] targets.each_with_index do |target, i| if source.size < target[:text].size mode = :t_in_s str1 = source str2 = target[:text] else mode = :s_in_t str1 = target[:text] str2 = source end len1 = str1.length len2 = str2.length offset_begin, offset_end = if (len2 - len1) > len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD) approximate_fit(str1, str2) else # the whole source [0, -1] end unless offset_begin.nil? key = str1 + ' _:_ ' + str2[offset_begin .. offset_end] cmp = if cache.has_key? key cache[key] else cmp = TextAlignment::LCSComparison.new(str1, str2[offset_begin .. offset_end]) end cache[key] = cmp if (cmp.similarity > TextAlignment::SIMILARITY_THRESHOLD) && ((len1 - (cmp.str1_match_final - cmp.str1_match_initial + 1)) < len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD)) candidates << {idx:i, offset:offset_begin, mode:mode, cmp:cmp} end end end # return remaining source and targets if m.nil? break if candidates.empty? choice = candidates.max{|a, b| a[:cmp].similarity <=> a[:cmp].similarity} m = choice[:idx] mode = choice[:mode] index = if mode == :t_in_s {divid:targets[m][:divid], region:[0, source.size]} else # :s_in_t cmp = choice[:cmp] offset = choice[:offset] {divid:targets[m][:divid], region:[cmp.str2_match_initial + offset, cmp.str2_match_final + offset + 1]} end source = source[0 ... index[:region][0]] + source[index[:region][1] .. -1] history << index[:region].dup before_begin = index[:region][0] before_end = index[:region][1] rhistory = history.reverse rhistory.shift rhistory.each do |h| gap = h[1] - h[0] index[:region][0] += gap if index[:region][0] >= h[0] index[:region][1] += gap if index[:region][1] > h[0] end indice << index targets.delete_at(m) end unless source.strip.empty? && targets.empty? index = {divid:nil} index[:remaining_source] = source unless source.strip.empty? index[:remaining_targets] = targets.collect{|s| s[:divid]} unless targets.empty? indice << index end indice end |
._find_divisions_old(source, targets) ⇒ Object
124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 |
# File 'lib/text_alignment/find_divisions.rb', line 124 def _find_divisions_old(source, targets) mode, m, c, offset_begin = nil, nil, nil, nil targets.each_with_index do |target, i| if source.size < target[:text].size mode = :t_in_s str1 = source str2 = target[:text] else mode = :s_in_t str1 = target[:text] str2 = source end len1 = str1.length len2 = str2.length offset_begin, offset_end = 0, -1 offset_begin, offset_end = approximate_fit(str1, str2) if (len2 - len1) > len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD) unless offset_begin.nil? c = TextAlignment::LCSComparison.new(str1, str2[offset_begin .. offset_end]) if (c.similarity > TextAlignment::SIMILARITY_THRESHOLD) && ((len1 - (c.str1_match_final - c.str1_match_initial + 1)) < len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD)) m = i break end end end # return remaining source and targets if m.nil? return [[-1, [source, targets.collect{|s| s[:divid]}]]] if m.nil? index = if mode == :t_in_s [targets[m][:divid], [0, source.size]] else # :s_in_t [targets[m][:divid], [c.str2_match_initial + offset_begin, c.str2_match_final + offset_begin + 1]] end next_source = source[0 ... index[1][0]] + source[index[1][1] .. -1] targets.delete_at(m) if next_source.strip.empty? || targets.empty? return [index] else more_index = _find_divisions(next_source, targets) gap = index[1][1] - index[1][0] more_index.each do |i| if (i[0] > -1) i[1][0] += gap if i[1][0] >= index[1][0] i[1][1] += gap if i[1][1] > index[1][0] end end return [index] + more_index end end |
.approximate_fit(str1, str2) ⇒ Object
If finds an approximate region of str2 that contains str1
13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
# File 'lib/text_alignment/approximate_fit.rb', line 13 def approximate_fit(str1, str2) raise ArgumentError, 'nil string' if str1.nil? || str2.nil? return 0, str2.length if str2.length < TextAlignment::MIN_LENGTH_FOR_APPROXIMATION ngram1 = (0 .. str1.length - TextAlignment::SIZE_NGRAM).collect{|i| str1[i, TextAlignment::SIZE_NGRAM]} ngram2 = (0 .. str2.length - TextAlignment::SIZE_NGRAM).collect{|i| str2[i, TextAlignment::SIZE_NGRAM]} ngram_shared = ngram1 & ngram2 # If there is no shared n-gram found, it may mean there is no serious overlap between the two strings return nil, nil if ngram_shared.empty? signature_ngrams = ngram_shared.select{|g| ngram2.count(g) == 1} return nil, nil if signature_ngrams.empty? #raise "no signature ngram" cache = {} fit_begin, fit_end = nil, nil signature_ngrams.each do |signature_ngram| loc_signature_ngram_in_str1 = str1.index(signature_ngram) loc_signature_ngram_in_str2 = str2.index(signature_ngram) # approximate the beginning of the fit fit_begin = loc_signature_ngram_in_str2 - loc_signature_ngram_in_str1 - (loc_signature_ngram_in_str1 * TextAlignment::BUFFER_RATE).to_i fit_begin = 0 if fit_begin < 0 # approximate the end of the fit offset_end = str1.length - loc_signature_ngram_in_str1 fit_end = loc_signature_ngram_in_str2 + offset_end + (offset_end * TextAlignment::BUFFER_RATE).to_i fit_end = str2.length if fit_end > str2.length next if cache.has_key?("#{fit_begin}-#{fit_end}") text_similarity = text_similarity(str1, str2[fit_begin ... fit_end]) cache["#{fit_begin}-#{fit_end}"] = text_similarity break if text_similarity > TextAlignment::TEXT_SIMILARITY_THRESHOLD fit_begin, fit_end = nil, nil end return fit_begin, fit_end if fit_begin && fit_end && fit_begin < fit_end return nil, nil end |
.cdiff(str1, str2) ⇒ Object
10 11 12 13 14 |
# File 'lib/text_alignment/lcs_cdiff.rb', line 10 def cdiff(str1, str2) raise ArgumentError, "nil string" if str1.nil? || str2.nil? raise "a nil character appears in the input string" if str1.index(TextAlignment::NIL_CHARACTER) || str2.index(TextAlignment::NIL_CHARACTER) sdiff2cdiff(Diff::LCS.sdiff(str1, str2)) end |
.find_divisions(source, targets, mappings = []) ⇒ Object
It finds, among the targets, the right divisions for the taraget text to fit in.
15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 |
# File 'lib/text_alignment/find_divisions.rb', line 15 def find_divisions(source, targets, mappings = []) raise ArgumentError, "nil source" if source == nil raise ArgumentError, "nil or empty targets" if targets == nil || targets.empty? raise ArgumentError, "nil mappings" if mappings == nil character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1} mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1} characters_from = character_mappings.collect{|m| m[0]}.join characters_to = character_mappings.collect{|m| m[1]}.join characters_to.gsub!(/-/, '\-') source.tr!(characters_from, characters_to) targets.each{|target| target[:text].tr!(characters_from, characters_to)} # to process smaller ones first targets.sort!{|s1, s2| s1[:text].size <=> s2[:text].size} TextAlignment._find_divisions(source, targets) end |
.glcs_required?(str1, mappings = []) ⇒ Boolean
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 |
# File 'lib/text_alignment/glcs_required.rb', line 5 def glcs_required?(str1, mappings = []) raise ArgumentError, "nil string" if str1.nil? raise ArgumentError, "nil mappings" if mappings.nil? # character mappings can be safely applied to the strings withoug changing the position of other characters character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1} characters_from = character_mappings.collect{|m| m[0]}.join characters_to = character_mappings.collect{|m| m[1]}.join characters_to.gsub!(/-/, '\-') str1.tr!(characters_from, characters_to) str1 =~/([^\p{ASCII}][^\p{ASCII}])/ $1 end |
.sdiff2cdiff(sdiff) ⇒ Object
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
# File 'lib/text_alignment/lcs_cdiff.rb', line 16 def sdiff2cdiff (sdiff) raise ArgumentError, "nil sdiff" if sdiff.nil? cdiff_str1, cdiff_str2 = '', '' sdiff.each do |h| case h.action when '=' cdiff_str1 += h.old_element cdiff_str2 += h.new_element when '!' cdiff_str1 += h.old_element + TextAlignment::NIL_CHARACTER cdiff_str2 += TextAlignment::NIL_CHARACTER + h.new_element when '-' cdiff_str1 += h.old_element cdiff_str2 += TextAlignment::NIL_CHARACTER when '+' cdiff_str1 += TextAlignment::NIL_CHARACTER cdiff_str2 += h.new_element end end cdiff_str1.gsub(/\n/, ' ') + "\n>>>>><<<<<\n" + cdiff_str2.gsub(/\n/, ' ') end |