Module: TextAlignment

Defined in:: lib/text_alignment/constants.rb,
lib/text_alignment/lcs_cdiff.rb,
lib/text_alignment/char_mapping.rb,
lib/text_alignment/anchor_finder.rb,
lib/text_alignment/glcs_required.rb,
lib/text_alignment/find_divisions.rb,
lib/text_alignment/glcs_alignment.rb,
lib/text_alignment/lcs_comparison.rb,
lib/text_alignment/text_alignment.rb,
lib/text_alignment/approximate_fit.rb,
lib/text_alignment/cultivation_map.rb,
lib/text_alignment/mixed_alignment.rb,
lib/text_alignment/glcs_alignment_fast.rb

Defined Under Namespace

Classes: AnchorFinder, CharMapping, CultivationMap, GLCSAlignment, GLCSTextAlignment, LCSAlignment, LCSComparison, LCSMin, MixedAlignment, TextAlignment

Constant Summary collapse

SIZE_NGRAM =

SIZE_WINDOW =

BUFFER_RATE =

0.1

BUFFER_MIN =

TEXT_SIMILARITY_THRESHOLD =

0.9

NIL_CHARACTER =

'_'

CHAR_MAPPING =

[
  ["©", "(c)"],      #U+00A9 (Copyright Sign)

  ["α", "alpha"],    #U+03B1 (greek small letter alpha)
  ["β", "beta"],   #U+03B2 (greek small letter beta)
  ["γ", "gamma"],    #U+03B3 (greek small letter gamma)
  ["δ", "delta"],    #U+03B4 (greek small letter delta)
  ["ε", "epsilon"],  #U+03B5 (greek small letter epsilon)
  ["ζ", "zeta"],   #U+03B6 (greek small letter zeta)
  ["η", "eta"],      #U+03B7 (greek small letter eta)
  ["θ", "theta"],    #U+03B8 (greek small letter theta)
  ["ι", "iota"],   #U+03B9 (greek small letter iota)
  ["κ", "kappa"],    #U+03BA (greek small letter kappa)
  ["λ", "lambda"], #U+03BB (greek small letter lambda)
  ["λ", "lamda"],    #U+03BB (greek small letter lambda)
  ["μ", "mu"],     #U+03BC (greek small letter mu)
  ["ν", "nu"],     #U+03BD (greek small letter nu)
  ["ξ", "xi"],     #U+03BE (greek small letter xi)
  ["ο", "omicron"],  #U+03BF (greek small letter omicron)
  ["π", "pi"],     #U+03C0 (greek small letter pi)
  ["ρ", "rho"],      #U+03C1 (greek small letter rho)
  ["σ", "sigma"],    #U+03C3 (greek small letter sigma)
  ["τ", "tau"],      #U+03C4 (greek small letter tau)
  ["υ", "upsilon"],  #U+03C5 (greek small letter upsilon)
  ["φ", "phi"],      #U+03C6 (greek small letter phi)
  ["χ", "chi"],      #U+03C7 (greek small letter chi)
  ["ψ", "psi"],      #U+03C8 (greek small letter psi)
  ["ω", "omega"],    #U+03C9 (greek small letter omega)

  ["Α", "Alpha"],    #U+0391 (greek capital letter alpha)
  ["Β", "Beta"],   #U+0392 (greek capital letter beta)
  ["Γ", "Gamma"],    #U+0393 (greek capital letter gamma)
  ["Δ", "Delta"],    #U+0394 (greek capital letter delta)
  ["Ε", "Epsilon"],  #U+0395 (greek capital letter epsilon)
  ["Ζ", "Zeta"],   #U+0396 (greek capital letter zeta)
  ["Η", "Eta"],      #U+0397 (greek capital letter eta)
  ["Θ", "Theta"],    #U+0398 (greek capital letter theta)
  ["Ι", "Iota"],   #U+0399 (greek capital letter iota)
  ["Κ", "Kappa"],    #U+039A (greek capital letter kappa)
  ["Λ", "Lambda"], #U+039B (greek capital letter lambda)
  ["Λ", "Lamda"],    #U+039B (greek capital letter lambda)
  ["Μ", "Mu"],     #U+039C (greek capital letter mu)
  ["Ν", "Nu"],     #U+039D (greek capital letter nu)
  ["Ξ", "Xi"],     #U+039E (greek capital letter xi)
  ["Ο", "Omicron"],  #U+039F (greek capital letter omicron)
  ["Π", "Pi"],     #U+03A0 (greek capital letter pi)
  ["Ρ", "Rho"],      #U+03A1 (greek capital letter rho)
  ["Σ", "Sigma"],    #U+03A3 (greek capital letter sigma)
  ["Τ", "Tau"],      #U+03A4 (greek capital letter tau)
  ["Υ", "Upsilon"],  #U+03A5 (greek capital letter upsilon)
  ["Φ", "Phi"],      #U+03A6 (greek capital letter phi)
  ["Χ", "Chi"],      #U+03A7 (greek capital letter chi)
  ["Ψ", "Psi"],      #U+03A8 (greek capital letter Psi)
  ["Ω", "Omega"],    #U+03A9 (greek capital letter omega)

  ["ϕ", "phi"],      #U+03D5 (greek phi symbol)

  ["×", "x"],        #U+00D7 (multiplication sign)
  ["•", "*"],        #U+2022 (bullet)
  [" ", " "],        #U+2009 (thin space)
  [" ", " "],        #U+200A (hair space)
  [" ", " "],        #U+202F (narrow no-break space)
  [" ", " "],        #U+00A0 (Non-Breaking space)
  ["　", " "],        #U+3000 (ideographic space)
  ["‐", "-"],        #U+2010 (Hyphen)
  ["‑", "-"],        #U+2011 (Non-Breaking Hyphen)
  ["−", "-"],        #U+2212 (minus sign)
  ["–", "-"],        #U+2013 (en dash)
  ["′", "'"],        #U+2032 (prime)
  ["‘", "'"],        #U+2018 (left single quotation mark)
  ["’", "'"],        #U+2019 (right single quotation mark)
  ["“", '"'],        #U+201C (left double quotation mark)
  ["”", '"'],        #U+201D (right double quotation mark)
  ['"', "''"]
]

SIMILARITY_THRESHOLD = to work on the hash representation of denotations to assume that there is no bag representation to this method

0.7

MIN_LENGTH_FOR_APPROXIMATION = approximate the location of str1 in str2

Class Method Summary collapse

._find_divisions(_source, _targets) ⇒ Object
._find_divisions_old(source, targets) ⇒ Object
.approximate_fit(str1, str2) ⇒ Object

It finds an approximate region of str2 that contains str1.
.cdiff(str1, str2) ⇒ Object
.find_divisions(source, targets, mappings = []) ⇒ Object

It finds, among the targets, the right divisions for the target text to fit in.
.glcs_required?(str1, mappings = []) ⇒ Boolean
.sdiff2cdiff(sdiff) ⇒ Object

Class Method Details

._find_divisions(_source, _targets) ⇒ `Object`

# File 'lib/text_alignment/find_divisions.rb', line 35

# Greedily matches target texts against the source text.
#
# On every pass each remaining target is compared with what is left of the
# source. Among the targets that pass the similarity test, the most similar
# one is chosen, its region is cut out of the source, and the target is
# removed from the working list. The loop stops when the source is blank, no
# targets are left, or no target qualifies.
#
# Neither argument is modified: both are shallow-copied first.
#
# @param _source [String] the text to be divided
# @param _targets [Array<Hash>] hashes read through +:text+ and +:divid+
# @return [Array<Hash>] one <tt>{divid:, region: [begin, end]}</tt> per
#   matched target, optionally followed by a final <tt>{divid: nil}</tt>
#   entry carrying +:remaining_source+ and/or +:remaining_targets+ (divids)
def _find_divisions(_source, _targets)
  indice = []
  history = []
  cache = {}
  source = _source.dup
  targets = _targets.dup
  threshold = TextAlignment::SIMILARITY_THRESHOLD

  until source.strip.empty? || targets.empty?
    candidates = []

    targets.each_with_index do |target, i|
      # str1 is always the shorter text, str2 the longer one
      if source.size < target[:text].size
        mode = :t_in_s
        str1 = source
        str2 = target[:text]
      else
        mode = :s_in_t
        str1 = target[:text]
        str2 = source
      end

      len1 = str1.length
      len2 = str2.length

      offset_begin, offset_end =
        if (len2 - len1) > len1 * (1 - threshold)
          # str2 is much longer: narrow it down to a likely region first
          approximate_fit(str1, str2)
        else
          # compare against the whole of str2
          [0, -1]
        end

      # approximate_fit signals "no plausible region" with nil
      next if offset_begin.nil?

      fragment = str2[offset_begin..offset_end]
      key = str1 + ' _:_ ' + fragment
      cmp = (cache[key] ||= TextAlignment::LCSComparison.new(str1, fragment))

      # The match positions are only read once the similarity test passed.
      if cmp.similarity > threshold &&
         (len1 - (cmp.str1_match_final - cmp.str1_match_initial + 1)) < len1 * (1 - threshold)
        candidates << { idx: i, offset: offset_begin, mode: mode, cmp: cmp }
      end
    end

    # nothing fits any more: report what is left below
    break if candidates.empty?

    # Pick the most similar candidate. (The previous comparator compared a
    # candidate with itself, so the first candidate always won.)
    choice = candidates.max_by { |candidate| candidate[:cmp].similarity }
    m = choice[:idx]

    index =
      if choice[:mode] == :t_in_s
        # the remaining source as a whole is assigned to the target
        { divid: targets[m][:divid], region: [0, source.size] }
      else # :s_in_t
        cmp = choice[:cmp]
        offset = choice[:offset]
        { divid: targets[m][:divid],
          region: [cmp.str2_match_initial + offset, cmp.str2_match_final + offset + 1] }
      end

    # cut the matched region out of the working copy of the source
    source = source[0...index[:region][0]] + source[index[:region][1]..-1]
    history << index[:region].dup

    # The region was measured on the shortened source; widen it by the
    # lengths of the regions removed on earlier passes, latest first.
    history[0...-1].reverse_each do |h|
      gap = h[1] - h[0]
      index[:region][0] += gap if index[:region][0] >= h[0]
      index[:region][1] += gap if index[:region][1] >  h[0]
    end

    indice << index
    targets.delete_at(m)
  end

  unless source.strip.empty? && targets.empty?
    index = { divid: nil }
    index[:remaining_source] = source unless source.strip.empty?
    index[:remaining_targets] = targets.collect { |s| s[:divid] } unless targets.empty?
    indice << index
  end

  indice
end

._find_divisions_old(source, targets) ⇒ `Object`

# File 'lib/text_alignment/find_divisions.rb', line 124

# Older, recursive variant of the division finder.
#
# Takes the first target that passes the similarity test, cuts its region
# out of the source and recurses on the rest. Results use the array format
# <tt>[divid, [begin, end]]</tt>, not the Hash format of +_find_divisions+.
#
# NOTE: +targets+ is modified in place (matched targets are deleted).
# NOTE(review): divids are compared with -1 below, so they are presumably
# non-negative numbers - confirm against callers.
#
# @param source [String] the text to be divided
# @param targets [Array<Hash>] hashes read through +:text+ and +:divid+
# @return [Array<Array>] <tt>[divid, [begin, end]]</tt> entries; when no
#   target matches, the single entry
#   <tt>[-1, [source, remaining_divids]]</tt> is returned (or appended by
#   the recursion)
def _find_divisions_old(source, targets)
  mode, m, c, offset_begin = nil, nil, nil, nil

  targets.each_with_index do |target, i|
    # str1 is always the shorter text, str2 the longer one
    if source.size < target[:text].size
      mode = :t_in_s
      str1 = source
      str2 = target[:text]
    else
      mode = :s_in_t
      str1 = target[:text]
      str2 = source
    end

    len1 = str1.length
    len2 = str2.length

    # compare against the whole of str2 unless it is much longer than str1
    offset_begin, offset_end = 0, -1
    if (len2 - len1) > len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD)
      offset_begin, offset_end = approximate_fit(str1, str2)
    end

    # approximate_fit signals "no plausible region" with nil
    next if offset_begin.nil?

    c = TextAlignment::LCSComparison.new(str1, str2[offset_begin..offset_end])
    if (c.similarity > TextAlignment::SIMILARITY_THRESHOLD) &&
       ((len1 - (c.str1_match_final - c.str1_match_initial + 1)) < len1 * (1 - TextAlignment::SIMILARITY_THRESHOLD))
      # first acceptable target wins; mode, c and offset_begin stay bound to it
      m = i
      break
    end
  end

  # return remaining source and targets if nothing matched
  return [[-1, [source, targets.collect { |s| s[:divid] }]]] if m.nil?

  index =
    if mode == :t_in_s
      [targets[m][:divid], [0, source.size]]
    else # :s_in_t
      [targets[m][:divid], [c.str2_match_initial + offset_begin, c.str2_match_final + offset_begin + 1]]
    end

  next_source = source[0...index[1][0]] + source[index[1][1]..-1]
  targets.delete_at(m)

  return [index] if next_source.strip.empty? || targets.empty?

  # Recurse into this same method: the code below relies on the array
  # result format, which the Hash-returning _find_divisions does not produce.
  more_index = _find_divisions_old(next_source, targets)

  # Regions found in the shortened source are shifted back by the length of
  # the region removed here.
  gap = index[1][1] - index[1][0]
  more_index.each do |entry|
    next unless entry[0] > -1

    entry[1][0] += gap if entry[1][0] >= index[1][0]
    entry[1][1] += gap if entry[1][1] >  index[1][0]
  end

  [index] + more_index
end

.approximate_fit(str1, str2) ⇒ `Object`

It finds an approximate region of str2 that contains str1

Raises:

(ArgumentError)

# File 'lib/text_alignment/approximate_fit.rb', line 13

# Finds an approximate region of str2 that contains str1.
#
# N-grams shared by both strings that occur exactly once in str2
# ("signature" n-grams) are used as anchors. Each anchor yields a candidate
# region of str2, padded by BUFFER_RATE on both sides. The first candidate
# whose +text_similarity+ with str1 exceeds TEXT_SIMILARITY_THRESHOLD is
# returned.
#
# NOTE(review): +text_similarity+ is a helper defined outside this listing;
# it is assumed to return a number comparable with the threshold.
#
# @param str1 [String] the text to locate
# @param str2 [String] the text to search in
# @return [Array] +[fit_begin, fit_end]+ with an exclusive end offset;
#   +[0, str2.length]+ when str2 is shorter than
#   MIN_LENGTH_FOR_APPROXIMATION; +[nil, nil]+ when no region qualifies
# @raise [ArgumentError] if either string is nil
def approximate_fit(str1, str2)
  raise ArgumentError, 'nil string' if str1.nil? || str2.nil?
  return 0, str2.length if str2.length < TextAlignment::MIN_LENGTH_FOR_APPROXIMATION

  size = TextAlignment::SIZE_NGRAM
  ngram1 = (0..str1.length - size).collect { |i| str1[i, size] }
  ngram2 = (0..str2.length - size).collect { |i| str2[i, size] }
  ngram_shared = ngram1 & ngram2

  # If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
  return nil, nil if ngram_shared.empty?

  # Count the n-grams of str2 once instead of rescanning it per shared n-gram.
  occurrences = Hash.new(0)
  ngram2.each { |g| occurrences[g] += 1 }
  signature_ngrams = ngram_shared.select { |g| occurrences[g] == 1 }
  return nil, nil if signature_ngrams.empty?

  tried = {}
  signature_ngrams.each do |signature_ngram|
    loc_in_str1 = str1.index(signature_ngram)
    loc_in_str2 = str2.index(signature_ngram)

    # approximate the beginning of the fit
    fit_begin = loc_in_str2 - loc_in_str1 - (loc_in_str1 * TextAlignment::BUFFER_RATE).to_i
    fit_begin = 0 if fit_begin < 0

    # approximate the end of the fit
    offset_end = str1.length - loc_in_str1
    fit_end = loc_in_str2 + offset_end + (offset_end * TextAlignment::BUFFER_RATE).to_i
    fit_end = str2.length if fit_end > str2.length

    # A region that was already evaluated has already been rejected. The
    # candidate offsets are local to this iteration, so skipping it cannot
    # leak a rejected region into the result.
    region = [fit_begin, fit_end]
    next if tried.key?(region)
    tried[region] = true

    next unless text_similarity(str1, str2[fit_begin...fit_end]) > TextAlignment::TEXT_SIMILARITY_THRESHOLD

    # The first sufficiently similar region ends the search; a degenerate
    # (empty) region is reported as "no fit".
    return fit_begin, fit_end if fit_begin < fit_end
    return nil, nil
  end

  return nil, nil
end

.cdiff(str1, str2) ⇒ `Object`

Raises:

(ArgumentError)

# File 'lib/text_alignment/lcs_cdiff.rb', line 10

# Builds a character-level diff rendering of two strings.
#
# The strings are diffed with Diff::LCS.sdiff and the result is handed to
# +sdiff2cdiff+. Inputs must not contain TextAlignment::NIL_CHARACTER,
# which is rejected here before diffing.
#
# @param str1 [String] the first string
# @param str2 [String] the second string
# @return [Object] whatever +sdiff2cdiff+ returns for the computed sdiff
# @raise [ArgumentError] if either string is nil
# @raise [RuntimeError] if either string contains the nil character
def cdiff(str1, str2)
  raise ArgumentError, "nil string" if str1.nil? || str2.nil?

  nil_character = TextAlignment::NIL_CHARACTER
  if [str1, str2].any? { |text| text.include?(nil_character) }
    raise "a nil character appears in the input string"
  end

  sdiff = Diff::LCS.sdiff(str1, str2)
  sdiff2cdiff(sdiff)
end

.find_divisions(source, targets, mappings = []) ⇒ `Object`

It finds, among the targets, the right divisions for the target text to fit in.

Raises:

(ArgumentError)

# File 'lib/text_alignment/find_divisions.rb', line 15

def find_divisions(source, targets, mappings = [])
  raise ArgumentError, "nil source"           if source == nil
  raise ArgumentError, "nil or empty targets" if targets == nil || targets.empty?
  raise ArgumentError, "nil mappings"         if mappings == nil

  character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
  mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
  characters_from = character_mappings.collect{|m| m[0]}.join
  characters_to   = character_mappings.collect{|m| m[1]}.join
  characters_to.gsub!(/-/, '\-')

  source.tr!(characters_from, characters_to)
  targets.each{|target| target[:text].tr!(characters_from, characters_to)}

  # to process smaller ones first
  targets.sort!{|s1, s2| s1[:text].size <=> s2[:text].size}

  TextAlignment._find_divisions(source, targets)
end

.glcs_required?(str1, mappings = []) ⇒ `Boolean`

Returns:

(Boolean)

Raises:

(ArgumentError)

# File 'lib/text_alignment/glcs_required.rb', line 5

# Tells whether str1 still contains two consecutive non-ASCII characters
# after the single-character mappings have been applied.
#
# The mappings are applied to a copy, so str1 itself is left untouched and
# frozen strings are accepted.
#
# @param str1 [String] the text to inspect
# @param mappings [Array<Array>] +[from, to]+ string pairs; only pairs whose
#   both sides are one character long are used
# @return [String, nil] the first pair of adjacent non-ASCII characters
#   (truthy), or nil when there is none
# @raise [ArgumentError] if str1 or mappings is nil
def glcs_required?(str1, mappings = [])
  raise ArgumentError, "nil string" if str1.nil?
  raise ArgumentError, "nil mappings" if mappings.nil?

  # character mappings can be safely applied to the strings without changing the position of other characters
  character_mappings = mappings.select { |m| m[0].length == 1 && m[1].length == 1 }
  characters_from = character_mappings.collect { |m| m[0] }.join
  characters_to   = character_mappings.collect { |m| m[1] }.join
  # a bare hyphen would be read by tr as a range operator
  characters_to.gsub!(/-/, '\-')

  # non-destructive tr: a predicate must not rewrite the caller's string
  normalized = str1.tr(characters_from, characters_to)

  normalized[/([^\p{ASCII}][^\p{ASCII}])/, 1]
end

.sdiff2cdiff(sdiff) ⇒ `Object`

Raises:

(ArgumentError)

# File 'lib/text_alignment/lcs_cdiff.rb', line 16

# Renders an sdiff as two aligned lines of text.
#
# Each change contributes to an upper line (old side) and a lower line (new
# side); TextAlignment::NIL_CHARACTER fills the side that has no counterpart.
# Newlines inside either line are shown as spaces, and the two lines are
# joined by a ">>>>><<<<<" separator line.
#
# @param sdiff [Enumerable] changes responding to +action+, +old_element+
#   and +new_element+
# @return [String] the upper line, the separator, and the lower line
# @raise [ArgumentError] if sdiff is nil
def sdiff2cdiff(sdiff)
  raise ArgumentError, "nil sdiff" if sdiff.nil?

  upper = ''.dup
  lower = ''.dup

  sdiff.each do |change|
    case change.action
    when '='
      # unchanged: both sides carry their element
      upper << change.old_element
      lower << change.new_element
    when '!'
      # replaced: the two elements are staggered around nil characters
      upper << change.old_element << TextAlignment::NIL_CHARACTER
      lower << TextAlignment::NIL_CHARACTER << change.new_element
    when '-'
      # deleted: only the old side has an element
      upper << change.old_element
      lower << TextAlignment::NIL_CHARACTER
    when '+'
      # inserted: only the new side has an element
      upper << TextAlignment::NIL_CHARACTER
      lower << change.new_element
    end
  end

  [upper, lower].map { |line| line.gsub(/\n/, ' ') }.join("\n>>>>><<<<<\n")
end

Module: TextAlignment

Defined Under Namespace

Constant Summary collapse

Class Method Summary collapse

Class Method Details

._find_divisions(_source, _targets) ⇒ Object

._find_divisions_old(source, targets) ⇒ Object

.approximate_fit(str1, str2) ⇒ Object

.cdiff(str1, str2) ⇒ Object

.find_divisions(source, targets, mappings = []) ⇒ Object

.glcs_required?(str1, mappings = []) ⇒ Boolean

.sdiff2cdiff(sdiff) ⇒ Object

._find_divisions(_source, _targets) ⇒ `Object`

._find_divisions_old(source, targets) ⇒ `Object`

.approximate_fit(str1, str2) ⇒ `Object`

.cdiff(str1, str2) ⇒ `Object`

.find_divisions(source, targets, mappings = []) ⇒ `Object`

.glcs_required?(str1, mappings = []) ⇒ `Boolean`

.sdiff2cdiff(sdiff) ⇒ `Object`