Module: PROIEL::Alignment::Builder

Defined in:
lib/proiel/alignment/builder.rb

Class Method Summary collapse

Class Method Details

.compute_matrix(alignment, source, blacklist = [], log_directory = nil) ⇒ Object

This computes a matrix of original and translation sentences that are aligned. For now, this function does not handle translation sentences that are unaligned (this is tricky to handle robustly!). As the current treebank collection stands, this is an issue that should not arise, so for now this is a reasonable approximation.



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# File 'lib/proiel/alignment/builder.rb', line 9

# Computes a matrix of original and translation sentences that are aligned.
#
# Two intermediate matrices are built, one by +group_backwards+ and one by
# +group_forwards+, and then merged into a single matrix. Each row is a hash
# with an +:original+ and a +:translation+ entry. Sanity checks before and
# after the merge verify that the sentence IDs of both texts still appear
# completely and in their original order.
#
# For now, this function does not handle translation sentences that are
# unaligned (this is tricky to handle robustly!).
#
# @param alignment [#sentences] text whose sentence IDs must make up the
#   +:original+ column
# @param source [#sentences, #id] text whose sentence IDs must make up the
#   +:translation+ column; its +id+ is used as prefix for log file names
# @param blacklist [Array] passed on unchanged to +group_backwards+ and
#   +group_forwards+
# @param log_directory [String, nil] when given, the two intermediate
#   matrices and the merged matrix are written to the files
#   "<source.id>1", "<source.id>2" and "<source.id>3" in this directory
# @return [Array<Hash>] the merged matrix
# @raise [RuntimeError] if a sanity check fails or the two intermediate
#   matrices cannot be reconciled
def self.compute_matrix(alignment, source, blacklist = [], log_directory = nil)
  # Flattened, nil-free list of the IDs stored under +column+ in +rows+.
  ids_in = ->(rows, column) { rows.map { |row| row[column] }.flatten.compact }

  # Writes one inspected row per line to "<source.id><suffix>" if logging
  # has been requested; does nothing otherwise.
  dump = lambda do |rows, suffix|
    if log_directory
      File.open(File.join(log_directory, "#{source.id}#{suffix}"), 'w') do |f|
        rows.each { |row| f.puts row.inspect }
      end
    end
  end

  matrix1 = group_backwards(alignment, source, blacklist)
  original_ids = alignment.sentences.map(&:id)
  unless ids_in.call(matrix1, :original) == original_ids
    raise 'backward grouping does not reproduce the sequence of original sentence IDs'
  end

  matrix2 = group_forwards(alignment, source, blacklist)
  translation_ids = source.sentences.map(&:id)
  unless ids_in.call(matrix2, :translation) == translation_ids
    raise 'forward grouping does not reproduce the sequence of translation sentence IDs'
  end

  # Verify that both texts are still in the correct sequence
  dump.call(matrix1, 1)
  dump.call(matrix2, 2)

  # Pairs of integers handed to +repair+, in the order they are tried:
  # [0, 0] first, then for each bound n up to the maximum
  # [n, 0], [0, n], [n, 1], [1, n], ... and finally [n, n].
  max_lookahead = 4
  repair_offsets = (0..max_lookahead).flat_map do |n|
    (0...n).flat_map { |k| [[n, k], [k, n]] } << [n, n]
  end

  matrix = []
  # The cursors are hashes because they are handed to +repair+ together with
  # the matrix under construction.
  iter1 = { i: 0, m: matrix1 }
  iter2 = { i: 0, m: matrix2 }

  loop do
    # Take from matrix1 unless we have a translation
    while iter1[:i] < iter1[:m].length && iter1[:m][iter1[:i]][:translation].empty?
      matrix << iter1[:m][iter1[:i]]
      iter1[:i] += 1
    end

    # Take from matrix2 unless we have an original
    while iter2[:i] < iter2[:m].length && iter2[:m][iter2[:i]][:original].empty?
      matrix << iter2[:m][iter2[:i]]
      iter2[:i] += 1
    end

    if iter1[:i] < iter1[:m].length && iter2[:i] < iter2[:m].length
      # Now the two should match provided alignments are sorted the same way,
      # so take one from each. If they don't match outright, we may have a case
      # of swapped sentence orders or a gap (one sentence unaligned in one of
      # the texts surrounded by two sentences that are aligned to the same
      # sentence in the other text). We'll try to repair this by merging bits
      # from the next row in various combinations.
      #
      # When adding to the new matrix, pick original from matrix1 and
      # translation from matrix2 so that the original textual order is
      # preserved
      next if repair_offsets.any? { |d1, d2| repair(matrix, iter1, d1, iter2, d2) }

      STDERR.puts iter1[:i], iter1[:m][iter1[:i]].inspect
      STDERR.puts iter2[:i], iter2[:m][iter2[:i]].inspect
      raise "unable to reconcile row #{iter1[:i]} of the backward matrix " \
            "with row #{iter2[:i]} of the forward matrix"
    end

    # One matrix is exhausted; the other one must be exhausted as well.
    unless iter1[:i] == iter1[:m].length && iter2[:i] == iter2[:m].length
      raise 'one matrix was exhausted while the other still has aligned rows ' \
            "(backward matrix at row #{iter1[:i]} of #{iter1[:m].length}, " \
            "forward matrix at row #{iter2[:i]} of #{iter2[:m].length})"
    end

    break
  end

  dump.call(matrix, 3)

  unless ids_in.call(matrix, :original) == original_ids
    raise 'merged matrix does not reproduce the sequence of original sentence IDs'
  end
  unless ids_in.call(matrix, :translation) == translation_ids
    raise 'merged matrix does not reproduce the sequence of translation sentence IDs'
  end

  matrix
end