Class: EncodingSampler::Sampler

Inherits:
Object
  • Object
show all
Defined in:
lib/encoding_sampler/sampler.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Instance Attribute Details

#filename ⇒ String (readonly)

Full name of the target file used to create the sample.

Returns:

  • (String)


9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# File 'lib/encoding_sampler/sampler.rb', line 9

# Samples a file to work out which candidate encodings can decode it and how
# their decodings differ.
#
# On construction the whole file is read once in binary mode. An encoding that
# fails to decode any line is discarded. For every pair of remaining encodings,
# the first line on which the two decode differently is kept as a sample.
# Encodings that never decode differently are grouped together.
class Sampler

  # Full name of the target file used to create the sample.
  # @return [String]
  attr_reader :filename

  # Groups of valid encoding names, such that the encodings in a group all result in the same decoding for the target file.
  # @example When ISO-8859-1 and ISO-8859-2 decode the target file in exactly the same way, but differently from ISO-8859-15
  #   [["ISO-8859-1", "ISO-8859-2"], ["ISO-8859-15"]]
  # @return [Array<Array<String>>]
  attr_reader :unique_valid_encoding_groups

  # Reads the target file and determines the valid encodings, their groups and
  # the sample lines that distinguish them.
  # @param file_name [String] path of the file to sample
  # @param encodings [Array<String>] names of the candidate encodings; the array is not modified
  # @param diff_options [Hash] stored and handed to DiffCallbacks when samples are diffed
  def initialize(file_name, encodings, diff_options = {})
    @diff_options = diff_options
    @filename = file_name.dup.freeze
    @binary_samples = {}

    # Work on a copy so the caller's array (which may be frozen) is never changed.
    remaining = encodings.dup

    # Maps each pair of encoding names to the number of the first line that the
    # two decode differently, or nil while no such line has been seen.
    solutions = {}
    remaining.sort.combination(2).each { |pair| solutions[pair] = nil }

    # Read the entire file to verify encodings and collect samples for comparison of encodings.
    File.open(@filename, 'rb') do |file|
      file.each_line do |raw_line|
        binary_line = raw_line.strip
        lineno = file.lineno
        decoded_lines = multi_decode_binary_string(binary_line, remaining)

        # Eliminate any newly-invalid encodings from the scope.
        invalid = decoded_lines.select { |_encoding, decoded_line| decoded_line.nil? }.keys
        unless invalid.empty?
          remaining -= invalid
          solutions.delete_if { |pair, _solved_at| !(pair & invalid).empty? }
          # Samples are keyed by line number, so keep exactly those lines that
          # a surviving pair still refers to.
          needed_lines = solutions.values
          @binary_samples.keep_if { |sample_lineno, _string| needed_lines.include?(sample_lineno) }
        end

        # Record this line as a sample when it decodes differently for any two
        # previously-undifferentiated encodings.
        solutions.select { |_pair, solved_at| solved_at.nil? }.keys.each do |unsolved_pair|
          next if decoded_lines[unsolved_pair[0]] == decoded_lines[unsolved_pair[1]]

          solutions[unsolved_pair] = lineno
          @binary_samples[lineno] = binary_line
        end
      end
    end

    # Group undifferentiated encodings: pairs that were never told apart are
    # merged first, then every remaining encoding is added (as its own group
    # unless it already belongs to one).
    unsolved_pairs = solutions.select { |_pair, solved_at| solved_at.nil? }.keys
    groups = []
    (unsolved_pairs + remaining.map { |encoding| [encoding] }).each do |subgroup|
      group_index = groups.index { |group| !(group & subgroup).empty? }
      if group_index
        groups[group_index] |= subgroup
      else
        groups << subgroup
      end
    end

    @unique_valid_encoding_groups = groups.each(&:freeze).freeze
    @binary_samples.freeze
  end

  # Attribute renamed for clarity.
  # @deprecated Use {#unique_valid_encoding_groups} instead.
  def unique_valid_encodings
    unique_valid_encoding_groups
  end

  # All valid encodings.
  # @return [Array] Names of encodings that return valid results for the entire file.
  def valid_encodings
    unique_valid_encoding_groups.flatten
  end

  # Sample file lines, decoded by _encoding_.
  # @param encoding [String] name of the encoding to decode with
  # @return [Array] one decoded string per sample line (nil where the line is invalid in _encoding_)
  def sample(encoding)
    @binary_samples.each_value.map { |line| decode_binary_string(line, encoding) }
  end

  # Returns a hash of samples, keyed by encoding.
  # @param encodings [Array<String>] encodings to include; defaults to all valid encodings
  # @return [Hash]
  def samples(encodings = valid_encodings)
    encodings.each_with_object({}) { |encoding, hash| hash[encoding] = sample(encoding) }
  end

  # Returns all the "best" encodings. Assumes shortest strings are most likely to be correct.
  # Only the first encoding of each group is considered.
  # @return [Array]
  def best_encodings
    lengths = unique_samples.each_with_object({}) do |(encoding, lines), hash|
      hash[encoding] = lines.join('').size
    end
    min_length = lengths.values.min
    lengths.select { |_encoding, length| length == min_length }.keys
  end

  # Multiple encodings often return the exact same decoded sample.
  # Return only unique samples, keyed on the first encoding of each group in
  # {#unique_valid_encoding_groups}.
  # @return [Hash]
  def unique_samples
    samples(unique_valid_encoding_groups.map(&:first))
  end

  # Decoded sample, diffed against __all__ of the samples, and marked up to show differences.
  # @param [String] encoding
  # @return [Array] one marked-up string per sample line; nil when _encoding_ is not a valid encoding
  def diffed_sample(encoding)
    diffed_encoded_samples[encoding]
  end

  # Returns a hash of diffed samples, keyed by encoding.
  # @param encodings [Array<String>] encodings to include; defaults to all valid encodings
  # @return [Hash]
  def diffed_samples(encodings = valid_encodings)
    encodings.each_with_object({}) { |encoding, hash| hash[encoding] = diffed_sample(encoding) }
  end

  # Diffed samples for the first encoding of each group in
  # {#unique_valid_encoding_groups}.
  # @return [Hash]
  def unique_diffed_samples
    diffed_samples(unique_valid_encoding_groups.map(&:first))
  end

  private

  # Interprets the bytes of _binary_string_ as _encoding_ and transcodes them to UTF-8.
  # @return [String, nil] the UTF-8 string, or nil when the bytes are not valid in _encoding_
  def decode_binary_string(binary_string, encoding)
    encoded_string = binary_string.dup.force_encoding(encoding)
    return nil unless encoded_string.valid_encoding?

    encoded_string.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
  end

  # Decodes _binary_string_ with each of _encodings_.
  # @return [Hash] encoding name => decoded string (nil where invalid)
  def multi_decode_binary_string(binary_string, encodings)
    encodings.each_with_object({}) do |encoding, decoded_lines|
      decoded_lines[encoding] = decode_binary_string(binary_string, encoding)
    end
  end

  # Diffs every string against the longest common subsequence of all the strings.
  # @param array_of_strings [Array<String>]
  # @return [Array<String>] one diff output per input string, in the same order
  def diffed_strings(array_of_strings)
    lcs = array_of_strings.inject { |intermediate_lcs, string| Diff::LCS.LCS(intermediate_lcs, string).join }
    # The callbacks object writes into this buffer; it is reused for each string.
    diff_output = ''.dup
    callbacks = DiffCallbacks.new(diff_output, @diff_options)
    array_of_strings.map do |string|
      diff_output.clear
      Diff::LCS.traverse_sequences(lcs, string, callbacks)
      diff_output.dup
    end
  end

  # Builds (once) and returns the diffed samples for all valid encodings.
  # @return [Hash] encoding name => array of diffed sample lines
  def diffed_encoded_samples
    return @diffed_encoded_samples if @diffed_encoded_samples

    encodings = valid_encodings.freeze
    decoded_samples = samples(encodings)
    @diffed_encoded_samples = encodings.each_with_object({}) { |key, hash| hash[key] = [] }

    # Diff line by line: sample line i of every encoding is compared together.
    @binary_samples.size.times do |i|
      decoded_lines = encodings.map { |encoding| decoded_samples[encoding][i] }
      diffed_encoded_lines = diffed_strings(decoded_lines)
      encodings.each_with_index do |encoding, j|
        @diffed_encoded_samples[encoding] << diffed_encoded_lines[j]
      end
    end

    @diffed_encoded_samples.freeze
  end

end

#unique_valid_encoding_groups ⇒ Array (readonly)

Groups of valid encoding names, such that the encodings in a group all result in the same decoding for the target file.

Examples:

When ISO-8859-1 and ISO-8859-2 decode the target file in exactly the same way, but differently from ISO-8859-15:

[["ISO-8859-1", 'ISO-8859-2'], ["ISO-8859-15"]]

Returns:

  • (Array)


9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# File 'lib/encoding_sampler/sampler.rb', line 9

# Samples a file to work out which candidate encodings can decode it and how
# their decodings differ.
#
# On construction the whole file is read once in binary mode. An encoding that
# fails to decode any line is discarded. For every pair of remaining encodings,
# the first line on which the two decode differently is kept as a sample.
# Encodings that never decode differently are grouped together.
class Sampler

  # Full name of the target file used to create the sample.
  # @return [String]
  attr_reader :filename

  # Groups of valid encoding names, such that the encodings in a group all result in the same decoding for the target file.
  # @example When ISO-8859-1 and ISO-8859-2 decode the target file in exactly the same way, but differently from ISO-8859-15
  #   [["ISO-8859-1", "ISO-8859-2"], ["ISO-8859-15"]]
  # @return [Array<Array<String>>]
  attr_reader :unique_valid_encoding_groups

  # Reads the target file and determines the valid encodings, their groups and
  # the sample lines that distinguish them.
  # @param file_name [String] path of the file to sample
  # @param encodings [Array<String>] names of the candidate encodings; the array is not modified
  # @param diff_options [Hash] stored and handed to DiffCallbacks when samples are diffed
  def initialize(file_name, encodings, diff_options = {})
    @diff_options = diff_options
    @filename = file_name.dup.freeze
    @binary_samples = {}

    # Work on a copy so the caller's array (which may be frozen) is never changed.
    remaining = encodings.dup

    # Maps each pair of encoding names to the number of the first line that the
    # two decode differently, or nil while no such line has been seen.
    solutions = {}
    remaining.sort.combination(2).each { |pair| solutions[pair] = nil }

    # Read the entire file to verify encodings and collect samples for comparison of encodings.
    File.open(@filename, 'rb') do |file|
      file.each_line do |raw_line|
        binary_line = raw_line.strip
        lineno = file.lineno
        decoded_lines = multi_decode_binary_string(binary_line, remaining)

        # Eliminate any newly-invalid encodings from the scope.
        invalid = decoded_lines.select { |_encoding, decoded_line| decoded_line.nil? }.keys
        unless invalid.empty?
          remaining -= invalid
          solutions.delete_if { |pair, _solved_at| !(pair & invalid).empty? }
          # Samples are keyed by line number, so keep exactly those lines that
          # a surviving pair still refers to.
          needed_lines = solutions.values
          @binary_samples.keep_if { |sample_lineno, _string| needed_lines.include?(sample_lineno) }
        end

        # Record this line as a sample when it decodes differently for any two
        # previously-undifferentiated encodings.
        solutions.select { |_pair, solved_at| solved_at.nil? }.keys.each do |unsolved_pair|
          next if decoded_lines[unsolved_pair[0]] == decoded_lines[unsolved_pair[1]]

          solutions[unsolved_pair] = lineno
          @binary_samples[lineno] = binary_line
        end
      end
    end

    # Group undifferentiated encodings: pairs that were never told apart are
    # merged first, then every remaining encoding is added (as its own group
    # unless it already belongs to one).
    unsolved_pairs = solutions.select { |_pair, solved_at| solved_at.nil? }.keys
    groups = []
    (unsolved_pairs + remaining.map { |encoding| [encoding] }).each do |subgroup|
      group_index = groups.index { |group| !(group & subgroup).empty? }
      if group_index
        groups[group_index] |= subgroup
      else
        groups << subgroup
      end
    end

    @unique_valid_encoding_groups = groups.each(&:freeze).freeze
    @binary_samples.freeze
  end

  # Attribute renamed for clarity.
  # @deprecated Use {#unique_valid_encoding_groups} instead.
  def unique_valid_encodings
    unique_valid_encoding_groups
  end

  # All valid encodings.
  # @return [Array] Names of encodings that return valid results for the entire file.
  def valid_encodings
    unique_valid_encoding_groups.flatten
  end

  # Sample file lines, decoded by _encoding_.
  # @param encoding [String] name of the encoding to decode with
  # @return [Array] one decoded string per sample line (nil where the line is invalid in _encoding_)
  def sample(encoding)
    @binary_samples.each_value.map { |line| decode_binary_string(line, encoding) }
  end

  # Returns a hash of samples, keyed by encoding.
  # @param encodings [Array<String>] encodings to include; defaults to all valid encodings
  # @return [Hash]
  def samples(encodings = valid_encodings)
    encodings.each_with_object({}) { |encoding, hash| hash[encoding] = sample(encoding) }
  end

  # Returns all the "best" encodings. Assumes shortest strings are most likely to be correct.
  # Only the first encoding of each group is considered.
  # @return [Array]
  def best_encodings
    lengths = unique_samples.each_with_object({}) do |(encoding, lines), hash|
      hash[encoding] = lines.join('').size
    end
    min_length = lengths.values.min
    lengths.select { |_encoding, length| length == min_length }.keys
  end

  # Multiple encodings often return the exact same decoded sample.
  # Return only unique samples, keyed on the first encoding of each group in
  # {#unique_valid_encoding_groups}.
  # @return [Hash]
  def unique_samples
    samples(unique_valid_encoding_groups.map(&:first))
  end

  # Decoded sample, diffed against __all__ of the samples, and marked up to show differences.
  # @param [String] encoding
  # @return [Array] one marked-up string per sample line; nil when _encoding_ is not a valid encoding
  def diffed_sample(encoding)
    diffed_encoded_samples[encoding]
  end

  # Returns a hash of diffed samples, keyed by encoding.
  # @param encodings [Array<String>] encodings to include; defaults to all valid encodings
  # @return [Hash]
  def diffed_samples(encodings = valid_encodings)
    encodings.each_with_object({}) { |encoding, hash| hash[encoding] = diffed_sample(encoding) }
  end

  # Diffed samples for the first encoding of each group in
  # {#unique_valid_encoding_groups}.
  # @return [Hash]
  def unique_diffed_samples
    diffed_samples(unique_valid_encoding_groups.map(&:first))
  end

  private

  # Interprets the bytes of _binary_string_ as _encoding_ and transcodes them to UTF-8.
  # @return [String, nil] the UTF-8 string, or nil when the bytes are not valid in _encoding_
  def decode_binary_string(binary_string, encoding)
    encoded_string = binary_string.dup.force_encoding(encoding)
    return nil unless encoded_string.valid_encoding?

    encoded_string.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
  end

  # Decodes _binary_string_ with each of _encodings_.
  # @return [Hash] encoding name => decoded string (nil where invalid)
  def multi_decode_binary_string(binary_string, encodings)
    encodings.each_with_object({}) do |encoding, decoded_lines|
      decoded_lines[encoding] = decode_binary_string(binary_string, encoding)
    end
  end

  # Diffs every string against the longest common subsequence of all the strings.
  # @param array_of_strings [Array<String>]
  # @return [Array<String>] one diff output per input string, in the same order
  def diffed_strings(array_of_strings)
    lcs = array_of_strings.inject { |intermediate_lcs, string| Diff::LCS.LCS(intermediate_lcs, string).join }
    # The callbacks object writes into this buffer; it is reused for each string.
    diff_output = ''.dup
    callbacks = DiffCallbacks.new(diff_output, @diff_options)
    array_of_strings.map do |string|
      diff_output.clear
      Diff::LCS.traverse_sequences(lcs, string, callbacks)
      diff_output.dup
    end
  end

  # Builds (once) and returns the diffed samples for all valid encodings.
  # @return [Hash] encoding name => array of diffed sample lines
  def diffed_encoded_samples
    return @diffed_encoded_samples if @diffed_encoded_samples

    encodings = valid_encodings.freeze
    decoded_samples = samples(encodings)
    @diffed_encoded_samples = encodings.each_with_object({}) { |key, hash| hash[key] = [] }

    # Diff line by line: sample line i of every encoding is compared together.
    @binary_samples.size.times do |i|
      decoded_lines = encodings.map { |encoding| decoded_samples[encoding][i] }
      diffed_encoded_lines = diffed_strings(decoded_lines)
      encodings.each_with_index do |encoding, j|
        @diffed_encoded_samples[encoding] << diffed_encoded_lines[j]
      end
    end

    @diffed_encoded_samples.freeze
  end

end

Instance Method Details

#best_encodings ⇒ Array

Returns all the "best" encodings. Assumes shortest strings are most likely to be correct.

Returns:

  • (Array)


47
48
49
50
51
# File 'lib/encoding_sampler/sampler.rb', line 47

# Picks the "best" encodings among the first encoding of each group.
# An encoding's score is the total length of its decoded sample lines joined
# together; every encoding with the smallest score is returned.
# @return [Array]
def best_encodings
  representatives = unique_valid_encoding_groups.map(&:first)
  lengths = samples(representatives).each_with_object({}) do |(name, lines), memo|
    memo[name] = lines.join('').size
  end
  shortest = lengths.values.min
  lengths.select { |_name, size| size == shortest }.keys
end

#diffed_sample(encoding) ⇒ String

Decoded sample, diffed against all of the samples, and marked up to show differences.

Parameters:

  • encoding (String)

Returns:

  • (Array) — one marked-up string per sample line


64
65
66
# File 'lib/encoding_sampler/sampler.rb', line 64

# Looks up the diffed sample for one encoding.
# Returns whatever diffed_encoded_samples holds under the key _encoding_.
# @param [String] encoding
def diffed_sample(encoding)
  diffed_encoded_samples[encoding]
end

#diffed_samples(encodings = valid_encodings) ⇒ Object



68
69
70
# File 'lib/encoding_sampler/sampler.rb', line 68

# Builds a hash of diffed samples, keyed by encoding.
# @param encodings [Array] encodings to include; defaults to valid_encodings
# @return [Hash] each encoding mapped to the result of diffed_sample for it
def diffed_samples(encodings = valid_encodings)
  encodings.each_with_object({}) do |name, by_encoding|
    by_encoding[name] = diffed_sample(name)
  end
end

#sample(encoding) ⇒ Array

Sample file lines, decoded by encoding.

Returns:

  • (Array)


35
36
37
# File 'lib/encoding_sampler/sampler.rb', line 35

# Decodes every stored binary sample line with _encoding_.
# @param encoding the encoding handed to decode_binary_string for each line
# @return [Array] the decoded lines, in the order the samples are stored
def sample(encoding)
  @binary_samples.each_value.map do |raw_line|
    decode_binary_string(raw_line, encoding)
  end
end

#samples(encodings = valid_encodings) ⇒ Hash

Returns a hash of samples, keyed by encoding

Returns:

  • (Hash)


41
42
43
# File 'lib/encoding_sampler/sampler.rb', line 41

# Builds a hash of samples, keyed by encoding.
# @param encodings [Array] encodings to include; defaults to valid_encodings
# @return [Hash] each encoding mapped to the result of sample for it
def samples(encodings = valid_encodings)
  encodings.each_with_object({}) do |name, by_encoding|
    by_encoding[name] = sample(name)
  end
end

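#unique_diffed_samples ⇒ Object

Like #unique_samples (one entry per group of identically-decoding encodings, keyed on the first encoding of the group), but the samples are diffed.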



73
74
75
# File 'lib/encoding_sampler/sampler.rb', line 73

# Diffed samples for the first encoding of each group in
# unique_valid_encoding_groups.
# @return whatever diffed_samples returns for those encodings
def unique_diffed_samples
  representatives = unique_valid_encoding_groups.map(&:first)
  diffed_samples(representatives)
end

#unique_samples ⇒ Hash

Multiple encodings often return the exact same decoded sample. Return only unique samples, keyed on the first encoding to return each sample. What's first in each grouping is based on the original order of encodings given to the constructor.

Returns:

  • (Hash)


57
58
59
# File 'lib/encoding_sampler/sampler.rb', line 57

# Samples for the first encoding of each group in
# unique_valid_encoding_groups, so identically-decoding encodings appear once.
# @return whatever samples returns for those encodings
def unique_samples
  representatives = unique_valid_encoding_groups.map(&:first)
  samples(representatives)
end

#unique_valid_encodings ⇒ Object

Deprecated.

Attribute renamed for clarity.



23
24
25
# File 'lib/encoding_sampler/sampler.rb', line 23

# Old name for unique_valid_encoding_groups, kept so existing callers keep working.
# Simply returns unique_valid_encoding_groups.
# @deprecated Use unique_valid_encoding_groups instead.
def unique_valid_encodings
  unique_valid_encoding_groups
end

#valid_encodings ⇒ Array

All valid encodings.

Returns:

  • (Array)

    Names of encodings that return valid results for the entire file.



29
30
31
# File 'lib/encoding_sampler/sampler.rb', line 29

# All valid encodings as one flat list.
# Flattens the groups returned by unique_valid_encoding_groups.
# @return [Array] the flattened contents of unique_valid_encoding_groups
def valid_encodings
  unique_valid_encoding_groups.flatten
end