Class: EncodingSampler::Sampler
- Inherits:
-
Object
- Object
- EncodingSampler::Sampler
- Defined in:
- lib/encoding_sampler/sampler.rb
Instance Attribute Summary collapse
-
#filename ⇒ String
readonly
Full name of the target file used to create the sample.
-
#unique_valid_encoding_groups ⇒ Array
readonly
Groups of valid encoding names, such that the encodings in a group all result in the same decoding for the target file.
Instance Method Summary collapse
-
#best_encodings ⇒ Array
Returns all the "best" encodings.
-
#diffed_sample(encoding) ⇒ String
Decoded sample, diffed against all of the samples, and marked up to show differences.
- #diffed_samples(encodings = valid_encodings) ⇒ Object
-
#sample(encoding) ⇒ Array
Sample file lines, decoded by encoding.
-
#samples(encodings = valid_encodings) ⇒ Hash
Returns a hash of samples, keyed by encoding.
-
#unique_diffed_samples ⇒ Object
Same as #unique_samples, but the samples are diffed.
-
#unique_samples ⇒ Array
Multiple encodings often return the exact same decoded sample.
-
#unique_valid_encodings ⇒ Object
deprecated
Deprecated.
Use #unique_valid_encoding_groups instead.
-
#valid_encodings ⇒ Array
All valid encodings.
Instance Attribute Details
#filename ⇒ String (readonly)
Full name of the target file used to create the sample.
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
# File 'lib/encoding_sampler/sampler.rb', line 9

# Decodes a target file under a list of candidate encodings, discards every
# encoding that fails to decode some line, and keeps the lines on which the
# remaining encodings disagree as samples for comparison.
class Sampler
  # Full name of the target file used to create the sample.
  # @return [String]
  attr_reader :filename

  # Groups of valid encoding names, such that the encodings in a group all
  # result in the same decoding for the target file.
  # @example When ISO-8859-1 and ISO-8859-2 decode the target file in exactly the same way, but unlike ISO-8859-15,
  #   [["ISO-8859-1", "ISO-8859-2"], ["ISO-8859-15"]]
  # @return [Array<Array<String>>]
  attr_reader :unique_valid_encoding_groups

  # Attribute renamed for clarity.
  # @deprecated Use {#unique_valid_encoding_groups} instead.
  # @return [Array<Array<String>>]
  def unique_valid_encodings
    unique_valid_encoding_groups
  end

  # All valid encodings.
  # @return [Array] Names of encodings that return valid results for the entire file.
  def valid_encodings
    unique_valid_encoding_groups.flatten
  end

  # Sample file lines, decoded by _encoding_.
  # @param [String] encoding
  # @return [Array] one entry per stored sample line; an entry is nil when
  #   the line is not valid in _encoding_.
  def sample(encoding)
    @binary_samples.values.map { |line| decode_binary_string(line, encoding) }
  end

  # Returns a hash of samples, keyed by encoding.
  # @param [Array] encodings defaults to every valid encoding
  # @return [Hash]
  def samples(encodings = valid_encodings)
    encodings.each_with_object({}) { |encoding, hash| hash[encoding] = sample(encoding) }
  end

  # Returns all the "best" encodings. Assumes shortest strings are most likely to be correct.
  # @return [Array] group representatives whose joined sample is shortest;
  #   empty when there are no valid encodings.
  def best_encodings
    # Join each candidate's sample once and remember its length.
    lengths = unique_samples.each_with_object({}) do |(encoding, lines), totals|
      totals[encoding] = lines.join('').size
    end
    shortest = lengths.values.min
    lengths.keys.select { |encoding| lengths[encoding] == shortest }
  end

  # Multiple encodings often return the exact same decoded sample.
  # Return only unique samples, keyed on the first encoding of each group.
  # @return [Hash]
  def unique_samples
    samples(unique_valid_encoding_groups.map { |encoding_group| encoding_group.first })
  end

  # Decoded sample, diffed against __all__ of the samples, and marked up to show differences.
  # @param [String] encoding
  # @return [Array, nil] diffed sample lines; nil when _encoding_ is not a valid encoding.
  def diffed_sample(encoding)
    diffed_encoded_samples[encoding]
  end

  # Returns a hash of diffed samples, keyed by encoding.
  # @param [Array] encodings defaults to every valid encoding
  # @return [Hash]
  def diffed_samples(encodings = valid_encodings)
    encodings.each_with_object({}) { |encoding, hash| hash[encoding] = diffed_sample(encoding) }
  end

  # Like {#unique_samples}, but the samples are diffed.
  # @return [Hash]
  def unique_diffed_samples
    diffed_samples(unique_valid_encoding_groups.map { |encoding_group| encoding_group.first })
  end

  private

  # Reads the whole target file once, validating every candidate encoding and
  # collecting the lines that tell the candidates apart.
  #
  # NOTE(review): the third parameter's name was lost in the extracted listing;
  # `diff_options` is restored from its use in @diff_options -- confirm against
  # the upstream source.
  #
  # @param [String] file_name path of the target file
  # @param [Array] encodings candidate encoding names (not modified)
  # @param [Hash] diff_options passed through to DiffCallbacks
  def initialize(file_name, encodings, diff_options = {})
    @diff_options = diff_options
    @filename = file_name.dup.freeze
    # Work on a copy: invalid encodings are deleted below and the caller's
    # array must not be altered (it may also be frozen).
    encodings = encodings.dup
    @unique_valid_encoding_groups = []
    @binary_samples = {}

    # solutions maps each pair of encodings to the number of the first line
    # that decodes differently for the two, or nil while no such line is known.
    solutions = {}
    encodings.sort.combination(2).each { |pair| solutions[pair] = nil }

    # read the entire file to verify encodings and collect samples for comparison of encodings
    File.open(@filename, 'rb') do |file|
      until file.eof?
        binary_line = file.readline.strip
        decoded_lines = multi_decode_binary_string(binary_line, encodings)

        # eliminate any newly-invalid encodings from the scope
        invalid_encodings = decoded_lines.select { |_encoding, decoded_line| decoded_line.nil? }.keys
        invalid_encodings.each do |invalid_encoding|
          encodings.delete invalid_encoding
          solutions.delete_if { |pair, _lineno| pair.include? invalid_encoding }
        end
        # Keep only the samples that still differentiate a remaining pair.
        # Samples are keyed by line number, so compare against the solutions'
        # values (line numbers), not their keys (encoding names).
        @binary_samples.keep_if { |lineno, _line| solutions.value?(lineno) } unless invalid_encodings.empty?

        # add sample to solutions when binary string decodes differently for any two previously-undifferentiated encodings
        solutions.select { |_pair, lineno| lineno.nil? }.keys.each do |unsolved_pair|
          next if decoded_lines[unsolved_pair[0]] == decoded_lines[unsolved_pair[1]]

          solutions[unsolved_pair] = file.lineno
          @binary_samples[file.lineno] = binary_line
        end
      end
    end

    # group undifferentiated encodings
    undifferentiated_pairs = solutions.select { |_pair, lineno| lineno.nil? }.keys
    (undifferentiated_pairs + encodings.map { |encoding| [encoding] }).each do |subgroup|
      group_index = @unique_valid_encoding_groups.index { |group| !(group & subgroup).empty? }
      if group_index
        @unique_valid_encoding_groups[group_index] |= subgroup
      else
        @unique_valid_encoding_groups << subgroup
      end
    end

    @unique_valid_encoding_groups.each { |group| group.freeze }.freeze
    @binary_samples.freeze
  end

  # Interprets a copy of _binary_string_ as _encoding_ and transcodes it to UTF-8.
  # @return [String, nil] nil when the bytes are not valid in _encoding_
  def decode_binary_string(binary_string, encoding)
    encoded_string = binary_string.dup.force_encoding(encoding)
    return nil unless encoded_string.valid_encoding?

    encoded_string.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
  end

  # Decodes _binary_string_ under each of _encodings_.
  # @return [Hash] decoded string (or nil) keyed by encoding
  def multi_decode_binary_string(binary_string, encodings)
    encodings.each_with_object({}) do |encoding, decoded_lines|
      decoded_lines[encoding] = decode_binary_string(binary_string, encoding)
    end
  end

  # Diffs every string against the longest common subsequence of all of them.
  # @return [Array<String>] one marked-up string per input string
  def diffed_strings(array_of_strings)
    lcs = array_of_strings.inject { |intermediate_lcs, string| Diff::LCS.LCS(intermediate_lcs, string).join }
    # The callbacks write into diff_output, which is reused for every string.
    callbacks = DiffCallbacks.new(diff_output = '', @diff_options)

    array_of_strings.map do |string|
      diff_output.clear
      Diff::LCS.traverse_sequences(lcs, string, callbacks)
      diff_output.dup
    end
  end

  # Memoized hash of diffed sample lines, keyed by valid encoding.
  # @return [Hash]
  def diffed_encoded_samples
    return @diffed_encoded_samples if @diffed_encoded_samples

    encodings = valid_encodings.freeze
    decoded_samples = samples(encodings)
    @diffed_encoded_samples = encodings.each_with_object({}) { |key, hash| hash[key] = [] }

    @binary_samples.values.each_index do |i|
      decoded_lines = encodings.map { |encoding| decoded_samples[encoding][i] }
      diffed_encoded_lines = diffed_strings(decoded_lines)
      encodings.each_index { |j| @diffed_encoded_samples[encodings[j]] << diffed_encoded_lines[j] }
    end

    @diffed_encoded_samples.freeze
  end
end
#unique_valid_encoding_groups ⇒ Array (readonly)
Groups of valid encoding names, such that the encodings in a group all result in the same decoding for the target file.
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
# File 'lib/encoding_sampler/sampler.rb', line 9

# Decodes a target file under a list of candidate encodings, discards every
# encoding that fails to decode some line, and keeps the lines on which the
# remaining encodings disagree as samples for comparison.
class Sampler
  # Full name of the target file used to create the sample.
  # @return [String]
  attr_reader :filename

  # Groups of valid encoding names, such that the encodings in a group all
  # result in the same decoding for the target file.
  # @example When ISO-8859-1 and ISO-8859-2 decode the target file in exactly the same way, but unlike ISO-8859-15,
  #   [["ISO-8859-1", "ISO-8859-2"], ["ISO-8859-15"]]
  # @return [Array<Array<String>>]
  attr_reader :unique_valid_encoding_groups

  # Attribute renamed for clarity.
  # @deprecated Use {#unique_valid_encoding_groups} instead.
  # @return [Array<Array<String>>]
  def unique_valid_encodings
    unique_valid_encoding_groups
  end

  # All valid encodings.
  # @return [Array] Names of encodings that return valid results for the entire file.
  def valid_encodings
    unique_valid_encoding_groups.flatten
  end

  # Sample file lines, decoded by _encoding_.
  # @param [String] encoding
  # @return [Array] one entry per stored sample line; an entry is nil when
  #   the line is not valid in _encoding_.
  def sample(encoding)
    @binary_samples.values.map { |line| decode_binary_string(line, encoding) }
  end

  # Returns a hash of samples, keyed by encoding.
  # @param [Array] encodings defaults to every valid encoding
  # @return [Hash]
  def samples(encodings = valid_encodings)
    encodings.each_with_object({}) { |encoding, hash| hash[encoding] = sample(encoding) }
  end

  # Returns all the "best" encodings. Assumes shortest strings are most likely to be correct.
  # @return [Array] group representatives whose joined sample is shortest;
  #   empty when there are no valid encodings.
  def best_encodings
    # Join each candidate's sample once and remember its length.
    lengths = unique_samples.each_with_object({}) do |(encoding, lines), totals|
      totals[encoding] = lines.join('').size
    end
    shortest = lengths.values.min
    lengths.keys.select { |encoding| lengths[encoding] == shortest }
  end

  # Multiple encodings often return the exact same decoded sample.
  # Return only unique samples, keyed on the first encoding of each group.
  # @return [Hash]
  def unique_samples
    samples(unique_valid_encoding_groups.map { |encoding_group| encoding_group.first })
  end

  # Decoded sample, diffed against __all__ of the samples, and marked up to show differences.
  # @param [String] encoding
  # @return [Array, nil] diffed sample lines; nil when _encoding_ is not a valid encoding.
  def diffed_sample(encoding)
    diffed_encoded_samples[encoding]
  end

  # Returns a hash of diffed samples, keyed by encoding.
  # @param [Array] encodings defaults to every valid encoding
  # @return [Hash]
  def diffed_samples(encodings = valid_encodings)
    encodings.each_with_object({}) { |encoding, hash| hash[encoding] = diffed_sample(encoding) }
  end

  # Like {#unique_samples}, but the samples are diffed.
  # @return [Hash]
  def unique_diffed_samples
    diffed_samples(unique_valid_encoding_groups.map { |encoding_group| encoding_group.first })
  end

  private

  # Reads the whole target file once, validating every candidate encoding and
  # collecting the lines that tell the candidates apart.
  #
  # NOTE(review): the third parameter's name was lost in the extracted listing;
  # `diff_options` is restored from its use in @diff_options -- confirm against
  # the upstream source.
  #
  # @param [String] file_name path of the target file
  # @param [Array] encodings candidate encoding names (not modified)
  # @param [Hash] diff_options passed through to DiffCallbacks
  def initialize(file_name, encodings, diff_options = {})
    @diff_options = diff_options
    @filename = file_name.dup.freeze
    # Work on a copy: invalid encodings are deleted below and the caller's
    # array must not be altered (it may also be frozen).
    encodings = encodings.dup
    @unique_valid_encoding_groups = []
    @binary_samples = {}

    # solutions maps each pair of encodings to the number of the first line
    # that decodes differently for the two, or nil while no such line is known.
    solutions = {}
    encodings.sort.combination(2).each { |pair| solutions[pair] = nil }

    # read the entire file to verify encodings and collect samples for comparison of encodings
    File.open(@filename, 'rb') do |file|
      until file.eof?
        binary_line = file.readline.strip
        decoded_lines = multi_decode_binary_string(binary_line, encodings)

        # eliminate any newly-invalid encodings from the scope
        invalid_encodings = decoded_lines.select { |_encoding, decoded_line| decoded_line.nil? }.keys
        invalid_encodings.each do |invalid_encoding|
          encodings.delete invalid_encoding
          solutions.delete_if { |pair, _lineno| pair.include? invalid_encoding }
        end
        # Keep only the samples that still differentiate a remaining pair.
        # Samples are keyed by line number, so compare against the solutions'
        # values (line numbers), not their keys (encoding names).
        @binary_samples.keep_if { |lineno, _line| solutions.value?(lineno) } unless invalid_encodings.empty?

        # add sample to solutions when binary string decodes differently for any two previously-undifferentiated encodings
        solutions.select { |_pair, lineno| lineno.nil? }.keys.each do |unsolved_pair|
          next if decoded_lines[unsolved_pair[0]] == decoded_lines[unsolved_pair[1]]

          solutions[unsolved_pair] = file.lineno
          @binary_samples[file.lineno] = binary_line
        end
      end
    end

    # group undifferentiated encodings
    undifferentiated_pairs = solutions.select { |_pair, lineno| lineno.nil? }.keys
    (undifferentiated_pairs + encodings.map { |encoding| [encoding] }).each do |subgroup|
      group_index = @unique_valid_encoding_groups.index { |group| !(group & subgroup).empty? }
      if group_index
        @unique_valid_encoding_groups[group_index] |= subgroup
      else
        @unique_valid_encoding_groups << subgroup
      end
    end

    @unique_valid_encoding_groups.each { |group| group.freeze }.freeze
    @binary_samples.freeze
  end

  # Interprets a copy of _binary_string_ as _encoding_ and transcodes it to UTF-8.
  # @return [String, nil] nil when the bytes are not valid in _encoding_
  def decode_binary_string(binary_string, encoding)
    encoded_string = binary_string.dup.force_encoding(encoding)
    return nil unless encoded_string.valid_encoding?

    encoded_string.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
  end

  # Decodes _binary_string_ under each of _encodings_.
  # @return [Hash] decoded string (or nil) keyed by encoding
  def multi_decode_binary_string(binary_string, encodings)
    encodings.each_with_object({}) do |encoding, decoded_lines|
      decoded_lines[encoding] = decode_binary_string(binary_string, encoding)
    end
  end

  # Diffs every string against the longest common subsequence of all of them.
  # @return [Array<String>] one marked-up string per input string
  def diffed_strings(array_of_strings)
    lcs = array_of_strings.inject { |intermediate_lcs, string| Diff::LCS.LCS(intermediate_lcs, string).join }
    # The callbacks write into diff_output, which is reused for every string.
    callbacks = DiffCallbacks.new(diff_output = '', @diff_options)

    array_of_strings.map do |string|
      diff_output.clear
      Diff::LCS.traverse_sequences(lcs, string, callbacks)
      diff_output.dup
    end
  end

  # Memoized hash of diffed sample lines, keyed by valid encoding.
  # @return [Hash]
  def diffed_encoded_samples
    return @diffed_encoded_samples if @diffed_encoded_samples

    encodings = valid_encodings.freeze
    decoded_samples = samples(encodings)
    @diffed_encoded_samples = encodings.each_with_object({}) { |key, hash| hash[key] = [] }

    @binary_samples.values.each_index do |i|
      decoded_lines = encodings.map { |encoding| decoded_samples[encoding][i] }
      diffed_encoded_lines = diffed_strings(decoded_lines)
      encodings.each_index { |j| @diffed_encoded_samples[encodings[j]] << diffed_encoded_lines[j] }
    end

    @diffed_encoded_samples.freeze
  end
end
Instance Method Details
#best_encodings ⇒ Array
Returns all the "best" encodings. Assumes shortest strings are most likely to be correct.
47 48 49 50 51 |
# File 'lib/encoding_sampler/sampler.rb', line 47

# Returns all the "best" encodings. Assumes shortest strings are most likely to be correct.
# Only the first encoding of each group in unique_valid_encoding_groups is considered.
# @return [Array] the candidates whose joined sample is shortest; empty when
#   there are no candidates.
def best_encodings
  representatives = unique_valid_encoding_groups.map { |encoding_group| encoding_group.first }
  # Join each candidate's sample once and remember its length.
  lengths = samples(representatives).each_with_object({}) do |(encoding, lines), totals|
    totals[encoding] = lines.join('').size
  end
  shortest = lengths.values.min
  lengths.keys.select { |encoding| lengths[encoding] == shortest }
end
#diffed_sample(encoding) ⇒ String
Decoded sample, diffed against all of the samples, and marked up to show differences.
64 65 66 |
# File 'lib/encoding_sampler/sampler.rb', line 64

# Decoded sample, diffed against __all__ of the samples, and marked up to show differences.
# Looks the encoding up in the hash returned by diffed_encoded_samples.
# @param [String] encoding
# @return [Array, nil] the diffed sample lines stored for _encoding_; nil when
#   there is no entry for it.
def diffed_sample(encoding)
  diffed_encoded_samples[encoding]
end
#diffed_samples(encodings = valid_encodings) ⇒ Object
68 69 70 |
# File 'lib/encoding_sampler/sampler.rb', line 68

# Returns a hash of diffed samples, keyed by encoding.
# @param [Array] encodings encodings to include; defaults to valid_encodings
# @return [Hash] the result of diffed_sample for each encoding, in the order given
def diffed_samples(encodings = valid_encodings)
  encodings.each_with_object({}) { |encoding, hash| hash[encoding] = diffed_sample(encoding) }
end
#sample(encoding) ⇒ Array
Sample file lines, decoded by encoding.
35 36 37 |
# File 'lib/encoding_sampler/sampler.rb', line 35

# Sample file lines, decoded by _encoding_.
# Each stored binary sample line is passed through decode_binary_string.
# @param [String] encoding
# @return [Array] one decoded entry per stored sample line, in stored order
def sample(encoding)
  @binary_samples.values.map { |line| decode_binary_string(line, encoding) }
end
#samples(encodings = valid_encodings) ⇒ Hash
Returns a hash of samples, keyed by encoding
41 42 43 |
# File 'lib/encoding_sampler/sampler.rb', line 41

# Returns a hash of samples, keyed by encoding.
# @param [Array] encodings encodings to include; defaults to valid_encodings
# @return [Hash] the result of sample for each encoding, in the order given
def samples(encodings = valid_encodings)
  encodings.each_with_object({}) { |encoding, hash| hash[encoding] = sample(encoding) }
end
#unique_diffed_samples ⇒ Object
Same as #unique_samples, but the samples are diffed.
73 74 75 |
# File 'lib/encoding_sampler/sampler.rb', line 73

# Diffed samples for the first encoding of each group in
# unique_valid_encoding_groups.
# @return [Hash] whatever diffed_samples returns for those encodings
def unique_diffed_samples
  diffed_samples(unique_valid_encoding_groups.map { |encoding_group| encoding_group.first })
end
#unique_samples ⇒ Array
Multiple encodings often return the exact same decoded sample. Return only unique samples, keyed on the first encoding to return each sample. What's first in each grouping is based on original order of encodings given to the constructor.
57 58 59 |
# File 'lib/encoding_sampler/sampler.rb', line 57

# Multiple encodings often return the exact same decoded sample.
# Return only unique samples, keyed on the first encoding of each group in
# unique_valid_encoding_groups.
# @return [Hash] whatever samples returns for those encodings
def unique_samples
  samples(unique_valid_encoding_groups.map { |encoding_group| encoding_group.first })
end
#unique_valid_encodings ⇒ Object
Use #unique_valid_encoding_groups instead.
Attribute renamed for clarity.
23 24 25 |
# File 'lib/encoding_sampler/sampler.rb', line 23

# Attribute renamed for clarity; this is a plain alias kept for old callers.
# @deprecated Use {#unique_valid_encoding_groups} instead.
# @return [Array] the value of unique_valid_encoding_groups, unchanged
def unique_valid_encodings
  unique_valid_encoding_groups
end
#valid_encodings ⇒ Array
All valid encodings.
29 30 31 |
# File 'lib/encoding_sampler/sampler.rb', line 29

# All valid encodings: the groups in unique_valid_encoding_groups flattened
# into a single list, in group order.
# @return [Array] Names of encodings that return valid results for the entire file.
def valid_encodings
  unique_valid_encoding_groups.flatten
end