Module: TableData::Detection
- Defined in:
- lib/tabledata/detection.rb
Constant Summary collapse
- UnlikelyCharsWin1252 =
"\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD" \
"\xAE\xAF\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB" \
"\xBC\xBD\xBE\xBF\xD7\xF7"
- UnlikelyCharsIso8859_1 =
""
- UnlikelyCharsMacRoman =
""
- UmlautsMac =
"äöü".encode(Encoding::MacRoman).force_encoding(Encoding::BINARY)
- UmlautsWin =
"äöü".encode(Encoding::Windows_1252).force_encoding(Encoding::BINARY)
- DiacritsMac =
"âàéèô".encode(Encoding::MacRoman).force_encoding(Encoding::BINARY)
- DiacritsWin =
"âàéèô".encode(Encoding::Windows_1252).force_encoding(Encoding::BINARY)
Class Method Summary collapse
- .file_type_from_path(path) ⇒ Object
- .force_guessed_encoding!(string) ⇒ Object
- .guess_csv_delimiter(csv, out_of = [',',';']) ⇒ Object
- .guess_encoding(string) ⇒ Object
Class Method Details
.file_type_from_path(path) ⇒ Object
67 68 69 70 71 72 73 74 |
# File 'lib/tabledata/detection.rb', line 67

# Determines the table file type from the extension at the end of +path+.
#
# The extension is matched case-insensitively and must be the very last
# thing in the string: "REPORT.CSV" is accepted, while a multi-line value
# such as "a.csv\nb.exe" is rejected (a `$` anchor would only require the
# extension to end *some line* of the string).
#
# @param path [String] file name or path to inspect
# @return [Symbol] :csv, :xls or :xlsx
# @raise [InvalidFileType] if the path ends in none of the known extensions
def file_type_from_path(path)
  case path
  when /\.csv\z/i  then :csv
  when /\.xls\z/i  then :xls
  when /\.xlsx\z/i then :xlsx
  else raise InvalidFileType, "Unknown file format for path #{path.inspect}"
  end
end
.force_guessed_encoding!(string) ⇒ Object
25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
# File 'lib/tabledata/detection.rb', line 25

# Guesses the encoding of +string+ and tags the string with it, in place.
#
# No bytes are transcoded; only the encoding label of +string+ changes.
# The candidates are UTF-8, Windows-1252, ISO-8859-15 and MacRoman:
#
# 1. If the bytes are valid UTF-8, the string is tagged UTF-8.
# 2. Otherwise candidates are ruled out by bytes this code treats as
#    unmapped in them; if only one candidate is left, it wins.
# 3. Otherwise the first 10,000 bytes are searched for the MacRoman and
#    Windows-1252 byte forms of "äöü" and then of "âàéèô"; whichever side
#    occurs more often decides.
# 4. On a tie in both comparisons, Windows-1252 is used.
#
# @param string [String] the string to tag; it is modified in place
# @return [String] the same +string+ object, carrying the guessed encoding
def force_guessed_encoding!(string)
  return string if string.force_encoding(Encoding::UTF_8).valid_encoding?

  string.force_encoding(Encoding::BINARY)

  # check for non-mapped codepoints
  possible_encodings = [Encoding::Windows_1252, Encoding::ISO8859_15, Encoding::MacRoman]
  possible_encodings.delete(Encoding::ISO8859_15) if string =~ /[\x80-\x9f]/n
  possible_encodings.delete(Encoding::Windows_1252) if string =~ /[\x81\x8D\x8F\x90\x9D]/n
  return string.force_encoding(possible_encodings.first) if possible_encodings.size == 1

  # Disabled heuristic, kept for reference:
  # # check for occurrences of characters with weighted expectancy
  # # e.g. a "§" is quite unlikely
  # win = string[0,10_000].count(UnlikelyCharsWin1252)
  # iso = string[0,10_000].count(UnlikelyCharsIso8859_1)
  # mac = string[0,10_000].count(UnlikelyCharsMacRoman)

  # Slice the inspected prefix once; every comparison below reuses it.
  sample = string[0, 10_000]

  # First compare occurrences of äöü, then of âàéèô. The first comparison
  # that is not a tie decides between MacRoman and Windows-1252.
  [[UmlautsMac, UmlautsWin], [DiacritsMac, DiacritsWin]].each do |mac_chars, win_chars|
    case sample.count(mac_chars) <=> sample.count(win_chars)
    when -1 then return string.force_encoding(Encoding::Windows_1252)
    when 1  then return string.force_encoding(Encoding::MacRoman)
    end
  end

  # Bias for Windows_1252
  string.force_encoding(Encoding::Windows_1252)
end
.guess_csv_delimiter(csv, out_of = [',',';']) ⇒ Object
61 62 63 64 65 |
# File 'lib/tabledata/detection.rb', line 61

# Guesses which of the candidate delimiters a CSV string uses by counting
# how often each one occurs in the first 10,000 characters of +csv+.
#
# Each candidate is converted to the encoding of +csv+ before counting, and
# the returned delimiter carries that encoding. When several candidates
# occur equally often (for example in an empty string), the one listed
# first in +out_of+ is returned.
#
# @param csv [String] the CSV data to inspect
# @param out_of [Array<String>] candidate delimiters, in order of preference
# @return [String, nil] the most frequent candidate, or nil if +out_of+ is empty
def guess_csv_delimiter(csv, out_of = [',', ';'])
  return nil if out_of.empty?

  # Slice the inspected prefix once instead of once per candidate.
  sample     = csv[0, 10_000]
  candidates = out_of.map { |delimiter| delimiter.encode(csv.encoding) }
  candidates.max_by { |delimiter| sample.count(delimiter) }
end
.guess_encoding(string) ⇒ Object
57 58 59 |
# File 'lib/tabledata/detection.rb', line 57

# Returns the encoding guessed for +string+ without modifying it.
#
# The guess is made on a duplicate of +string+, which is handed to
# force_guessed_encoding!; the encoding of that call's result is returned.
#
# @param string [String] the string whose encoding should be guessed
# @return [Encoding] the guessed encoding
def guess_encoding(string)
  copy = string.dup
  force_guessed_encoding!(copy).encoding
end