Class: Embulk::Guess::CharsetGuessPlugin

Inherits:
Embulk::GuessPlugin show all
Defined in:
lib/embulk/guess/charset.rb

Instance Method Summary collapse

Methods inherited from Embulk::GuessPlugin

from_java, new_java

Instance Method Details

#guess(config, sample_buffer) ⇒ Object



7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# File 'lib/embulk/guess/charset.rb', line 7

# Guesses the character set of the sampled input using ICU4J's
# CharsetDetector and returns it as a parser config fragment.
#
# "UTF-8" is reported when the detector yields no match at all, when the
# best match has a confidence below 50, or when the best match is
# "ISO-8859-1"; otherwise the detected charset name is reported as-is.
#
# @param config [Object] guess configuration; not consulted by this method
# @param sample_buffer [#to_java_bytes] sampled input to run detection on
# @return [Hash] a hash of the form {"parser" => {"charset" => name}}
def guess(config, sample_buffer)
  # ICU4J
  detector = com.ibm.icu.text.CharsetDetector.new
  detector.setText(sample_buffer.to_java_bytes)
  best_match = detector.detect

  name =
    if best_match.nil? || best_match.getConfidence < 50
      # CharsetDetector#detect returns null (nil here) when no charset
      # matches; treat that the same as a low-confidence match instead of
      # failing on nil.
      "UTF-8"
    else
      detected = best_match.getName
      # With a small sample, an ISO-8859-1 result usually just means the
      # data is plain ASCII, which is a subset of UTF-8 in most cases.
      detected == "ISO-8859-1" ? "UTF-8" : detected
    end

  {"parser" => {"charset" => name}}
end