Class: TbxImporter::Tbx

Inherits:
Object
  • Object
show all
Defined in:
lib/tbx_importer.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(file_path:, **args) ⇒ Tbx



10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# File 'lib/tbx_importer.rb', line 10

# Prepares an importer for the TBX file at +file_path+ and settles on the
# encoding it will be processed with.
#
# When no +:encoding+ is supplied, the encoding is detected from the leading
# part of the file via CharlockHolmes; if detection yields nothing, the
# +encoding="..."+ declaration inside the file is consulted instead.
#
# @param file_path [String] path of the TBX file
# @param args [Hash] optional settings; only +:encoding+ is read here
#   ('UTF-8', 'UTF-16LE' or 'UTF-16BE', case-insensitive)
# @raise [RuntimeError] if the encoding cannot be determined or is not one of
#   the three supported values
def initialize(file_path:, **args)
  @file_path = file_path
  # File.read opens and closes the file itself. The content is not loaded when
  # the caller explicitly states 'UTF-8'.
  @content = File.read(@file_path) unless args[:encoding].eql?('UTF-8')
  if args[:encoding].nil?
    @encoding = CharlockHolmes::EncodingDetector.detect(@content[0..100_000])[:encoding]
    if @encoding.nil?
      # Non-bang scrub/gsub always return a String (gsub! returns nil when the
      # text holds no NUL bytes). Stripping NULs makes a UTF-16 declaration
      # readable as plain ASCII. dup is kept because force_encoding mutates.
      header_text = @content.dup.force_encoding('utf-8').scrub('*').gsub(/\0/, '')
      # [^"]* stops at the closing quote of the encoding attribute, so later
      # attributes on the same line are not swallowed; &. tolerates files
      # without any declaration, letting the descriptive raise below fire.
      encoding_in_file = header_text[/(?<=encoding=")[^"]*(?=")/]&.upcase
      if encoding_in_file.eql?('UTF-8')
        @encoding = 'UTF-8'
      elsif encoding_in_file.eql?('UTF-16')
        # A bare UTF-16 declaration is treated as little-endian.
        @encoding = 'UTF-16LE'
      end
    end
  else
    @encoding = args[:encoding].upcase
  end
  # Accumulator read by #import (the :vals lists) and #stats (the counters and
  # language pairs).
  @doc = {
    source_language: "",
    tc: { id: "", counter: 0, vals: [], lang: "", definition: "" },
    term: { lang: "", counter: 0, vals: [], part_of_speech: "", term: "" },
    language_pairs: [],
    term_entry: false
  }
  raise "Encoding type could not be determined. Please set an encoding of UTF-8, UTF-16LE, or UTF-16BE" if @encoding.nil?
  raise "Encoding type not supported. Please choose an encoding of UTF-8, UTF-16LE, or UTF-16BE" unless %w[UTF-8 UTF-16LE UTF-16BE].include?(@encoding)
  # @text is only populated for non-UTF-8 input, as a UTF-8 transcoding of the content.
  @text = CharlockHolmes::Converter.convert(@content, @encoding, 'UTF-8') unless @encoding.eql?('UTF-8')
end

Instance Attribute Details

#encoding ⇒ Object (readonly)

Returns the value of attribute encoding.



9
10
11
# File 'lib/tbx_importer.rb', line 9

# Reader for @encoding, which the constructor sets to the upcased
# caller-supplied encoding or to the detected one.
def encoding
  @encoding
end

#file_path ⇒ Object (readonly)

Returns the value of attribute file_path.



9
10
11
# File 'lib/tbx_importer.rb', line 9

# Reader for @file_path, the path handed to the constructor.
def file_path
  @file_path
end

Instance Method Details

#import ⇒ Object



47
48
49
50
51
# File 'lib/tbx_importer.rb', line 47

# Feeds the result of read_file to parse_file, then hands back what was
# collected in @doc.
#
# @return [Array] two elements: the :vals list of @doc[:tc] followed by the
#   :vals list of @doc[:term]
def import
  parse_file(read_file)
  @doc.values_at(:tc, :term).map { |section| section[:vals] }
end

#stats ⇒ Object



38
39
40
41
42
43
44
45
# File 'lib/tbx_importer.rb', line 38

# Runs the analysis routine matching the current encoding (the UTF-8 variant
# for 'UTF-8', the UTF-16 variant for anything else) and summarises @doc.
#
# @return [Hash] :tc_count and :term_count taken from the @doc counters, and
#   :language_pairs with duplicates removed
def stats
  encoding.eql?('UTF-8') ? analyze_stats_utf_8 : analyze_stats_utf_16
  tc_section, term_section = @doc.values_at(:tc, :term)
  {
    tc_count: tc_section[:counter],
    term_count: term_section[:counter],
    language_pairs: @doc[:language_pairs].uniq
  }
end