Class: TbxImporter::Tbx

Inherits:
Object
  • Object
show all
Defined in:
lib/tbx_importer.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(file_path:, **args) ⇒ Tbx

Returns a new instance of Tbx.



10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# File 'lib/tbx_importer.rb', line 10

# Prepares an importer for the TBX file at +file_path+ and settles on the
# encoding used to read it.
#
# When no +:encoding+ option is given, the encoding is detected from the first
# ~100 KB of the file; if detection yields nothing, the +encoding="..."+
# attribute found in the file content is used instead.
#
# @param file_path [String] path of the TBX file
# @param args [Hash] optional settings; only +:encoding+ is read here
# @option args [String] :encoding 'UTF-8', 'UTF-16LE' or 'UTF-16BE'
#   (upcased before use); detected from the file when omitted
# @raise [RuntimeError] if the encoding cannot be determined, or is not one of
#   UTF-8, UTF-16LE, UTF-16BE
def initialize(file_path:, **args)
  @file_path = file_path
  # An explicit 'UTF-8' skips loading the raw content. File.read takes the
  # path directly, so no extra file handle is opened and left unclosed.
  @content = File.read(@file_path) unless args[:encoding].eql?('UTF-8')
  if args[:encoding].nil?
    detected = CharlockHolmes::EncodingDetector.detect(@content[0..100_000])
    # Tolerate a detector result that carries no match at all.
    @encoding = detected && detected[:encoding]
    if @encoding.nil?
      # Work on a copy so @content keeps its original bytes. Non-bang calls are
      # used because bang variants such as gsub! return nil when nothing
      # changes, which would break the chain for content without NUL bytes.
      readable = @content.dup.force_encoding('utf-8').scrub('*').delete("\0")
      # [^"]* stops at the attribute's closing quote, so later attributes on
      # the same line (e.g. standalone="yes") are not swallowed.
      declared = readable[/(?<=encoding=")[^"]*(?=")/]
      case declared.to_s.upcase
      when 'UTF-8'
        @encoding = 'UTF-8'
      when 'UTF-16', 'UTF-16LE'
        # A bare UTF-16 declaration is treated as little-endian.
        @encoding = 'UTF-16LE'
      when 'UTF-16BE'
        @encoding = 'UTF-16BE'
      end
    end
  else
    @encoding = args[:encoding].upcase
  end
  # Parsing state, reset for every new importer instance.
  @doc = {
    source_language: "",
    tc: { id: "", counter: 0, vals: [], lang: "", definition: "" },
    term: { lang: "", counter: 0, vals: [], part_of_speech: "" },
    language_pairs: [],
    term_entry: false
  }
  raise "Encoding type could not be determined. Please set an encoding of UTF-8, UTF-16LE, or UTF-16BE" if @encoding.nil?
  raise "Encoding type not supported. Please choose an encoding of UTF-8, UTF-16LE, or UTF-16BE" unless @encoding.eql?('UTF-8') || @encoding.eql?('UTF-16LE') || @encoding.eql?('UTF-16BE')
  # Non-UTF-8 content is converted once up front and kept in @text.
  @text = CharlockHolmes::Converter.convert(@content, @encoding, 'UTF-8') unless @encoding.eql?('UTF-8')
end

Instance Attribute Details

#encoding ⇒ Object (readonly)

Returns the value of attribute encoding.



9
10
11
# File 'lib/tbx_importer.rb', line 9

# Reader for the encoding name stored in @encoding by the constructor
# (either the upcased :encoding option or the value resolved from the file).
def encoding
  @encoding
end

#file_path ⇒ Object (readonly)

Returns the value of attribute file_path.



9
10
11
# File 'lib/tbx_importer.rb', line 9

# Reader for the file path that was passed to the constructor as +file_path:+.
def file_path
  @file_path
end

Instance Method Details

#import ⇒ Object



47
48
49
50
51
# File 'lib/tbx_importer.rb', line 47

# Reads the file, feeds the reader to the parser, and hands back what was
# collected in @doc.
#
# @return [Array] two elements: the +:vals+ list of the +:tc+ section followed
#   by the +:vals+ list of the +:term+ section
def import
  parse_file(read_file)
  @doc.values_at(:tc, :term).map { |section| section[:vals] }
end

#stats ⇒ Object



38
39
40
41
42
43
44
45
# File 'lib/tbx_importer.rb', line 38

# Runs the analysis pass that matches the file encoding, then reports the
# counters accumulated in @doc.
#
# @return [Hash] +:tc_count+ and +:term_count+ (the two section counters) plus
#   +:language_pairs+ with duplicates removed
def stats
  utf8 = encoding.eql?('UTF-8')
  analyze_stats_utf_8 if utf8
  analyze_stats_utf_16 unless utf8

  concepts = @doc[:tc]
  terms = @doc[:term]
  {
    tc_count: concepts[:counter],
    term_count: terms[:counter],
    language_pairs: @doc[:language_pairs].uniq
  }
end