Class: TbxImporter::Tbx
- Inherits:
-
Object
- Object
- TbxImporter::Tbx
- Defined in:
- lib/tbx_importer.rb
Instance Attribute Summary collapse
-
#encoding ⇒ Object
readonly
Returns the value of attribute encoding.
-
#file_path ⇒ Object
readonly
Returns the value of attribute file_path.
Instance Method Summary collapse
- #import ⇒ Object
-
#initialize(file_path:, **args) ⇒ Tbx
constructor
A new instance of Tbx.
- #stats ⇒ Object
Constructor Details
#initialize(file_path:, **args) ⇒ Tbx
Returns a new instance of Tbx.
10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
# File 'lib/tbx_importer.rb', line 10

# Builds an importer for the TBX file at +file_path+ and settles on the
# encoding that will be used to read it.
#
# When no :encoding option is given, the encoding is detected from the file
# content (CharlockHolmes first, then the XML encoding declaration as a
# fallback). Content in a non-UTF-8 encoding is converted to UTF-8 and kept
# in @text.
#
# @param file_path [String] path of the TBX file
# @param args [Hash] optional settings; only :encoding is read here
#   (upcased before validation)
# @raise [RuntimeError] if the encoding cannot be determined, or is not one
#   of UTF-8, UTF-16LE, UTF-16BE
def initialize(file_path:, **args)
  @file_path = file_path
  # The raw content is only needed for detection/conversion, so it is not
  # loaded when the caller passes exactly 'UTF-8'. File.read is given the
  # path directly: going through Kernel#open leaked a file handle and would
  # treat a path starting with "|" as a command to run.
  @content = File.read(@file_path) unless args[:encoding].eql?('UTF-8')
  @encoding =
    if args[:encoding].nil?
      resolve_encoding_from_content(@content)
    else
      args[:encoding].upcase
    end
  @doc = {
    source_language: "",
    tc: { id: "", counter: 0, vals: [], lang: "", definition: "" },
    term: { lang: "", counter: 0, vals: [], part_of_speech: "" },
    language_pairs: [],
    term_entry: false
  }
  raise "Encoding type could not be determined. Please set an encoding of UTF-8, UTF-16LE, or UTF-16BE" if @encoding.nil?
  raise "Encoding type not supported. Please choose an encoding of UTF-8, UTF-16LE, or UTF-16BE" unless %w[UTF-8 UTF-16LE UTF-16BE].include?(@encoding)
  @text = CharlockHolmes::Converter.convert(@content, @encoding, 'UTF-8') unless @encoding.eql?('UTF-8')
end

# Detects the encoding of +content+: asks CharlockHolmes about the leading
# part of the content, and falls back to the declared XML encoding when the
# detector gives no answer. Returns nil when neither source yields a
# recognised encoding.
private def resolve_encoding_from_content(content)
  detected = CharlockHolmes::EncodingDetector.detect(content[0..100_000])
  # NOTE(review): guards against the detector returning nil instead of a
  # hash - confirm against the CharlockHolmes version in use.
  guess = detected && detected[:encoding]
  return guess unless guess.nil?

  case declared_xml_encoding(content)
  when 'UTF-8' then 'UTF-8'
  # A bare "UTF-16" declaration is treated as little-endian.
  when 'UTF-16' then 'UTF-16LE'
  end
end

# Returns the upcased value of the first encoding="..." (or '...') attribute
# found in +content+, or nil when there is none.
private def declared_xml_encoding(content)
  # Work on a copy (force_encoding mutates its receiver), replace invalid
  # bytes, and drop NUL bytes so a declaration stored as UTF-16 still reads
  # as plain ASCII. Non-bang methods are used on purpose: gsub! returns nil
  # when nothing was replaced, which broke the chained call for any content
  # without NUL bytes.
  text = content.dup.force_encoding('utf-8').scrub('*').delete("\0")
  # [^"']* stops at the closing quote, so attributes that follow the
  # declaration (e.g. standalone="yes") are not swallowed.
  declared = text[/encoding=["']([^"']*)["']/, 1]
  declared && declared.upcase
end
Instance Attribute Details
#encoding ⇒ Object (readonly)
Returns the value of attribute encoding.
9 10 11 |
# File 'lib/tbx_importer.rb', line 9

# Read-only accessor.
#
# @return [Object] the current value of the @encoding instance variable
def encoding
  @encoding
end
#file_path ⇒ Object (readonly)
Returns the value of attribute file_path.
9 10 11 |
# File 'lib/tbx_importer.rb', line 9

# Read-only accessor.
#
# @return [Object] the current value of the @file_path instance variable
def file_path
  @file_path
end
Instance Method Details
#import ⇒ Object
47 48 49 50 51 |
# File 'lib/tbx_importer.rb', line 47

# Reads the file and parses it, then returns what was collected.
#
# The result of read_file is handed straight to parse_file (both are defined
# elsewhere in this class; presumably parse_file fills @doc - verify there).
#
# @return [Array] two elements: @doc[:tc][:vals] followed by
#   @doc[:term][:vals]
def import
  parse_file(read_file)
  @doc.values_at(:tc, :term).map { |section| section[:vals] }
end
#stats ⇒ Object
38 39 40 41 42 43 44 45 |
# File 'lib/tbx_importer.rb', line 38

# Runs the statistics pass that matches the file's encoding, then reports
# the counters held in @doc.
#
# analyze_stats_utf_8 is used when the encoding is exactly 'UTF-8',
# analyze_stats_utf_16 otherwise (both are defined elsewhere in this class;
# presumably they update the @doc counters - verify there).
#
# @return [Hash] :tc_count and :term_count taken from the @doc counters,
#   and :language_pairs with duplicates removed
def stats
  analyzer = encoding.eql?('UTF-8') ? :analyze_stats_utf_8 : :analyze_stats_utf_16
  send(analyzer)

  {
    tc_count: @doc[:tc][:counter],
    term_count: @doc[:term][:counter],
    language_pairs: @doc[:language_pairs].uniq
  }
end