Class: Rababa::Encoders::TextEncoder
- Inherits:
-
Object
- Object
- Rababa::Encoders::TextEncoder
- Includes:
- Harakats
- Defined in:
- lib/rababa/encoders.rb
Direct Known Subclasses
Instance Attribute Summary collapse
-
#input_id_to_symbol ⇒ Object
Returns the value of attribute input_id_to_symbol.
-
#input_pad_id ⇒ Object
Returns the value of attribute input_pad_id.
-
#start_symbol_id ⇒ Object
Returns the value of attribute start_symbol_id.
-
#target_id_to_symbol ⇒ Object
Returns the value of attribute target_id_to_symbol.
-
#utarget_id_to_symbol ⇒ Object
Returns the value of attribute utarget_id_to_symbol.
Instance Method Summary collapse
-
#clean(text) ⇒ Object
Cleans text with the cleaner selected at init (note: the cleaner should be a method instantiated at init).
-
#initialize(input_chars, target_charts, cleaner, reverse_input) ⇒ TextEncoder
constructor
A new instance of TextEncoder.
-
#input_to_sequence(text) ⇒ Object
String -> Seq of chars -> List of indices.
Methods included from Harakats
#basic_cleaners, #collapse_whitespace, #extract_haraqat, #extract_stack, #remove_diacritics, #valid_arabic_cleaners
Constructor Details
#initialize(input_chars, target_charts, cleaner, reverse_input) ⇒ TextEncoder
Returns a new instance of TextEncoder.
20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
# File 'lib/rababa/encoders.rb', line 20
#
# Builds the symbol <-> id lookup tables for the input and target vocabularies.
#
# @param input_chars [Array] input symbols; the pad symbol "P" is prepended to them
# @param target_charts [Array] target symbols; the pad symbol "P" is prepended to them
# @param cleaner [String] cleaner name that #clean compares against
#   ("basic_cleaners" or "valid_arabic_cleaners")
# @param reverse_input [Boolean] when truthy, #input_to_sequence reverses the
#   text before encoding it
def initialize(input_chars, target_charts, cleaner, reverse_input)
  # cleaner fcts
  @cleaner = cleaner

  # The pad symbol is placed first in both symbol lists.
  @pad = "P"
  @input_symbols = [@pad] + input_chars
  @target_symbols = [@pad] + target_charts

  # encoding of arabic without diacritics
  # Pairs are converted with to_h rather than Hash[*pairs.flatten]: no argument
  # splat, and no recursive flatten that could break the key/value pairing.
  @input_symbol_to_id = @input_symbols.each_with_index.to_h
  @input_id_to_symbol = @input_symbols.each_with_index.map { |sym, id| [id, sym] }.to_h

  # encoding of haraqats
  @target_symbol_to_id = @target_symbols.each_with_index.to_h
  @target_id_to_symbol = @target_symbols.each_with_index.map { |sym, id| [id, sym] }.to_h

  # id => key, numbered in the key order of UALL_POSSIBLE_HARAQAT
  @utarget_id_to_symbol =
    Rababa::ArabicConstants::UALL_POSSIBLE_HARAQAT
      .keys.each_with_index.map { |sym, id| [id, sym] }.to_h

  @reverse_input = reverse_input
  @input_pad_id = @input_symbol_to_id[@pad]
  @start_symbol_id = nil
end
Instance Attribute Details
#input_id_to_symbol ⇒ Object
Returns the value of attribute input_id_to_symbol.
16 17 18 |
# File 'lib/rababa/encoders.rb', line 16
#
# Reader for @input_id_to_symbol, the id => symbol table that #initialize
# builds over the pad symbol followed by the input chars.
def input_id_to_symbol
  @input_id_to_symbol
end
#input_pad_id ⇒ Object
Returns the value of attribute input_pad_id.
16 17 18 |
# File 'lib/rababa/encoders.rb', line 16
#
# Reader for @input_pad_id, which #initialize sets to the id looked up for
# the pad symbol in the input symbol table.
def input_pad_id
  @input_pad_id
end
#start_symbol_id ⇒ Object
Returns the value of attribute start_symbol_id.
16 17 18 |
# File 'lib/rababa/encoders.rb', line 16
#
# Reader for @start_symbol_id; #initialize leaves it as nil.
def start_symbol_id
  @start_symbol_id
end
#target_id_to_symbol ⇒ Object
Returns the value of attribute target_id_to_symbol.
16 17 18 |
# File 'lib/rababa/encoders.rb', line 16
#
# Reader for @target_id_to_symbol, the id => symbol table that #initialize
# builds over the pad symbol followed by the target symbols.
def target_id_to_symbol
  @target_id_to_symbol
end
#utarget_id_to_symbol ⇒ Object
Returns the value of attribute utarget_id_to_symbol.
16 17 18 |
# File 'lib/rababa/encoders.rb', line 16
#
# Reader for @utarget_id_to_symbol, the id => key table that #initialize
# builds from the keys of Rababa::ArabicConstants::UALL_POSSIBLE_HARAQAT.
def utarget_id_to_symbol
  @utarget_id_to_symbol
end
Instance Method Details
#clean(text) ⇒ Object
Cleans text with the cleaner selected at init (note: the cleaner should be a method instantiated at init).
51 52 53 54 55 56 57 |
# File 'lib/rababa/encoders.rb', line 51
#
# Runs +text+ through the cleaner whose name is stored in @cleaner.
#
# @param text the text handed to the selected cleaner
# @return the selected cleaner's result, or nil when @cleaner is neither
#   "basic_cleaners" nor "valid_arabic_cleaners"
def clean(text)
  case @cleaner
  when "basic_cleaners" then basic_cleaners(text)
  when "valid_arabic_cleaners" then valid_arabic_cleaners(text)
  end
end
#input_to_sequence(text) ⇒ Object
String -> Seq of chars -> List of indices
60 61 62 63 64 65 66 67 68 |
# File 'lib/rababa/encoders.rb', line 60
#
# String -> Seq of chars -> List of indices
#
# @param text [String] text to encode; it is not modified
# @return [Array] the @input_symbol_to_id value for each char, in order
#   (reversed order when @reverse_input is truthy); chars with no entry
#   in the table are dropped
def input_to_sequence(text)
  symbols = text.chars
  # text.chars is a fresh array, so reversing in place leaves the caller's string alone.
  symbols.reverse! if @reverse_input
  symbols.map { |sym| @input_symbol_to_id[sym] }.compact
end