Module: Rababa::Harakats

Included in:
Diacritizer, Encoders::TextEncoder
Defined in:
lib/rababa/harakats.rb

Instance Method Summary collapse

Instance Method Details

#basic_cleaners(text) ⇒ Object

Strips leading and trailing whitespace and collapses each run of whitespace into a single space.



93
94
95
# File 'lib/rababa/harakats.rb', line 93

# Normalizes the whitespace of +text+: runs of whitespace are first reduced by
# collapse_whitespace, then leading and trailing whitespace is stripped.
#
# @param text [String] the text to clean
# @return [String] the cleaned text
def basic_cleaners(text)
    squeezed = collapse_whitespace(text)
    squeezed.strip
end

#collapse_whitespace(text) ⇒ Object

Collapses each run of whitespace into a single space, e.g. ‘a   a  a a’ -> ‘a a a a’.



88
89
90
# File 'lib/rababa/harakats.rb', line 88

# Replaces every run of one or more whitespace characters (POSIX class
# [[:space:]]) in +text+ with a single space, e.g. "a   a\t\na" -> "a a a".
# Leading and trailing runs are reduced to one space each, not removed.
#
# @param text [String] the text to normalize
# @return [String] a new string with whitespace runs collapsed
def collapse_whitespace(text)
    whitespace_run = /[[:space:]]+/
    text.gsub(whitespace_run) { ' ' }
end

#extract_haraqat(text, correct_reversed) ⇒ Object

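Args:

text (str): text whose haraqat (diacritics) are to be extracted
correct_reversed (bool): forwarded to extract_stack; when set, a haraqat sequence is also accepted in reversed order

Returns:

text: the input text, unchanged
vec_txt: the characters of the text that are not haraqat
vec_haraqat: the haraqat sequences extracted from the text, one entry per character of vec_txt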


44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# File 'lib/rababa/harakats.rb', line 44

# Splits +text+ into its base characters and the haraqat (diacritics) found
# between them.
#
# Characters that are keys of ArabicConstants::BASIC_HARAQAT are buffered; every
# other character flushes the buffer through extract_stack and is recorded as a
# base character.
#
# @param text [String] the text to split
# @param correct_reversed [Boolean] forwarded to extract_stack for every flush
#   except the final one, which always passes +true+
# @return [Array] a three-element array: the original +text+, the list of
#   non-haraqat characters, and the list of extracted haraqat strings
def extract_haraqat(text, correct_reversed)
    # Blank input: one " " / "" placeholder per character; no lookups are made.
    if text.strip.empty?
        blank_count = text.length
        return [text, [" "] * blank_count, [""] * blank_count]
    end

    pending = []
    vec_haraqat = []
    vec_txt = []
    text.each_char do |char|
        if ArabicConstants::BASIC_HARAQAT.keys.include?(char)
            # Diacritic: hold it until the next base character shows up.
            pending << char
        else
            # Base character: flush the diacritics collected before it.
            vec_haraqat << extract_stack(pending, correct_reversed)
            vec_txt << char
            pending = []
        end
    end

    # The first flush covers whatever preceded the first base character; drop
    # it so that entry i holds the haraqat collected after base character i.
    vec_haraqat.shift unless vec_haraqat.empty?

    # Flush the diacritics that trail the last base character.
    vec_haraqat << extract_stack(pending, true)

    [text, vec_txt, vec_haraqat]
end

#extract_stack(stack, correct_reversed) ⇒ Object

Drains the stack into a string and checks whether that string is one of the entries of ALL_POSSIBLE_HARAQAT; if it is not, an error is raised. When correct_reversed is set, the string in reversed order is also accepted if the original order was not valid.



13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# File 'lib/rababa/harakats.rb', line 13

# Drains +stack+ (a list of diacritic characters) into a single string and
# validates it against ArabicConstants::ALL_POSSIBLE_HARAQAT.
#
# The characters are taken in pop order (last pushed first). If that string is
# not a known haraqah and +correct_reversed+ is truthy, the opposite order is
# tried as well.
#
# @param stack [Array<String>] buffered diacritics; emptied by this call
# @param correct_reversed [Boolean] whether the reversed order may be accepted
# @return [String] the matching haraqat string
# @raise [ValueError] when neither accepted order is a known haraqah
def extract_stack(stack, correct_reversed)
    # Same result as popping one element at a time: reversed copy, stack emptied.
    char_haraqat = stack.reverse
    stack.clear

    full_haraqah = char_haraqat.join("")
    return full_haraqah if ArabicConstants::ALL_POSSIBLE_HARAQAT.include?(full_haraqah)

    reversed_full_haraqah = char_haraqat.reverse.join("")
    # The explicit parentheses matter: `include? x && flag` is parsed as
    # `include?(x && flag)`, which never matches.
    if correct_reversed && ArabicConstants::ALL_POSSIBLE_HARAQAT.include?(reversed_full_haraqah)
        return reversed_full_haraqah
    end

    # Describe each offending diacritic individually (map over the characters,
    # not the joined String, which has no #map).
    # NOTE(review): assumes ALL_POSSIBLE_HARAQAT maps a diacritic to its name,
    # since it is indexed by diacritic here - confirm. Falls back to the raw
    # character when no name is found.
    val = char_haraqat.map { |diac| ArabicConstants::ALL_POSSIBLE_HARAQAT[diac] || diac }.join('|')

    # NOTE(review): ValueError is not a Ruby core class; presumably defined
    # elsewhere in the project - confirm.
    raise ValueError, "The chart has the following haraqat which are not found in all possible haraqat: #{val}"
end

#remove_diacritics(text) ⇒ Object

Args:

text (str): text whose diacritics are to be removed

Returns:

text: the text with its basic haraqat (diacritics) removed
#? text_list: all text that are not haraqat
#? haraqat_list: all haraqat_list


79
80
81
82
83
84
85
# File 'lib/rababa/harakats.rb', line 79

# Returns a copy of +text+ with every occurrence of each key of
# Rababa::ArabicConstants::UBASIC_HARAQAT removed. +text+ itself is not
# modified.
#
# @param text [String] the text to strip of diacritics
# @return [String] the text without those diacritics
def remove_diacritics(text)
    # String#gsub returns a new string rather than editing in place, so the
    # result of each pass has to be carried into the next one.
    Rababa::ArabicConstants::UBASIC_HARAQAT.keys.reduce(text) do |stripped, diacritic|
        stripped.gsub(diacritic, "")
    end
end

#valid_arabic_cleaners(text) ⇒ Object

Keeps only the valid Arabic characters (those in VALID_ARABIC), then collapses whitespace and strips the result.



98
99
100
101
102
# File 'lib/rababa/harakats.rb', line 98

# Drops every character of +text+ that is not in
# Rababa::ArabicConstants::VALID_ARABIC, then collapses whitespace runs via
# collapse_whitespace and strips leading and trailing whitespace.
#
# @param text [String] the text to filter and clean
# @return [String] the filtered, whitespace-normalized text
def valid_arabic_cleaners(text)
    kept = text.each_char.select do |c|
        Rababa::ArabicConstants::VALID_ARABIC.include?(c)
    end

    collapse_whitespace(kept.join('')).strip
end