Class: Persian::Text
- Inherits:
-
Object
show all
- Includes:
- Alphabet
- Defined in:
- lib/persian/dynamic.rb,
lib/persian/text/text.rb,
lib/persian/text/keyboard.rb,
lib/persian/list/character.rb
Overview
Constant Summary
collapse
- AR_FA_CHAR =
{
KAF_ARABIC => KAF,
'دِ' => 'د',
'بِ' => 'ب',
'زِ' => 'ز',
'ذِ' => 'ذ',
'شِ' => 'ش',
'سِ' => 'س',
'ى' => 'ی',
YE_ARABIC => YE,
'ة' => 'ه',
'هٔ' => 'ه'
}.freeze
- HAREKATS =
[
AA, EE, OO, AN, EN, ON, SAKEN, TASHDID ].freeze
- BRACKETS =
[
'[',
']',
'{',
'}',
'<',
'>',
'«',
'»'
].freeze
- SIGNS =
[
'!',
'@',
'#',
'$',
'%',
'&',
'*',
'~',
'`',
'\'',
'"',
':',
';',
'.',
'?',
'<',
'>',
'/',
'-',
'+',
'-',
'_',
'^',
MAD,
NOGHTE,
VIRGOOL,
NOGHTEVIRGOOL,
DONOGHTE,
TAAJOB,
SOAL,
BEALAVE,
DARSAD,
MENHA,
MOSAVI,
TAGHSIM,
ZARBDAR,
KESH
].freeze
- END_VOWEL =
[
HE_DOCHESHM,
ALEF,
VAV
].freeze
- EN_FA_KEYBOARD_CHAR =
Exchange Standard QWERTY Keyboard layout
{
'q' => ZAD,
'w' => SAD,
'e' => THE,
'r' => QAF,
't' => FE,
'y' => GHEIN,
'u' => EIN,
'i' => HE_DOCHESHM,
'o' => KHE,
'p' => HE_JIMI,
'[' => JIM,
']' => CHE,
'\\' => '\\',
'a' => SHIN,
's' => SIN,
'd' => YE,
'f' => BE,
'g' => LAM,
'h' => ALEF,
'j' => TE,
'k' => NOON,
'l' => MIM,
';' => KAF,
'\'' => GAF,
'z' => ZA,
'x' => TA,
'c' => ZE,
'v' => RE,
'b' => ZAL,
'n' => DAL,
'm' => PE,
',' => VAV,
'.' => '.',
'/' => '/',
'Q' => 'ْ',
'W' => 'ٌ',
'E' => 'ٍ',
'R' => 'ً',
'T' => 'ُ',
'Y' => 'ِ',
'U' => 'َ',
'I' => 'ّ',
'O' => ']',
'P' => '[',
'{' => '}',
'}' => '{',
'|' => '|',
'A' => 'ؤ',
'S' => 'ئ',
'D' => 'ي',
'F' => 'إ',
'G' => 'أ',
'H' => 'آ',
'J' => 'ة',
'K' => '»',
'L' => '«',
':' => ':',
'"' => '؛',
'Z' => 'ك',
'X' => 'ٓ',
'C' => 'ژ',
'V' => 'ٰ',
'B' => "\u200C",
'N' => 'ٔ',
'M' => 'ء',
'<' => '>',
'>' => '<',
'?' => '؟',
'`' => "\u200D",
'1' => YEK,
'2' => DOW,
'3' => SE,
'4' => CHAHAR,
'5' => PANJ,
'6' => SHESH,
'7' => HAFT,
'8' => HASHT,
'9' => NOH,
'0' => SEFR,
'-' => '-',
'=' => '=',
'~' => '÷',
'!' => '!',
'@' => '٬',
'#' => '٫',
'$' => '﷼',
'%' => '٪',
'^' => '×',
'&' => '،',
'*' => '*',
'(' => ')',
')' => '(',
'_' => 'ـ',
'+' => '+'
}.freeze
Constants included
from Alphabet
Alphabet::AA, Alphabet::ALEF, Alphabet::ALEF_MAD, Alphabet::AN, Alphabet::ARBE, Alphabet::ATHNAN, Alphabet::BE, Alphabet::BEALAVE, Alphabet::CHAHAR, Alphabet::CHE, Alphabet::DAL, Alphabet::DARSAD, Alphabet::DONOGHTE, Alphabet::DOW, Alphabet::EE, Alphabet::EIGHT, Alphabet::EIN, Alphabet::EN, Alphabet::FE, Alphabet::FIVE, Alphabet::FOUR, Alphabet::GAF, Alphabet::GHEIN, Alphabet::HAFT, Alphabet::HASHT, Alphabet::HE_DOCHESHM, Alphabet::HE_JIMI, Alphabet::JIM, Alphabet::KAF, Alphabet::KAF_ARABIC, Alphabet::KESH, Alphabet::KHAMSE, Alphabet::KHE, Alphabet::LAM, Alphabet::LAYS, Alphabet::MAD, Alphabet::MENHA, Alphabet::MIM, Alphabet::MOSAVI, Alphabet::NINE, Alphabet::NOGHTE, Alphabet::NOGHTEVIRGOOL, Alphabet::NOH, Alphabet::NOON, Alphabet::ON, Alphabet::ONE, Alphabet::OO, Alphabet::PANJ, Alphabet::PE, Alphabet::QAF, Alphabet::RE, Alphabet::SABE, Alphabet::SAD, Alphabet::SAKEN, Alphabet::SE, Alphabet::SEFR, Alphabet::SETE, Alphabet::SEVEN, Alphabet::SHESH, Alphabet::SHIN, Alphabet::SIFR, Alphabet::SIN, Alphabet::SIX, Alphabet::SOAL, Alphabet::SPACE, Alphabet::TA, Alphabet::TAAJOB, Alphabet::TAGHSIM, Alphabet::TASHDID, Alphabet::TE, Alphabet::THALETH, Alphabet::THE, Alphabet::THMANY, Alphabet::THREE, Alphabet::TWO, Alphabet::VAV, Alphabet::VIRGOOL, Alphabet::WAHID, Alphabet::YE, Alphabet::YEK, Alphabet::YE_ARABIC, Alphabet::ZA, Alphabet::ZAD, Alphabet::ZAL, Alphabet::ZARBDAR, Alphabet::ZE, Alphabet::ZERO, Alphabet::ZHE, Alphabet::ZWJ, Alphabet::ZWNJ
Class Method Summary
collapse
-
.add_zwnj(text, point) ⇒ Object
-
.ast(text) ⇒ Object
Replace است with ست (dropping the whitespace before it) when the last character before the whitespace is ا.
-
.character(text) ⇒ Object
Replace Arabic characters with Persian characters.
-
.constant?(const_name) ⇒ Boolean
-
.english_to_persian_char(text) ⇒ Object
Replace English characters with the Persian characters found on the same keys of the standard Persian keyboard. Currently only the QWERTY layout is supported.
-
.fix_y_after_vowel(text) ⇒ Object
Add ‘ی’ after names that end with ه, ا, و.
-
.general_brackets(text, left = '«', right = '»') ⇒ Object
Replace general brackets with a single bracket type. Default: « (0xAB) and » (0xBB).
-
.get_constant(const_name) ⇒ Object
-
.keshide(text) ⇒ Object
Remove keshide from text.
-
.method_missing(method, *arg, &block) ⇒ Object
-
.persian_to_english_char(text) ⇒ Object
Replace standard Persian keyboard characters with the English characters found on the same keys. Currently only the QWERTY layout is supported.
-
.remove_brackets(text) ⇒ Object
-
.remove_extra_question_mark(text) ⇒ Object
-
.remove_extra_spaces(text) ⇒ Object
Remove extra spaces in text.
-
.remove_harekats(text) ⇒ Object
Remove Arabic harakat (diacritics) from text.
-
.remove_noghtevirgool_baz_start(text) ⇒ Object
-
.remove_noghtevirgool_para_end(text) ⇒ Object
-
.remove_postfix(text, postfix) ⇒ Object
Remove a specific character from the end of text. Example: remove_postfix(‘پسره’,‘ه’).
-
.remove_question_exclamation(text) ⇒ Object
-
.remove_signs(text, with = '') ⇒ Object
-
.remove_signs_after_noghtevirgool(text) ⇒ Object
-
.remove_signs_after_virgool(text) ⇒ Object
-
.remove_space_before_virgool(text) ⇒ Object
-
.remove_space_noghtevirgool(text) ⇒ Object
-
.remove_stopwords(text) ⇒ Object
-
.replace_e_y(text) ⇒ Object
Use ی instead of ئ if the next character is ی. Example: پائیز => پاییز.
-
.replace_zwnj_mi(text) ⇒ Object
Replace the space after می and نمی with a zero-width non-joiner.
-
.replace_zwnj_with_space(text) ⇒ Object
-
.respond_to_missing?(method, include_private = false) ⇒ Boolean
-
.rm_char(text, char) ⇒ Object
-
.rm_virgool_in_end(text) ⇒ Object
-
.space_after_dot(text) ⇒ Object
-
.space_after_noghtevirgool(text) ⇒ Object
-
.space_after_virgool(text) ⇒ Object
-
.squeeze(text) ⇒ Object
-
.suffix(text) ⇒ Object
-
.three_dots(text) ⇒ Object
Class Method Details
.add_zwnj(text, point) ⇒ Object
115
116
117
118
|
# File 'lib/persian/text/text.rb', line 115
# Insert a zero-width non-joiner (U+200C) after the first +point+ characters
# of each line of +text+.
#
# The joiner is written as an explicit escape sequence: an invisible literal
# is easy to lose in editors and tooling, which silently turns this method
# into a no-op.
#
# Line breaks are matched by neither alternative of the pattern, so they are
# not preserved in the result.
#
# @param text [String] text to split
# @param point [Integer] number of leading characters that precede the joiner
# @return [String] a new string; the content is unchanged when a line has at
#   most +point+ characters
def self.add_zwnj(text, point)
  # First alternative: the leading +point+ characters of a line.
  # Second alternative: whatever remains of that line.
  text.scan(/^.{#{point}}|.+/).join("\u200C")
end
|
.ast(text) ⇒ Object
Replace است with ست (dropping the whitespace before it) when the last character before the whitespace is ا
70
71
72
73
74
75
76
77
|
# File 'lib/persian/text/text.rb', line 70
# Merge a standalone "است" into the preceding word when that word ends in "ا":
# "ا" + one whitespace character + "است" becomes "است" (e.g. "زیبا است" => "زیباست").
#
# Only the standalone word is merged; longer words that merely start with
# "است" (such as "استاد") are left untouched.
#
# Mutates +text+ in place.
#
# @param text [String] text to normalize (modified in place)
# @return [String] the same +text+ object
def self.ast(text)
  # The negative lookahead rejects matches that continue with a letter or a
  # combining mark, i.e. where "است" is only the prefix of another word.
  text.gsub!(/ا\sاست(?![\p{L}\p{M}])/, 'است')
  text
end
|
.character(text) ⇒ Object
Replace Arabic characters with Persian characters.
9
10
11
12
|
# File 'lib/persian/text/text.rb', line 9
# Apply every AR_FA_CHAR substitution to +text+, one pair after another in
# the hash's own order.
#
# Mutates +text+ in place.
#
# @param text [String] text to convert (modified in place)
# @return [String] the same +text+ object
def self.character(text)
  AR_FA_CHAR.each_pair do |arabic, persian|
    text.gsub!(arabic, persian)
  end
  text
end
|
.constant?(const_name) ⇒ Boolean
30
31
32
|
# File 'lib/persian/dynamic.rb', line 30
# Tell whether +const_name+ names a constant reachable from the Persian namespace.
#
# Module#const_defined? raises NameError for strings that are not valid
# constant names (for example the empty string produced by a bare "remove_"
# call, or a name starting with a digit). A predicate should answer false in
# that case, so the error is converted here.
#
# @param const_name [String, Symbol] candidate constant name
# @return [Boolean] true when Persian.const_defined? confirms the constant,
#   false when it does not or when the name is not a valid constant name
def self.constant?(const_name)
  Persian.const_defined?(const_name)
rescue NameError
  false
end
|
.english_to_persian_char(text) ⇒ Object
Replace English characters with the Persian characters found on the same keys of the standard Persian keyboard. Currently only the QWERTY layout is supported
10
11
12
13
|
# File 'lib/persian/text/keyboard.rb', line 10
# Translate text typed on an English QWERTY layout into the characters the
# same keys produce on the Persian layout, using EN_FA_KEYBOARD_CHAR.
#
# The translation is done in a single pass. Replacing one pair at a time
# would feed earlier output into later pairs: the table maps "(" to ")" and
# ")" back to "(", so a sequential approach turns "(" into ")" and then
# straight back into "(".
#
# Mutates +text+ in place.
#
# @param text [String] text to translate (modified in place)
# @return [String] the same +text+ object
def self.english_to_persian_char(text)
  # Regexp.union escapes each key; gsub! then looks every match up in the hash.
  text.gsub!(Regexp.union(EN_FA_KEYBOARD_CHAR.keys), EN_FA_KEYBOARD_CHAR)
  text
end
|
.fix_y_after_vowel(text) ⇒ Object
Add ‘ی’ after names that end with ه, ا, و
56
57
58
59
|
# File 'lib/persian/text/text.rb', line 56
# Append 'ی' to +text+ when its last character is listed in END_VOWEL.
#
# @param text [String] word to inspect
# @return [String] a new string with 'ی' appended, or +text+ itself when the
#   last character is not in END_VOWEL (including the empty string)
def self.fix_y_after_vowel(text)
  return text unless END_VOWEL.include?(text[-1])

  text + 'ی'
end
|
.general_brackets(text, left = '«', right = '»') ⇒ Object
Replace general brackets with a single bracket type. Default: « (0xAB) and » (0xBB)
47
48
49
50
51
52
53
|
# File 'lib/persian/text/text.rb', line 47
# Rewrite every "..." , [...], {...} and (...) pair as +left+ ... +right+.
#
# The four pair kinds are processed in that order, each over the output of
# the previous one. The patterns are non-greedy, and `.` does not match a
# newline, so a pair must open and close on the same line.
#
# @param text [String] text to normalize (not modified)
# @param left [String] opening bracket to emit
# @param right [String] closing bracket to emit
# @return [String] a new string with the pairs replaced
def self.general_brackets(text, left = '«', right = '»')
  wrapped = left + '\1' + right
  pair_patterns = [/"(.*?)"/, /\[(.*?)\]/, /\{(.*?)\}/, /\((.*?)\)/]
  pair_patterns.reduce(text) { |result, pattern| result.gsub(pattern, wrapped) }
end
|
.get_constant(const_name) ⇒ Object
34
35
36
|
# File 'lib/persian/dynamic.rb', line 34
# Fetch a constant by name through Persian.const_get.
#
# @param const_name [String, Symbol] constant name; method_missing passes an
#   upcased String derived from the called method name
# @return [Object] whatever Persian.const_get returns for that name
def self.get_constant(const_name)
  Persian.const_get(const_name)
end
|
.keshide(text) ⇒ Object
80
81
82
83
|
# File 'lib/persian/text/text.rb', line 80
# Strip every run of the keshide character (ـ) from +text+.
#
# Mutates +text+ in place.
#
# @param text [String] text to clean (modified in place)
# @return [String] the same +text+ object
def self.keshide(text)
  text.tap { |str| str.gsub!(/ـ+/, '') }
end
|
.method_missing(method, *arg, &block) ⇒ Object
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
|
# File 'lib/persian/dynamic.rb', line 7
# Dynamic dispatcher for +remove_<name>+ calls.
#
# The part of the method name after "remove_" is upcased and treated as a
# constant name. When constant? accepts it, the constant's value is removed
# from the first argument via Persian.rm_char. Every other call is passed to
# +super+.
#
# @param method [Symbol] name of the missing method
# @param arg [Array] call arguments; only arg[0] (the text) is used
# @param block [Proc, nil] forwarded to +super+ only
# @return [Object] the result of Persian.rm_char for a recognized name
def self.method_missing(method, *arg, &block)
  name = method.to_s
  return super unless name =~ /^remove_\w*/

  char = name.gsub(/^remove_(\w*)/, '\1').upcase
  return super unless constant? char

  Persian.rm_char(arg[0], get_constant(char))
end
|
.persian_to_english_char(text) ⇒ Object
Replace standard Persian keyboard characters with the English characters found on the same keys. Currently only the QWERTY layout is supported
17
18
19
20
|
# File 'lib/persian/text/keyboard.rb', line 17
# Translate text typed on the Persian layout back into the characters the
# same keys produce on an English QWERTY layout (the reverse of
# EN_FA_KEYBOARD_CHAR).
#
# The translation is done in a single pass over an inverted copy of the
# table. Replacing one pair at a time chains replacements: 'چ' becomes ']'
# and a later pair then turns that ']' into 'O'.
#
# Entries whose Persian side is an empty string are skipped, because an empty
# pattern matches at every position and would inject the English key between
# all characters.
#
# Mutates +text+ in place.
#
# @param text [String] text to translate (modified in place)
# @return [String] the same +text+ object
def self.persian_to_english_char(text)
  reverse_map = EN_FA_KEYBOARD_CHAR.reject { |_english, persian| persian.empty? }.invert
  # Regexp.union escapes each key; gsub! then looks every match up in the hash.
  text.gsub!(Regexp.union(reverse_map.keys), reverse_map)
  text
end
|
.remove_brackets(text) ⇒ Object
28
29
30
31
|
# File 'lib/persian/text/text.rb', line 28
# Delete every character listed in BRACKETS from +text+.
#
# @param text [String] text to clean (not modified)
# @return [String] the text with all BRACKETS entries removed
def self.remove_brackets(text)
  BRACKETS.reduce(text) { |stripped, bracket| stripped.gsub(bracket, '') }
end
|
.remove_extra_question_mark(text) ⇒ Object
109
110
111
112
113
|
# File 'lib/persian/text/text.rb', line 109
# Collapse any run of two or more Persian question marks (؟) into one.
#
# The method name is restored from the class method summary, which lists
# remove_extra_question_mark between remove_brackets and remove_extra_spaces;
# `def self.(text)` is not a valid definition.
#
# Mutates +text+ in place.
#
# @param text [String] text to clean (modified in place)
# @return [String] the same +text+ object
def self.remove_extra_question_mark(text)
  mark = '؟'
  text.gsub!(/(#{mark}){2,}/, '\1')
  text
end
|
.remove_extra_spaces(text) ⇒ Object
Remove extra spaces in text
15
16
17
18
19
|
# File 'lib/persian/text/text.rb', line 15
# Trim +text+ and collapse every run of whitespace into a single space.
#
# The method name is restored from the class method summary, which lists
# remove_extra_spaces right after remove_extra_question_mark;
# `def self.(text)` is not a valid definition.
#
# @param text [String] text to clean (not modified)
# @return [String] a new string with single spaces between words
def self.remove_extra_spaces(text)
  text = text.split.join(' ')
  # NOTE(review): as written this splits into characters and rejoins them,
  # which leaves the content unchanged. The separator may originally have been
  # an invisible character (e.g. a zero-width non-joiner) - confirm against
  # the repository before relying on or removing this step.
  text = text.split('').join('')
  text
end
|
.remove_harekats(text) ⇒ Object
Remove Arabic harakat (diacritics) from text
22
23
24
25
|
# File 'lib/persian/text/text.rb', line 22
# Delete every mark listed in HAREKATS from +text+.
#
# @param text [String] text to clean (not modified)
# @return [String] the text with all HAREKATS entries removed
def self.remove_harekats(text)
  HAREKATS.reduce(text) { |stripped, mark| stripped.gsub(mark, '') }
end
|
.remove_noghtevirgool_baz_start(text) ⇒ Object
159
160
161
162
163
164
165
|
# File 'lib/persian/text/text.rb', line 159
# Drop a Persian semicolon (؛) that directly follows an opening (, [ or «,
# together with any spaces between the two.
#
# Mutates +text+ in place.
#
# @param text [String] text to clean (modified in place)
# @return [String] the same +text+ object
def self.remove_noghtevirgool_baz_start(text)
  semicolon = '؛'
  after_opening = /([\(\[«])[ ]*[#{semicolon}]/
  text.gsub!(after_opening, '\1') || text
end
|
.remove_noghtevirgool_para_end(text) ⇒ Object
153
154
155
156
157
|
# File 'lib/persian/text/text.rb', line 153
# Turn a Persian semicolon (؛) at the end of a line or of the text into a
# full stop, keeping the line break that follows it.
#
# Mutates +text+ in place.
#
# @param text [String] text to clean (modified in place)
# @return [String] the same +text+ object
def self.remove_noghtevirgool_para_end(text)
  semicolon = '؛'
  text.tap { |str| str.gsub!(/#{semicolon}(\n|$)/, '.\1') }
end
|
.remove_postfix(text, postfix) ⇒ Object
Remove a specific character from the end of text. Example: remove_postfix(‘پسره’,‘ه’)
209
210
211
212
|
# File 'lib/persian/text/text.rb', line 209
# Cut +postfix+ off the end of +text+ when the text ends with it
# (String#chomp! semantics).
#
# Mutates +text+ in place.
#
# @param text [String] text to trim (modified in place)
# @param postfix [String] trailing string to remove
# @return [String] the same +text+ object
def self.remove_postfix(text, postfix)
  text.tap { |str| str.chomp!(postfix) }
end
|
.remove_question_exclamation(text) ⇒ Object
120
121
122
123
124
125
|
# File 'lib/persian/text/text.rb', line 120
# Reduce a run of Persian question marks followed by a run of exclamation
# marks to exactly one of each ("؟؟!!" => "؟!").
#
# Mutates +text+ in place.
#
# @param text [String] text to clean (modified in place)
# @return [String] the same +text+ object
def self.remove_question_exclamation(text)
  q_mark = '؟'
  bang = '!'
  mixed_run = /(#{q_mark})+(#{bang})+/
  text.gsub!(mixed_run, '\1\2') || text
end
|
.remove_signs(text, with = '') ⇒ Object
34
35
36
37
38
|
# File 'lib/persian/text/text.rb', line 34
# Replace every entry of SIGNS found in +text+ with +with+.
#
# @param text [String, nil] text to clean (not modified)
# @param with [String] replacement for each sign; defaults to removing it
# @return [String] the cleaned text, or '' when +text+ is nil
def self.remove_signs(text, with = '')
  return '' if text.nil?

  SIGNS.reduce(text) { |cleaned, sign| cleaned.gsub(sign, with) }
end
|
.remove_signs_after_noghtevirgool(text) ⇒ Object
140
141
142
143
144
145
|
# File 'lib/persian/text/text.rb', line 140
# Remove punctuation marks that immediately follow a Persian semicolon (؛).
#
# Mutates +text+ in place.
#
# @param text [String] text to clean (modified in place)
# @return [String] the same +text+ object
def self.remove_signs_after_noghtevirgool(text)
  # The sign list is itself a bracket expression and is nested inside another
  # one in the pattern below; the nesting is kept exactly as it was.
  signs = '[\.،؛:!؟\-…]'
  semicolon = '؛'
  trailing_signs = /(#{semicolon})[#{signs}]+/
  text.gsub!(trailing_signs, '\1') || text
end
|
.remove_signs_after_virgool(text) ⇒ Object
174
175
176
177
178
179
|
# File 'lib/persian/text/text.rb', line 174
# Remove punctuation that follows a Persian comma (،), optionally separated
# from it by spaces. The comma and those spaces are kept.
#
# A single dot is removed, but a dot that is followed by another dot is left
# alone.
#
# Mutates +text+ in place.
#
# @param text [String] text to clean (modified in place)
# @return [String] the same +text+ object
def self.remove_signs_after_virgool(text)
  signs_after_comma = /(،)([ ]+)?([،؛:!؟\-][\.،؛:!؟\-]*|\.(?!\.))/
  text.tap { |str| str.gsub!(signs_after_comma, '\1\2') }
end
|
.remove_space_before_virgool(text) ⇒ Object
167
168
169
170
171
172
|
# File 'lib/persian/text/text.rb', line 167
# Remove whitespace that sits directly before a Persian comma (،).
#
# Mutates +text+ in place.
#
# @param text [String] text to clean (modified in place)
# @return [String] the same +text+ object
def self.remove_space_before_virgool(text)
  comma = '،'
  text.tap { |str| str.gsub!(/\s+(#{comma})/, '\1') }
end
|
.remove_space_noghtevirgool(text) ⇒ Object
134
135
136
137
138
|
# File 'lib/persian/text/text.rb', line 134
# Remove whitespace that sits directly before a Persian semicolon (؛).
#
# Mutates +text+ in place.
#
# @param text [String] text to clean (modified in place)
# @return [String] the same +text+ object
def self.remove_space_noghtevirgool(text)
  semicolon = '؛'
  leading_space = /\s+(#{semicolon})/
  text.gsub!(leading_space, '\1') || text
end
|
.remove_stopwords(text) ⇒ Object
127
128
129
130
131
132
|
# File 'lib/persian/text/text.rb', line 127
# Drop the built-in stop words from +text+.
#
# The text is cut into whitespace-separated tokens; tokens that exactly equal
# a stop word are discarded and the rest are joined with single spaces.
#
# @param text [String] text to filter (not modified)
# @return [String] the remaining words separated by single spaces
def self.remove_stopwords(text)
  stopwords = ['و', 'در', 'به', 'این', 'با', 'از', 'که', 'است', 'را']
  text.scan(/\S+/).reject { |token| stopwords.include?(token) }.join(' ')
end
|
.replace_e_y(text) ⇒ Object
Use ی instead of ئ if the next character is ی. Example: پائیز => پاییز
87
88
89
90
91
92
|
# File 'lib/persian/text/text.rb', line 87
# Replace 'ئ' with 'ی' wherever it is immediately followed by 'ی'
# (e.g. "پائیز" => "پاییز").
#
# Mutates +text+ in place.
#
# @param text [String] text to normalize (modified in place)
# @return [String] the same +text+ object
def self.replace_e_y(text)
  hamza_ye = 'ئ'
  plain_ye = 'ی'
  text.tap { |str| str.gsub!(/#{hamza_ye}(#{plain_ye})/, '\1\1') }
end
|
.replace_zwnj_mi(text) ⇒ Object
Replace the space after می and نمی with a zero-width non-joiner
62
63
64
65
66
67
|
# File 'lib/persian/text/text.rb', line 62
# Replace the single whitespace character after a standalone "می" or "نمی"
# prefix with a zero-width non-joiner (U+200C), so the prefix attaches to the
# word that follows it.
#
# The joiner is written as an explicit escape sequence. With a replacement
# that contains no joiner the whitespace is simply deleted and the prefix is
# glued onto the word.
#
# Mutates +text+ in place.
#
# @param text [String] text to normalize (modified in place)
# @return [String] the same +text+ object
def self.replace_zwnj_mi(text)
  mi = 'می'
  nmi = 'نمی'
  # \1 keeps the start-of-line / whitespace before the prefix, so only a
  # standalone prefix is touched.
  text.gsub!(/(^|\s)(#{mi}|#{nmi})\s(\S+)/, "\\1\\2\u200C\\3")
  text
end
|
.replace_zwnj_with_space(text) ⇒ Object
40
41
42
43
|
# File 'lib/persian/text/text.rb', line 40
# Replace every zero-width non-joiner (U+200C) in +text+ with a regular space.
#
# The joiner is matched through an explicit escape sequence. An empty pattern
# group matches at every position and would insert a space between all
# characters.
#
# @param text [String] text to convert (not modified)
# @return [String] a new string with each U+200C replaced by ' '
def self.replace_zwnj_with_space(text)
  text.gsub("\u200C", ' ')
end
|
.respond_to_missing?(method, include_private = false) ⇒ Boolean
26
27
28
|
# File 'lib/persian/dynamic.rb', line 26
# Report dynamic +remove_*+ methods as supported.
#
# Any method name starting with "remove_" is reported as handled; every other
# name is decided by +super+.
#
# NOTE(review): method_missing additionally requires the suffix to name a
# known constant, so this can answer true for a name that method_missing
# later rejects.
#
# @param method [Symbol, String] method name being queried
# @param include_private [Boolean] forwarded to +super+
# @return [Boolean]
def self.respond_to_missing?(method, include_private = false)
  return true if method.to_s.start_with?('remove_')

  super
end
|
.rm_char(text, char) ⇒ Object
188
189
190
191
|
# File 'lib/persian/text/text.rb', line 188
# Remove every occurrence of +char+ from +text+.
#
# A String +char+ is matched literally. Interpolating it unescaped into a
# pattern gives regexp metacharacters their special meaning: "." would match
# (and remove) every character, and "+" is not even a valid pattern. A Regexp
# is used as given.
#
# Mutates +text+ in place.
#
# @param text [String] text to clean (modified in place)
# @param char [String, Regexp] character(s) to remove
# @return [String] the same +text+ object
def self.rm_char(text, char)
  pattern = char.is_a?(Regexp) ? char : Regexp.new(Regexp.escape(char.to_s))
  text.gsub!(pattern, '')
  text
end
|
.rm_virgool_in_end(text) ⇒ Object
193
194
195
196
|
# File 'lib/persian/text/text.rb', line 193
# Replace a Persian comma (،) that ends a line, optionally followed by spaces
# or newlines, with a full stop. The trailing spaces and newlines are kept.
#
# Mutates +text+ in place.
#
# @param text [String] text to clean (modified in place)
# @return [String] the same +text+ object
def self.rm_virgool_in_end(text)
  trailing_comma = /(،)([ \n]+)?$/
  text.gsub!(trailing_comma, '.\2') || text
end
|
.space_after_dot(text) ⇒ Object
198
199
200
201
|
# File 'lib/persian/text/text.rb', line 198
# Insert a space between a dot and a non-whitespace character that directly
# follows it.
#
# Mutates +text+ in place.
#
# @param text [String] text to normalize (modified in place)
# @return [String] the same +text+ object
def self.space_after_dot(text)
  dot_then_char = /(\.)(\S)/
  text.gsub!(dot_then_char, '\1 \2') || text
end
|
.space_after_noghtevirgool(text) ⇒ Object
147
148
149
150
151
|
# File 'lib/persian/text/text.rb', line 147
# Insert a space between a Persian semicolon (؛) and a non-whitespace
# character that directly follows it.
#
# Mutates +text+ in place.
#
# @param text [String] text to normalize (modified in place)
# @return [String] the same +text+ object
def self.space_after_noghtevirgool(text)
  semicolon = '؛'
  text.tap { |str| str.gsub!(/(#{semicolon})(\S)/, '\1 \2') }
end
|
.space_after_virgool(text) ⇒ Object
181
182
183
184
185
186
|
# File 'lib/persian/text/text.rb', line 181
# Insert a space between a Persian comma (،) and a non-whitespace character
# that directly follows it.
#
# Mutates +text+ in place.
#
# @param text [String] text to normalize (modified in place)
# @return [String] the same +text+ object
def self.space_after_virgool(text)
  comma = '،'
  comma_then_char = /(#{comma})(\S)/
  text.gsub!(comma_then_char, '\1 \2') || text
end
|
.squeeze(text) ⇒ Object
203
204
205
|
# File 'lib/persian/text/text.rb', line 203
# Collapse runs of repeated characters by delegating to String#squeeze,
# called without arguments.
#
# @param text [String] text to squeeze; the non-bang method is used, so the
#   receiver itself is not modified
# @return [String] the result of text.squeeze
def self.squeeze(text)
  text.squeeze
end
|
.suffix(text) ⇒ Object
99
100
101
102
103
104
105
106
107
|
# File 'lib/persian/text/text.rb', line 99
# Attach a detached comparative or plural suffix to the preceding word.
#
# The whitespace before "تر", "تری", "ترین", "ها" or "های" is replaced with a
# zero-width non-joiner (U+200C), e.g. "کتاب ها" => "کتاب<ZWNJ>ها". Whatever
# follows the suffix is left untouched.
#
# Both suffix families sit in one group. With the alternation at the top
# level of the pattern, the plural branch matched "ها" plus its trailing
# whitespace on its own and replaced it with an empty capture, deleting the
# suffix.
#
# NOTE(review): joining with U+200C instead of gluing the suffix directly onto
# the word follows the other ZWNJ helpers in this class - confirm this matches
# the intended output.
#
# Mutates +text+ in place.
#
# @param text [String] text to normalize (modified in place)
# @return [String] the same +text+ object
def self.suffix(text)
  # The negative lookahead requires the suffix to end the word, so words that
  # merely start with these letters (e.g. "ترسناک") are not joined.
  detached_suffix = /\s+(تر(?:ین?)?|ها(?:ی)?)(?![\p{L}\p{M}])/
  text.gsub!(detached_suffix, "\u200C\\1")
  text
end
|
.three_dots(text) ⇒ Object
94
95
96
97
|
# File 'lib/persian/text/text.rb', line 94
# Replace any run of three or more dots with a single ellipsis character (…).
#
# Mutates +text+ in place.
#
# @param text [String] text to normalize (modified in place)
# @return [String] the same +text+ object
def self.three_dots(text)
  text.tap { |str| str.gsub!(/\.{3,}/, '…') }
end
|