Class: Persian::Text

Inherits:

Object

Object
Persian::Text

show all

Includes:: Alphabet

Defined in:: lib/persian/dynamic.rb,
lib/persian/text/text.rb,
lib/persian/text/keyboard.rb,
lib/persian/list/character.rb

Overview

Class text

Constant Summary collapse

# Replacement table used by .character: each key is substituted by its value.
# Keys are Arabic letter forms (some appear to be a letter followed by a
# combining mark); values are the Persian letters that replace them.
AR_FA_CHAR =

{
  KAF_ARABIC => KAF,
  'دِ' => 'د',
  'بِ' => 'ب',
  'زِ' => 'ز',
  'ذِ' => 'ذ',
  'شِ' => 'ش',
  'سِ' => 'س',
  'ى' => 'ی',
  YE_ARABIC => YE,
  'ة' => 'ه',
  'هٔ' => 'ه'
}.freeze

# Harekat (diacritic) constants that .remove_harekats strips from text.
# The constants themselves come from the included Alphabet module.
HAREKATS =

[
  AA, # Ae
  EE, # E
  OO, # O
  AN, # An
  EN, # En
  ON, # On
  SAKEN, # Saken
  TASHDID # Tashdid
].freeze

# Bracket and guillemet characters that .remove_brackets deletes.
# Note that parentheses are not part of this list.
BRACKETS =

[
  '[',
  ']',
  '{',
  '}',
  '<',
  '>',
  '«',
  '»'
].freeze

# Punctuation and symbol characters that .remove_signs replaces.
# The list mixes ASCII literals with sign constants from the Alphabet module.
# NOTE(review): '-' appears to be listed twice; that is harmless because
# .remove_signs substitutes each entry independently, but one can be dropped.
SIGNS =

[
  '!',
  '@',
  '#',
  '$',
  '%',
  '&',
  '*',
  '~',
  '`',
  '\'',
  '"',
  ':',
  ';',
  '.',
  '?',
  '<',
  '>',
  '/',
  '-',
  '+',
  '-',
  '_',
  '^',
  MAD,
  NOGHTE,
  VIRGOOL,
  NOGHTEVIRGOOL,
  DONOGHTE,
  TAAJOB,
  SOAL,
  BEALAVE,
  DARSAD,
  MENHA,
  MOSAVI,
  TAGHSIM,
  ZARBDAR,
  KESH
].freeze

# Word-final letters that .fix_y_after_vowel checks for: when the last
# character of the text is one of these, a ZWNJ + ی is appended.
END_VOWEL =

[
  HE_DOCHESHM,
  ALEF,
  VAV
].freeze

EN_FA_KEYBOARD_CHAR = Exchange Standard QWERTY Keyboard layout

{
  # Lowercase Letters
  'q' => ZAD,
  'w' => SAD,
  'e' => THE,
  'r' => QAF,
  't' => FE,
  'y' => GHEIN,
  'u' => EIN,
  'i' => HE_DOCHESHM,
  'o' => KHE,
  'p' => HE_JIMI,
  '[' => JIM,
  ']' => CHE,
  '\\' => '\\',
  'a' => SHIN,
  's' => SIN,
  'd' => YE,
  'f' => BE,
  'g' => LAM,
  'h' => ALEF,
  'j' => TE,
  'k' => NOON,
  'l' => MIM,
  ';' => KAF,
  '\'' => GAF,
  'z' => ZA,
  'x' => TA,
  'c' => ZE,
  'v' => RE,
  'b' => ZAL,
  'n' => DAL,
  'm' => PE,
  ',' => VAV,
  '.' => '.',
  '/' => '/',
  # Uppercase Letters
  'Q' => 'ْ',
  'W' => 'ٌ',
  'E' => 'ٍ',
  'R' => 'ً',
  'T' => 'ُ',
  'Y' => 'ِ',
  'U' => 'َ',
  'I' => 'ّ',
  'O' => ']',
  'P' => '[',
  '{' => '}',
  '}' => '{',
  '|' => '|',
  'A' => 'ؤ',
  'S' => 'ئ',
  'D' => 'ي',
  'F' => 'إ',
  'G' => 'أ',
  'H' => 'آ',
  'J' => 'ة',
  'K' => '»',
  'L' => '«',
  ':' => ':',
  '"' => '؛',
  'Z' => 'ك',
  'X' => 'ٓ',
  'C' => 'ژ',
  'V' => 'ٰ',
  'B' => '‌',
  'N' => 'ٔ',
  'M' => 'ء',
  '<' => '>',
  '>' => '<',
  '?' => '؟',
  # Numbers without shift key
  '`' => '‍',
  '1' => YEK,
  '2' => DOW,
  '3' => SE,
  '4' => CHAHAR,
  '5' => PANJ,
  '6' => SHESH,
  '7' => HAFT,
  '8' => HASHT,
  '9' => NOH,
  '0' => SEFR,
  '-' => '-',
  '=' => '=',
  # Numbers With Shift key
  '~' => '÷',
  '!' => '!',
  '@' => '٬',
  '#' => '٫',
  '$' => '﷼',
  '%' => '٪',
  '^' => '×',
  '&' => '،',
  '*' => '*',
  '(' => ')',
  ')' => '(',
  '_' => 'ـ',
  '+' => '+'
}.freeze

Constants included from Alphabet

Alphabet::AA, Alphabet::ALEF, Alphabet::ALEF_MAD, Alphabet::AN, Alphabet::ARBE, Alphabet::ATHNAN, Alphabet::BE, Alphabet::BEALAVE, Alphabet::CHAHAR, Alphabet::CHE, Alphabet::DAL, Alphabet::DARSAD, Alphabet::DONOGHTE, Alphabet::DOW, Alphabet::EE, Alphabet::EIGHT, Alphabet::EIN, Alphabet::EN, Alphabet::FE, Alphabet::FIVE, Alphabet::FOUR, Alphabet::GAF, Alphabet::GHEIN, Alphabet::HAFT, Alphabet::HASHT, Alphabet::HE_DOCHESHM, Alphabet::HE_JIMI, Alphabet::JIM, Alphabet::KAF, Alphabet::KAF_ARABIC, Alphabet::KESH, Alphabet::KHAMSE, Alphabet::KHE, Alphabet::LAM, Alphabet::LAYS, Alphabet::MAD, Alphabet::MENHA, Alphabet::MIM, Alphabet::MOSAVI, Alphabet::NINE, Alphabet::NOGHTE, Alphabet::NOGHTEVIRGOOL, Alphabet::NOH, Alphabet::NOON, Alphabet::ON, Alphabet::ONE, Alphabet::OO, Alphabet::PANJ, Alphabet::PE, Alphabet::QAF, Alphabet::RE, Alphabet::SABE, Alphabet::SAD, Alphabet::SAKEN, Alphabet::SE, Alphabet::SEFR, Alphabet::SETE, Alphabet::SEVEN, Alphabet::SHESH, Alphabet::SHIN, Alphabet::SIFR, Alphabet::SIN, Alphabet::SIX, Alphabet::SOAL, Alphabet::SPACE, Alphabet::TA, Alphabet::TAAJOB, Alphabet::TAGHSIM, Alphabet::TASHDID, Alphabet::TE, Alphabet::THALETH, Alphabet::THE, Alphabet::THMANY, Alphabet::THREE, Alphabet::TWO, Alphabet::VAV, Alphabet::VIRGOOL, Alphabet::WAHID, Alphabet::YE, Alphabet::YEK, Alphabet::YE_ARABIC, Alphabet::ZA, Alphabet::ZAD, Alphabet::ZAL, Alphabet::ZARBDAR, Alphabet::ZE, Alphabet::ZERO, Alphabet::ZHE, Alphabet::ZWJ, Alphabet::ZWNJ

Class Method Summary collapse

.add_zwnj(text, point) ⇒ Object
.ast(text) ⇒ Object

Contract است onto a preceding ا: «ا است» becomes «است» (e.g. زیبا است ⇒ زیباست).
.character(text) ⇒ Object

Replace Arabic characters with Persian characters.
.constant?(const_name) ⇒ Boolean
.english_to_persian_char(text) ⇒ Object

Replace English characters with the Persian characters found on the same keys of the standard Persian keyboard. For now only the QWERTY layout is supported.
.fix_y_after_vowel(text) ⇒ Object

Add ‘‌ی’ after names that end with ه, ا, و.
.general_brackets(text, left = '«', right = '»') ⇒ Object

Replace general brackets with one type brackets Default: 0xAB & 0xBB.
.get_constant(const_name) ⇒ Object
.keshide(text) ⇒ Object

Remove keshide from text.
.method_missing(method, *arg, &block) ⇒ Object
.persian_to_english_char(text) ⇒ Object

Replace standard Persian keyboard characters with the English characters found on the same keys. For now only the QWERTY layout is supported.
.remove_brackets(text) ⇒ Object

Remove all brackets.
.remove_extra_question_mark(text) ⇒ Object
.remove_extra_spaces(text) ⇒ Object

Remove extra spaces in text.
.remove_harekats(text) ⇒ Object

Remove Arabic harekats (diacritics) from text.
.remove_noghtevirgool_baz_start(text) ⇒ Object
.remove_noghtevirgool_para_end(text) ⇒ Object
.remove_postfix(text, postfix) ⇒ Object

Remove a specific suffix from the end of text. Example: remove_postfix(‘پسره’,‘ه’).
.remove_question_exclamation(text) ⇒ Object
.remove_signs(text, with = '') ⇒ Object

Remove Persian signs.
.remove_signs_after_noghtevirgool(text) ⇒ Object
.remove_signs_after_virgool(text) ⇒ Object
.remove_space_before_virgool(text) ⇒ Object
.remove_space_noghtevirgool(text) ⇒ Object
.remove_stopwords(text) ⇒ Object
.replace_e_y(text) ⇒ Object

Use ی instead of ئ if the next character is ی. Example: پائیز => پاییز.
.replace_zwnj_mi(text) ⇒ Object

Replace the space after می and نمی with a zero-width non-joiner.
.replace_zwnj_with_space(text) ⇒ Object
.respond_to_missing?(method, include_private = false) ⇒ Boolean
.rm_char(text, char) ⇒ Object
.rm_virgool_in_end(text) ⇒ Object
.space_after_dot(text) ⇒ Object
.space_after_noghtevirgool(text) ⇒ Object
.space_after_virgool(text) ⇒ Object
.squeeze(text) ⇒ Object
.suffix(text) ⇒ Object
.three_dots(text) ⇒ Object

Class Method Details

.add_zwnj(text, point) ⇒ `Object`

# File 'lib/persian/text/text.rb', line 115

# Insert a single zero-width non-joiner (U+200C) after the first +point+
# characters of +text+.
#
# The previous scan/join implementation matched per line, so it dropped
# newlines, inserted a ZWNJ on every line, and lost the first character
# when +point+ was 0. Slicing keeps the rest of the text intact.
#
# @param text [String] text to split
# @param point [Integer] number of leading characters before the ZWNJ
# @return [String] a new string with the ZWNJ inserted, or +text+ itself
#   when +point+ is not strictly inside the string
def self.add_zwnj(text, point)
  index = Integer(point)
  return text unless index.positive? && index < text.length

  "#{text[0, index]}\u200C#{text[index..-1]}"
end

.ast(text) ⇒ `Object`

Contract است onto a preceding ا: «ا است» becomes «است» (e.g. زیبا است ⇒ زیباست)

# File 'lib/persian/text/text.rb', line 70

# Contract "است" onto a preceding alef: an "ا", one whitespace character and
# "است" collapse into "است" (e.g. "زیبا است" becomes "زیباست").
#
# @param text [String] text to fix; modified in place
# @return [String] the same +text+ object
def self.ast(text)
  # \1 is the trailing alef of the previous word; "ست" is what remains of است.
  text.gsub!(/(ا)\s(است)/, '\1ست')
  text
end

.character(text) ⇒ `Object`

Replace Arabic characters with Persian characters.

# File 'lib/persian/text/text.rb', line 9

# Replace Arabic characters with their Persian counterparts, applying every
# pair of AR_FA_CHAR in turn.
#
# @param text [String] text to normalise; modified in place
# @return [String] the same +text+ object
def self.character(text)
  AR_FA_CHAR.each_pair do |arabic, persian|
    text.gsub!(arabic, persian)
  end
  text
end

.constant?(const_name) ⇒ `Boolean`



30
31
32

# File 'lib/persian/dynamic.rb', line 30

# Tell whether +const_name+ names a constant reachable from the Persian module.
#
# Module#const_defined? raises NameError when the name is not a syntactically
# valid constant name (for example "123" or ""). Such names can never be
# defined, so they are reported as +false+ instead of raising.
#
# @param const_name [String, Symbol] candidate constant name
# @return [Boolean] true when the constant is defined
def self.constant?(const_name)
  Persian.const_defined?(const_name)
rescue NameError
  false
end

.english_to_persian_char(text) ⇒ `Object`

Replace English characters with the Persian characters found on the same keys of the standard Persian keyboard. For now only the QWERTY layout is supported

# File 'lib/persian/text/keyboard.rb', line 10

# Replace English keyboard characters with the Persian characters found on
# the same keys, as described by EN_FA_KEYBOARD_CHAR.
#
# The whole text is converted in a single pass. Applying the pairs one after
# another let a later pair rewrite the output of an earlier one, so mirrored
# keys such as '(' => ')' and ')' => '(' cancelled each other out.
#
# @param text [String] text typed with the English layout; modified in place
# @return [String] the same +text+ object
def self.english_to_persian_char(text)
  # With a Hash replacement, gsub! looks up every matched key in the table.
  text.gsub!(Regexp.union(EN_FA_KEYBOARD_CHAR.keys), EN_FA_KEYBOARD_CHAR)
  text
end

.fix_y_after_vowel(text) ⇒ `Object`

Add ‘‌ی’ after names that end with ه, ا, و

# File 'lib/persian/text/text.rb', line 56

# Append a zero-width non-joiner followed by "ی" when the last character of
# +text+ is one of END_VOWEL.
#
# @param text [String] a word or name; not modified
# @return [String] +text+ itself when unchanged, otherwise a new string
def self.fix_y_after_vowel(text)
  return text unless END_VOWEL.include?(text[-1])

  # "\u200C" is the zero-width non-joiner that keeps ی visually detached.
  text + "\u200Cی"
end

.general_brackets(text, left = '«', right = '»') ⇒ `Object`

Replace general brackets with one type brackets Default: 0xAB & 0xBB

# File 'lib/persian/text/text.rb', line 47

# Rewrite text enclosed in double quotes, square brackets, braces or
# parentheses so that it is enclosed in +left+ and +right+ instead.
#
# @param text [String] source text; not modified
# @param left [String] opening mark to use (defaults to «)
# @param right [String] closing mark to use (defaults to »)
# @return [String] a new string with the enclosing marks unified
def self.general_brackets(text, left = '«', right = '»')
  wrapped = left + '\1' + right
  # Applied in this order: quotes, square brackets, braces, parentheses.
  enclosures = [/"(.*?)"/, /\[(.*?)\]/, /\{(.*?)\}/, /\((.*?)\)/]
  enclosures.reduce(text) { |result, pattern| result.gsub(pattern, wrapped) }
end

.get_constant(const_name) ⇒ `Object`



34
35
36

# File 'lib/persian/dynamic.rb', line 34

# Fetch the value of the constant named +const_name+ from the Persian module.
#
# @param const_name [String, Symbol] name of the constant to read
# @return [Object] the constant's value
# @raise [NameError] from Module#const_get when no such constant exists
def self.get_constant(const_name)
  Persian.const_get(const_name)
end

.keshide(text) ⇒ `Object`

Remove keshide from text

# File 'lib/persian/text/text.rb', line 80

# Strip every run of the keshide (tatweel, "ـ") character from +text+.
#
# @param text [String] text to clean; modified in place
# @return [String] the same +text+ object
def self.keshide(text)
  text.tap { |str| str.gsub!(/ـ+/, '') }
end

.method_missing(method, *arg, &block) ⇒ `Object`

# File 'lib/persian/dynamic.rb', line 7

# Dynamic dispatch for +remove_<name>+ calls.
#
# The part after +remove_+ is upcased and treated as a constant name. When
# that constant exists, its value is removed from the first argument through
# Persian.rm_char. Every other call falls through to the default
# missing-method handling.
#
# @param method [Symbol] name of the method that was not found
# @param arg [Array] call arguments; only the first one (the text) is used
# @return [Object] whatever Persian.rm_char returns
def self.method_missing(method, *arg, &block)
  requested = method.to_s
  return super unless requested =~ /^remove_\w*/

  # Drop the remove_ prefix and upcase the remainder to get the constant name.
  const_name = requested.gsub(/^remove_(\w*)/, '\1').upcase
  return super unless constant? const_name

  Persian.rm_char(arg[0], get_constant(const_name))
end

.persian_to_english_char(text) ⇒ `Object`

Replace standard Persian keyboard characters with the English characters found on the same keys. For now only the QWERTY layout is supported

# File 'lib/persian/text/keyboard.rb', line 17

# Replace Persian keyboard characters with the English characters found on
# the same keys: the inverse of EN_FA_KEYBOARD_CHAR.
#
# The whole text is converted in a single pass. Applying the pairs one after
# another re-translated characters that had just been produced: for example
# "چ" became "]", which a later pair then turned into "O".
#
# @param text [String] text typed with the Persian layout; modified in place
# @return [String] the same +text+ object
def self.persian_to_english_char(text)
  persian_to_english = EN_FA_KEYBOARD_CHAR.invert
  text.gsub!(Regexp.union(persian_to_english.keys), persian_to_english)
  text
end

.remove_brackets(text) ⇒ `Object`

Remove all brackets

# File 'lib/persian/text/text.rb', line 28

# Delete every character listed in BRACKETS from +text+.
#
# @param text [String] source text; not modified
# @return [String] the text without bracket characters
def self.remove_brackets(text)
  BRACKETS.reduce(text) { |cleaned, bracket| cleaned.gsub(bracket, '') }
end

.remove_extra_question_mark(text) ⇒ `Object`

# File 'lib/persian/text/text.rb', line 109

# Collapse two or more consecutive Persian question marks (؟) into one.
#
# @param text [String] text to clean; modified in place
# @return [String] the same +text+ object
def self.remove_extra_question_mark(text)
  text.tap { |str| str.gsub!(/(؟){2,}/, '\1') }
end

.remove_extra_spaces(text) ⇒ `Object`

Remove extra spaces in text

# File 'lib/persian/text/text.rb', line 15

# Normalise whitespace: leading and trailing whitespace is dropped and every
# run of whitespace becomes a single space. The result is then split and
# re-joined on the zero-width non-joiner, which removes trailing ZWNJs only.
#
# @param text [String] source text; not modified
# @return [String] a new, normalised string
def self.remove_extra_spaces(text)
  zwnj = "\u200C"
  single_spaced = text.split.join(' ')
  single_spaced.split(zwnj).join(zwnj)
end

.remove_harekats(text) ⇒ `Object`

Remove Arabic harekats (diacritics) from text

# File 'lib/persian/text/text.rb', line 22

# Delete every harekat listed in HAREKATS from +text+.
#
# @param text [String] source text; not modified
# @return [String] the text without harekats
def self.remove_harekats(text)
  HAREKATS.reduce(text) { |cleaned, harekat| cleaned.gsub(harekat, '') }
end

.remove_noghtevirgool_baz_start(text) ⇒ `Object`

# File 'lib/persian/text/text.rb', line 159

# Drop a Persian semicolon (؛) that directly follows an opening "(", "[" or
# "«", together with any spaces or zero-width non-joiners between them.
#
# @param text [String] text to clean; modified in place
# @return [String] the same +text+ object
def self.remove_noghtevirgool_baz_start(text)
  # Only the opening mark (\1) is kept.
  text.gsub!(/([\(\[«])[ \u200C]*[؛]/, '\1')
  text
end

.remove_noghtevirgool_para_end(text) ⇒ `Object`

# File 'lib/persian/text/text.rb', line 153

# Turn a Persian semicolon (؛) at the end of a line into a full stop.
#
# @param text [String] text to fix; modified in place
# @return [String] the same +text+ object
def self.remove_noghtevirgool_para_end(text)
  # \1 re-emits the newline (or nothing, at end of line) after the new ".".
  text.tap { |str| str.gsub!(/؛(\n|$)/, '.\1') }
end

.remove_postfix(text, postfix) ⇒ `Object`

Remove a specific suffix from the end of text. Example: remove_postfix(‘پسره’,‘ه’)

# File 'lib/persian/text/text.rb', line 209

# Remove +postfix+ from the end of +text+ when the text ends with it.
#
# @param text [String] word to trim; modified in place
# @param postfix [String] suffix to remove
# @return [String] the same +text+ object
def self.remove_postfix(text, postfix)
  text.tap { |str| str.chomp!(postfix) }
end

.remove_question_exclamation(text) ⇒ `Object`

# File 'lib/persian/text/text.rb', line 120

# Reduce a run of Persian question marks followed by a run of exclamation
# marks to exactly one of each ("؟!").
#
# @param text [String] text to clean; modified in place
# @return [String] the same +text+ object
def self.remove_question_exclamation(text)
  text.gsub!(/(؟)+(!)+/, '\1\2')
  text
end

.remove_signs(text, with = '') ⇒ `Object`

Remove Persian signs

# File 'lib/persian/text/text.rb', line 34

# Replace every sign listed in SIGNS with +with+ (removing it by default).
#
# @param text [String, nil] source text; not modified
# @param with [String] replacement for each sign
# @return [String] the cleaned text, or an empty string when +text+ is nil
def self.remove_signs(text, with = '')
  return '' if text.nil?

  SIGNS.inject(text) { |cleaned, sign| cleaned.gsub(sign, with) }
end

.remove_signs_after_noghtevirgool(text) ⇒ `Object`

# File 'lib/persian/text/text.rb', line 140

# Delete any run of punctuation (. ، ؛ : ! ؟ - …) that immediately follows a
# Persian semicolon (؛), keeping the semicolon itself.
#
# @param text [String] text to clean; modified in place
# @return [String] the same +text+ object
def self.remove_signs_after_noghtevirgool(text)
  text.tap { |str| str.gsub!(/(؛)[\.،؛:!؟\-…]+/, '\1') }
end

.remove_signs_after_virgool(text) ⇒ `Object`

# File 'lib/persian/text/text.rb', line 174

# Delete punctuation that follows a Persian comma (،), optionally separated
# from it by spaces or zero-width non-joiners. A single "." is removed too,
# but a "." that is followed by another "." is left alone.
#
# @param text [String] text to clean; modified in place
# @return [String] the same +text+ object
def self.remove_signs_after_virgool(text)
  # Keep the comma (\1) and the separator (\2); drop the punctuation run.
  text.gsub!(/(،)([ \u200C]+)?([،؛:!؟\-][\.،؛:!؟\-]*|\.(?!\.))/, '\1\2')
  text
end

.remove_space_before_virgool(text) ⇒ `Object`

# File 'lib/persian/text/text.rb', line 167

# Remove whitespace that precedes a Persian comma (،).
#
# @param text [String] text to clean; modified in place
# @return [String] the same +text+ object
def self.remove_space_before_virgool(text)
  text.tap { |str| str.gsub!(/\s+(،)/, '\1') }
end

.remove_space_noghtevirgool(text) ⇒ `Object`

# File 'lib/persian/text/text.rb', line 134

# Remove whitespace that precedes a Persian semicolon (؛).
#
# @param text [String] text to clean; modified in place
# @return [String] the same +text+ object
def self.remove_space_noghtevirgool(text)
  text.tap { |str| str.gsub!(/\s+(؛)/, '\1') }
end

.remove_stopwords(text) ⇒ `Object`

# File 'lib/persian/text/text.rb', line 127

# Drop a fixed list of common Persian stop words from +text+.
#
# The text is tokenised on whitespace; the surviving tokens are joined with
# single spaces, so the original spacing is not preserved.
#
# @param text [String] source text; not modified
# @return [String] the remaining words joined by single spaces
def self.remove_stopwords(text)
  stopwords = %w[و در به این با از که است را]
  text.scan(/\S+/).reject { |token| stopwords.include?(token) }.join(' ')
end

.replace_e_y(text) ⇒ `Object`

Use ی instead of ئ if the next character is ی. Example: پائیز => پاییز

# File 'lib/persian/text/text.rb', line 87

# Replace "ئ" with "ی" whenever it is directly followed by "ی"
# (e.g. "پائیز" becomes "پاییز").
#
# @param text [String] text to fix; modified in place
# @return [String] the same +text+ object
def self.replace_e_y(text)
  # \1 is the following ی; emitting it twice swaps the ئ for a second ی.
  text.tap { |str| str.gsub!(/ئ(ی)/, '\1\1') }
end

.replace_zwnj_mi(text) ⇒ `Object`

Replace the space after می and نمی with a zero-width non-joiner

# File 'lib/persian/text/text.rb', line 62

# Join the prefixes "می" and "نمی" to the following word with a zero-width
# non-joiner instead of a whitespace character.
#
# The prefix must stand at the start of a line or after whitespace.
#
# @param text [String] text to fix; modified in place
# @return [String] the same +text+ object
def self.replace_zwnj_mi(text)
  # \1 = leading boundary, \2 = prefix, \3 = the word that follows.
  text.gsub!(/(^|\s)(می|نمی)\s(\S+)/, "\\1\\2\u200C\\3")
  text
end

.replace_zwnj_with_space(text) ⇒ `Object`

# File 'lib/persian/text/text.rb', line 40

# Replace every zero-width non-joiner (U+200C) in +text+ with a space.
#
# @param text [String] source text; not modified
# @return [String] a new string with spaces in place of ZWNJs
def self.replace_zwnj_with_space(text)
  text.tr("\u200C", ' ')
end

.respond_to_missing?(method, include_private = false) ⇒ `Boolean`



26
27
28

# File 'lib/persian/dynamic.rb', line 26

# Report whether a missing method would be handled by the dynamic
# +remove_<name>+ dispatch.
#
# Answering true for every name starting with +remove_+ disagreed with
# method_missing, which raises NoMethodError unless the upcased remainder is a
# defined constant. The constant name is derived the same way as in
# method_missing, so respond_to? and the actual dispatch now agree.
#
# @param method [Symbol, String] method name being queried
# @param include_private [Boolean] forwarded to the default implementation
# @return [Boolean] true when the call would be dispatched dynamically
def self.respond_to_missing?(method, include_private = false)
  name = method.to_s
  if name.start_with?('remove_')
    const_name = name.gsub(/^remove_(\w*)/, '\1').upcase
    begin
      return true if constant?(const_name)
    rescue NameError
      # const_name is not a valid constant name, so nothing can handle it.
    end
  end
  super
end

.rm_char(text, char) ⇒ `Object`

# File 'lib/persian/text/text.rb', line 188

# Remove every occurrence of +char+ from +text+.
#
# A String +char+ is matched literally. Interpolating it raw into a regexp
# made "." delete the whole text and made "+", "*" or "?" raise RegexpError.
# A Regexp argument is still used as a pattern, as before.
#
# @param text [String] text to clean; modified in place
# @param char [String, Regexp] character(s) to remove
# @return [String] the same +text+ object
def self.rm_char(text, char)
  pattern = char.is_a?(Regexp) ? char : Regexp.new(Regexp.escape(char.to_s))
  text.gsub!(pattern, '')
  text
end

.rm_virgool_in_end(text) ⇒ `Object`

# File 'lib/persian/text/text.rb', line 193

# Turn a Persian comma (،) at the end of a line into a full stop, keeping any
# spaces, zero-width non-joiners or newlines that trail it.
#
# @param text [String] text to fix; modified in place
# @return [String] the same +text+ object
def self.rm_virgool_in_end(text)
  # \2 re-emits the trailing spacing after the new ".".
  text.tap { |str| str.gsub!(/(،)([ \u200C\n]+)?$/, '.\2') }
end

.space_after_dot(text) ⇒ `Object`

# File 'lib/persian/text/text.rb', line 198

def self.space_after_dot(text)
  text.gsub!(/(\.)(\S)/, '\1 \2')
  text
end

.space_after_noghtevirgool(text) ⇒ `Object`

# File 'lib/persian/text/text.rb', line 147

# Insert a space after a Persian semicolon (؛) that is directly followed by a
# non-whitespace character.
#
# @param text [String] text to fix; modified in place
# @return [String] the same +text+ object
def self.space_after_noghtevirgool(text)
  text.tap { |str| str.gsub!(/(؛)(\S)/, '\1 \2') }
end

.space_after_virgool(text) ⇒ `Object`

# File 'lib/persian/text/text.rb', line 181

# Insert a space after a Persian comma (،) that is directly followed by a
# non-whitespace character.
#
# @param text [String] text to fix; modified in place
# @return [String] the same +text+ object
def self.space_after_virgool(text)
  text.tap { |str| str.gsub!(/(،)(\S)/, '\1 \2') }
end

.squeeze(text) ⇒ `Object`



203
204
205

# File 'lib/persian/text/text.rb', line 203

# Collapse every run of identical adjacent characters into a single
# character by delegating to String#squeeze with no arguments.
#
# NOTE(review): this applies to all characters, so a legitimately doubled
# letter is shortened as well — confirm that this is intended by callers.
#
# @param text [String] source text; not modified
# @return [String] a new, squeezed string
def self.squeeze(text)
  text.squeeze
end

.suffix(text) ⇒ `Object`

# File 'lib/persian/text/text.rb', line 99

# Attach the detached suffixes "تر", "تری", "ترین", "ها" and "های" to the
# preceding word with a zero-width non-joiner instead of whitespace.
#
# The old pattern was +\s+(تر…)|(ها…)\s++. The alternation split it into two
# unrelated branches, and in the second branch capture group 1 is empty, so
# "ها " was replaced by a bare ZWNJ and the suffix itself was deleted. Both
# suffix families now sit in one group after the leading whitespace. The
# "ها"/"های" branch keeps its requirement of being followed by whitespace,
# as a lookahead that no longer consumes it, and also accepts end of text.
#
# @param text [String] text to fix; modified in place
# @return [String] the same +text+ object
def self.suffix(text)
  text.gsub!(/\s+(تر(?:ی(?:ن)?)?|ها(?:ی)?(?=\s|\z))/, "\u200C\\1")
  text
end

.three_dots(text) ⇒ `Object`

# File 'lib/persian/text/text.rb', line 94

# Replace every run of three or more full stops with a single ellipsis (…).
#
# @param text [String] text to fix; modified in place
# @return [String] the same +text+ object
def self.three_dots(text)
  text.tap { |str| str.gsub!(/\.{3,}/, '…') }
end

Class: Persian::Text

Overview

Constant Summary collapse

Constants included from Alphabet

Class Method Summary collapse

Class Method Details

.add_zwnj(text, point) ⇒ Object

.ast(text) ⇒ Object

.character(text) ⇒ Object

.constant?(const_name) ⇒ Boolean

.english_to_persian_char(text) ⇒ Object

.fix_y_after_vowel(text) ⇒ Object

.general_brackets(text, left = '«', right = '»') ⇒ Object

.get_constant(const_name) ⇒ Object

.keshide(text) ⇒ Object

.method_missing(method, *arg, &block) ⇒ Object

.persian_to_english_char(text) ⇒ Object

.remove_brackets(text) ⇒ Object

.remove_extra_question_mark(text) ⇒ Object

.remove_extra_spaces(text) ⇒ Object

.remove_harekats(text) ⇒ Object

.remove_noghtevirgool_baz_start(text) ⇒ Object

.remove_noghtevirgool_para_end(text) ⇒ Object

.remove_postfix(text, postfix) ⇒ Object

.remove_question_exclamation(text) ⇒ Object

.remove_signs(text, with = '') ⇒ Object

.remove_signs_after_noghtevirgool(text) ⇒ Object

.remove_signs_after_virgool(text) ⇒ Object

.remove_space_before_virgool(text) ⇒ Object

.remove_space_noghtevirgool(text) ⇒ Object

.remove_stopwords(text) ⇒ Object

.replace_e_y(text) ⇒ Object

.replace_zwnj_mi(text) ⇒ Object

.replace_zwnj_with_space(text) ⇒ Object

.respond_to_missing?(method, include_private = false) ⇒ Boolean

.rm_char(text, char) ⇒ Object

.rm_virgool_in_end(text) ⇒ Object

.space_after_dot(text) ⇒ Object

.space_after_noghtevirgool(text) ⇒ Object

.space_after_virgool(text) ⇒ Object

.squeeze(text) ⇒ Object

.suffix(text) ⇒ Object

.three_dots(text) ⇒ Object