Class: Persian::Tokenizer

Inherits:
Object
  • Object
show all
Defined in:
lib/persian/tokenizer.rb

Overview

Persian tokenizer class.

Class Method Summary collapse

Class Method Details

.split_paragraphs(text) ⇒ Object

Splits the text into paragraphs on newlines. Returns an array of the non-empty paragraphs.



51
52
53
54
# File 'lib/persian/tokenizer.rb', line 51

# Break +text+ into paragraphs, one per newline-delimited line.
#
# Lines that are exactly empty are dropped; lines containing only
# whitespace (or a stray "\r") are kept as-is.
#
# @param text [String] the text to split
# @return [Array<String>] the non-empty paragraphs, in order
def self.split_paragraphs(text)
  lines = text.split("\n")
  lines.reject { |line| line.empty? }
end

.tokenize(text) ⇒ Object

Basic Persian word tokenizer. Returns an array of word and punctuation tokens.



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# File 'lib/persian/tokenizer.rb', line 9

# Basic Persian word tokenizer.
#
# The text is first cut on ASCII whitespace; every resulting chunk is then
# divided into alternating runs of word characters and delimiter characters
# (punctuation, brackets and quotes listed below). Consecutive delimiters
# stay together as a single token, e.g. "(word)." yields "(", "word", ").".
#
# Every word run in a chunk is emitted as its own token, so "a.b" yields
# "a", ".", "b" rather than leaving "b" attached to the punctuation.
#
# @param text [String] the text to tokenize
# @return [Array<String>] the tokens in order of appearance; [''] when the
#   text is empty or contains only whitespace
def self.tokenize(text)
  # NOTE(review): the second entry is an empty string, which adds nothing to
  # the pattern -- possibly a character that was lost upstream; confirm.
  symbols = ['!', '', ':', '؛', '؟', '،', '-', '.']
  pair_pre = ['(', '{', '«', '<', '[']
  pair_post = [')', '}', '»', '>', ']']
  prepost = ["'", '"']

  # Split text with space characters
  chunks = text.split(/\s/)

  return [''] if chunks.empty?

  delimiters = Regexp.escape((symbols + pair_pre + pair_post + prepost).join)

  # A token is either a run of non-delimiters (a word) or a run of delimiters.
  # Each character belongs to exactly one of the two classes, so scanning
  # covers the whole chunk; empty chunks (from repeated whitespace) yield
  # no tokens.
  pattern = /[^#{delimiters}]+|[#{delimiters}]+/

  chunks.flat_map { |chunk| chunk.scan(pattern) }
end

.tokenize_more(text, num) ⇒ Object



35
36
37
38
39
40
41
42
43
44
45
46
47
# File 'lib/persian/tokenizer.rb', line 35

# Build word n-grams from +text+.
#
# The text is tokenized with +tokenize+ and every window of +num+
# consecutive tokens is joined with a single space.
#
# @param text [String] the text to tokenize
# @param num [Integer] number of tokens per group
# @return [Array<String>] the joined token groups in order; empty when
#   +num+ is not positive or exceeds the number of tokens
def self.tokenize_more(text, num)
  # A window of zero or fewer tokens is meaningless (and each_cons rejects it).
  return [] if num < 1

  tokenize(text).each_cons(num).map { |group| group.join(' ').strip }
end