Class: Tokenizer::Tokenizer

Inherits:

Object

Object
Tokenizer::Tokenizer

Defined in: lib/tokenizer/tokenizer.rb

Constant Summary collapse

FS =

Regexp.new('[[:blank:]]+')

SIMPLE_PRE = (Spanish marks)

[]

PAIR_PRE =

['(', '{', '[', '<']

SIMPLE_POST =

['!', '?', ',', ':', ';', '.']

PAIR_POST =

[')', '}', ']', '>']

PRE_N_POST =

['"', "'"]

PRE =

SIMPLE_PRE + PAIR_PRE

POST =

SIMPLE_POST + PAIR_POST

Instance Method Summary collapse

#initialize(lang = :de, options = {}) ⇒ Tokenizer constructor

A new instance of Tokenizer.
#tokenize(str) ⇒ Object

Constructor Details

#initialize(lang = :de, options = {}) ⇒ `Tokenizer`

Returns a new instance of Tokenizer.

# File 'lib/tokenizer/tokenizer.rb', line 20

# Creates a tokenizer and stores its configuration.
#
# @param lang [Symbol] language identifier, kept in @lang (defaults to :de)
# @param options [Hash] overrides for the default character classes; any of
#   the keys :pre, :post and :pre_n_post given here replace the defaults
def initialize(lang = :de, options = {})
  # Default character classes, assembled from the class-level constants.
  leading_marks = SIMPLE_PRE + PAIR_PRE
  trailing_marks = SIMPLE_POST + PAIR_POST
  defaults = { pre: leading_marks, post: trailing_marks, pre_n_post: PRE_N_POST }

  @lang = lang
  # Caller-supplied entries win over the defaults.
  @options = defaults.merge(options)
end

Instance Method Details

#tokenize(str) ⇒ `Object`

# File 'lib/tokenizer/tokenizer.rb', line 29

# Splits +str+ into word and punctuation tokens.
#
# The input is cut into fields on runs of blanks (FS). Within each field,
# characters listed in @options[:pre] are separated from what follows,
# characters in @options[:post] are separated from what precedes, and
# characters in @options[:pre_n_post] are treated as opening when they are
# the first character of a field and as closing otherwise.
#
# @param str [String] the text to tokenize; one trailing line break is ignored
# @return [Array<String>] the tokens in input order; [''] when the input is
#   empty or consists of blanks only. A '?', '!' or '.' that is followed by
#   more text is followed by an empty-string token in the result.
def tokenize(str)
  # Leading blanks make String#split emit an empty first field, which would
  # otherwise surface as a spurious '' first token (trailing blanks never
  # produce one), so empty fields are dropped here.
  fields = str.chomp.split(FS).reject(&:empty?)

  return [''] if fields.empty?

  sentence_enders = ['?', '!', '.']
  # Tokens are collected newline-separated; dup keeps the buffer mutable even
  # if string literals are frozen.
  output = ''.dup

  fields.each do |field|
    field.each_char.with_index do |ch, idx|
      if @options[:pre].include?(ch)
        output << "#{ch}\n"
      elsif @options[:post].include?(ch)
        output << "\n#{ch}"
        # Extra line break after sentence-final punctuation; presumably a
        # sentence-boundary marker (it becomes the '' token mid-input).
        output << "\n" if sentence_enders.include?(ch)
      elsif @options[:pre_n_post].include?(ch)
        # Quote-like marks open at the start of a field and close elsewhere.
        output << (idx.zero? ? "#{ch}\n" : "\n#{ch}")
      else
        output << ch
      end
    end

    output << "\n"
  end

  # @TODO: Rework the format of the string!
  # chomp('') strips every trailing line break; the -1 limit keeps empty
  # tokens that occur between others.
  output.chomp('').split("\n", -1)
end

Class: Tokenizer::Tokenizer

Constant Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(lang = :de, options = {}) ⇒ Tokenizer

Instance Method Details

#tokenize(str) ⇒ Object

#initialize(lang = :de, options = {}) ⇒ `Tokenizer`

#tokenize(str) ⇒ `Object`