Class: LLT::Tokenizer

Inherits:
Object
  • Object
show all
Includes:
Constants::Abbreviations, Core::Serviceable, Helpers::Metrical
Defined in:
lib/llt/tokenizer.rb,
lib/llt/tokenizer/worker.rb,
lib/llt/tokenizer/version.rb

Defined Under Namespace

Classes: Worker

Constant Summary collapse

PUNCTUATION =
/([\.\?,!;\-:"'”\(\)\[\]†]|<\/?.+?>)\1*/
ABBREVIATIONS =

Covers abbreviated Roman praenomina like Ti. in Ti. Claudius Nero; also covers Roman date expressions like a. d. V. Kal. Apr.

/^(#{ALL_ABBRS_PIPED})$/
WORDS_ENDING_WITH_QUE =

neque taken out!

/^([qc]u[ei].*que|qu[ao]que|itaque|atque|ut[er].*que|utcumque|plerumque|denique|undique)$/i
WORDS_ENDING_WITH_NE =
/^(omne|sine|bene|paene)$/i
WORDS_ENDING_WITH_VE =
/^(sive|neve)$/i
ENCLITICS =

laetusque to -que laetus; in eoque to -que in eo; honestumne to -ne honestum

but

uterque, institutione, sive et al. remain

%w{ que ne ve c }
MERGE_WORDS =
[ %w{ quam diu }, ['non', /null.{1,4}$/] ]
ABBR_NAME_WITH_DOT =
/^(#{NAMES_PIPED})\.$/
ROMAN_DATE_EXPR_WITH_DOT =
/^(#{DATES_PIPED})\.$/
PUNCT_ITSELF =
Regexp.new(PUNCTUATION.source + '$')
XML_TAG =
/<\/?.+?>/
VERSION =
"0.0.2"

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Instance Attribute Details

#default_optionsObject (readonly)

Returns the value of attribute default_options.



19
20
21
# File 'lib/llt/tokenizer.rb', line 19

# Reader for the instance's default options hash.
# NOTE(review): presumably populated from the class-level defaults
# (see .default_options) -- confirm in the full source.
def default_options
  @default_options
end

Class Method Details

.default_optionsObject



21
22
23
24
25
26
27
28
# File 'lib/llt/tokenizer.rb', line 21

# Baseline configuration used when the caller passes no overrides.
#
#   shifting:         move split enclitics in front of their word
#   enclitics_marker: string prepended to split-off enclitics
#   merging:          join word pairs like "quam diu" -> "quamdiu"
#   indexing:         assign incrementing ids to created tokens
def self.default_options
  defaults = {}
  defaults[:shifting]         = true
  defaults[:enclitics_marker] = '-'
  defaults[:merging]          = true
  defaults[:indexing]         = true
  defaults
end

Instance Method Details

#create_tokensObject



330
331
332
333
334
335
336
337
338
339
340
341
342
# File 'lib/llt/tokenizer.rb', line 330

# Maps every element of the worker to a concrete Token object.
# XML tags receive no id; everything else gets the next id
# (when indexing is enabled).
def create_tokens
  # #to_a on the worker retrieves (and aligns) optional metrical data.
  reset_id
  @worker.to_a.map! do |el|
    if el =~ XML_TAG
      Token::XmlTag.new(el)
    elsif el =~ ABBR_NAME_WITH_DOT || el =~ ROMAN_DATE_EXPR_WITH_DOT
      raise_id and Token::Filler.new(el, @id)
    elsif el =~ PUNCT_ITSELF
      raise_id and Token::Punctuation.new(el, @id)
    else
      raise_id and Token::Word.new(el, @id)
    end
  end
end

#enclitic(val) ⇒ Object



173
174
175
# File 'lib/llt/tokenizer.rb', line 173

# Prefixes a split-off enclitic with the configured marker,
# e.g. "que" -> "-que" with the default marker.
def enclitic(val)
  [@enclitics_marker, val].join
end

#find_abbreviations_and_join_stringsObject

%w{ Atque M . Cicero mittit } to %w{ Atque M. Cicero mittit }



114
115
116
117
118
119
120
121
122
123
124
125
# File 'lib/llt/tokenizer.rb', line 114

# Rejoins an abbreviation with its trailing dot, e.g.
# %w{ Atque M . Cicero mittit } becomes %w{ Atque M. Cicero mittit }.
def find_abbreviations_and_join_strings
  doomed = []
  @worker.each_with_index do |element, i|
    nxt = @worker[i + 1]
    next unless nxt == "." && element =~ ABBREVIATIONS
    # Mutates the dot in place so it carries the abbreviation...
    @worker[i + 1] = nxt.prepend(element)
    # ...and queue the now-redundant element for deletion. The index
    # is shifted by the number of pending deletions so it stays valid.
    doomed << (i - doomed.size)
  end

  doomed.each { |i| @worker.delete_at(i) }
end

#is_a_mergable_pair?(x, y) ⇒ Boolean

Returns:

  • (Boolean)


312
313
314
315
316
# File 'lib/llt/tokenizer.rb', line 312

# True when the two neighbouring words form one of the known
# mergable pairs in MERGE_WORDS (matched via #===, so entries may
# be plain strings or regexps).
def is_a_mergable_pair?(x, y)
  # x (e.g. "Quam" in "Quam diu") is downcased, as it could stand
  # in a sentence's first position.
  head = x.downcase
  MERGE_WORDS.any? { |first, second| first === head && second === y }
end

#is_que?(element) ⇒ Boolean

Returns:

  • (Boolean)


215
216
217
# File 'lib/llt/tokenizer.rb', line 215

# True when the element is exactly the marked -que enclitic.
def is_que?(element)
  enclitic('que') == element
end

#led_by_preposition?(index) ⇒ Boolean

Returns:

  • (Boolean)


219
220
221
# File 'lib/llt/tokenizer.rb', line 219

# True when the worker element in front of the given index is a
# (common) preposition.
def led_by_preposition?(index)
  preceding = @worker[index - 1]
  preceding =~ /^(in|ad|ob)$/i # and others
end

#lookup(string, type, column, inflection_class = 3) ⇒ Object



279
280
281
282
283
284
285
286
# File 'lib/llt/tokenizer.rb', line 279

# Queries the stem database for the given string.
#
#   string           - the stem to look up
#   type             - e.g. :noun, :adjective, :verb, :persona
#   column           - the stem column to search, e.g. :nom, :stem, :pr
#   inflection_class - one class or an array of classes (default 3)
def lookup(string, type, column, inflection_class = 3)
  # Personal names keep their capitalization, everything else is
  # queried in lower case.
  stem = type == :persona ? string : string.downcase
  query = {
    type: type,
    stem_type: column,
    stem: stem,
    restrictions: {
      type: :inflection_class,
      values: Array(inflection_class)
    }
  }
  @db.look_up_stem(query)
end

#make_frequent_correctionsObject



189
190
191
192
193
194
195
196
# File 'lib/llt/tokenizer.rb', line 189

# Runs the enclitic correction passes (they rely on db lookups).
# TODO 27.11.13 14:15 by LFDM
# Implement caching here.
def make_frequent_corrections
  %w{ ne que ve }.each { |encl| send("#{encl}_corrections") }
end

#merge_what_needs_mergingObject

quam diu to quamdiu



304
305
306
307
308
309
310
# File 'lib/llt/tokenizer.rb', line 304

# Merges word pairs like "quam diu" into "quamdiu".
def merge_what_needs_merging
  doomed = []
  @worker.each_overlapping_pair.each_with_index do |pair, index|
    next unless is_a_mergable_pair?(*pair)
    merge_words(pair, index, doomed)
  end
  doomed.each { |index| @worker.delete_at(index) }
end

#merge_words(pair, i, to_delete) ⇒ Object



318
319
320
321
# File 'lib/llt/tokenizer.rb', line 318

# Appends the second word of the pair onto the first (in place, so
# the worker sees the merged form) and queues the second word's
# index for deletion, compensating for deletions already queued.
def merge_words(pair, i, to_delete)
  first, second = pair
  first << second
  to_delete << (i + 1 - to_delete.size)
end

#ne_correctionsObject



223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
# File 'lib/llt/tokenizer.rb', line 223

# Reverses -ne splits that were too aggressive: when the word left
# over after splitting looks like a third-declension form whose
# ablative legitimately ends in -ne (e.g. ratione, Platone), the
# split is undone via #reverse_splittings.
def ne_corrections
  corrections = []
  @worker.each_with_index do |w, i|
    if w == enclitic('ne')
      # The word the -ne was split from (position depends on @shifting).
      orig_el = original_word(i)

      entries = []
      entries += lookup(orig_el, :noun, :nom)           if orig_el =~ /io$/   # actio-ne ratio-ne
      entries += lookup(orig_el + "n", :persona, :stem) if orig_el =~ /o$/    # Plato-ne Cicero-ne Solo-ne
      entries += lookup(orig_el + "n", :noun, :stem)    if orig_el =~ /d?i$/  # fortitudi-ne ratio-ne libidi-ne homi-ne
      # NOTE(review): /d?i$/ above already matches every word /mi$/
      # matches, so the next lookup duplicates the previous one --
      # confirm whether this is intended.
      entries += lookup(orig_el + "n", :noun, :stem)    if orig_el =~ /mi$/   # flumi-ne agmi-ne
      entries += lookup(orig_el + "n", :adjective, :stem)                     # communis commune

      if entries.any?(&:third_decl_with_possible_ne_abl?)
        # Offset by the corrections already queued: each reversal
        # deletes one element from the worker.
        corrections << i - corrections.size
      end
    end
  end

  reverse_splittings(corrections)
end

#original_word(i) ⇒ Object



268
269
270
271
272
273
274
275
276
277
# File 'lib/llt/tokenizer.rb', line 268

# Returns the word a split-off enclitic at index i belongs to.
def original_word(i)
  # Two scenarios are possible at this point.
  # With shifting enabled the enclitic precedes its word:
  #         i  i + 1
  #   arma que virum
  # With shifting disabled it follows it:
  #        i - 1  i
  #   arma virum que
  offset = @shifting ? 1 : -1
  @worker[i + offset]
end

#preliminaryObject



358
359
360
# File 'lib/llt/tokenizer.rb', line 358

# Exposes the current state of the worker as a plain array,
# useful for inspecting intermediate tokenization results.
def preliminary
  @worker.to_a
end

#put_xml_attributes_back_together(elements) ⇒ Object



84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# File 'lib/llt/tokenizer.rb', line 84

# Rejoins XML tags that the whitespace split tore apart.
#
# elements could be like this
# ['<tag', 'attr1="val"', 'attr1="val>']
# and we want the complete xml tag back together
#
# Mutates +elements+ in place. Relies on ArrayScanner's cursor API
# (#look_behind, #current, #pos, #forward, #eoa?) -- semantics
# assumed from usage here, confirm against its implementation.
def put_xml_attributes_back_together(elements)
  # elements could be like this
  # ['<tag', 'attr1="val"', 'attr1="val>']
  # and we want the complete xml tag back together
  as = ArrayScanner.new(elements)
  loop do
    last = as.look_behind
    # NOTE: `&!` is the non-short-circuit `&` plus `!`, i.e.
    # "starts with '<' AND does not end with '>'" (an open tag).
    if last && last.start_with?('<') &! last.end_with?('>')
      if as.current.match(/\w+=".*"$|>/)
        # Fold the current fragment back into the open tag.
        last << ' ' << as.current
        elements.delete_at(as.pos)
        # we don't need to forward, as we delete an element anyway
        next
      end
    else
      as.forward(1)
    end
    break if as.eoa?
  end
end

#que_correctionsObject



198
199
200
201
202
203
204
205
206
# File 'lib/llt/tokenizer.rb', line 198

# Used only in rare cases like "in eoque", where the split -que has
# to be shifted in front of the preposition: -que in eo.
# Does nothing when shifting is disabled.
def que_corrections
  return unless @shifting
  to_be_shifted_que_indices.each do |index|
    @worker.insert(index - 1, @worker.delete_at(index))
  end
end

#raise_idObject



348
349
350
351
352
353
354
355
356
# File 'lib/llt/tokenizer.rb', line 348

# Increments the running token id when indexing is enabled.
def raise_id
  # Without indexing this still has to return a truthy value, as
  # callers use it as the first operand of an `and` construction.
  return true unless @indexing
  @id += 1
end

#reset_idObject



344
345
346
# File 'lib/llt/tokenizer.rb', line 344

# Resets the token id counter: 0 when indexing is enabled, nil
# (ids disabled) otherwise.
# Fix: the original wrote `@id = (@indexing ? @id = 0 : nil)`,
# assigning @id twice -- the inner assignment was redundant.
def reset_id
  @id = @indexing ? 0 : nil
end

#reverse_splittings(indices) ⇒ Object



288
289
290
291
292
293
294
295
296
# File 'lib/llt/tokenizer.rb', line 288

# Undoes enclitic splits at the given worker indices: removes the
# marked enclitic element and glues its text (without the marker)
# back onto the original word.
def reverse_splittings(indices)
  indices.each do |i|
    # The original word has to be retrieved BEFORE the enclitic is
    # deleted, as #delete_at shifts the worker's indices.
    original = original_word(i)
    enclitic_part = @worker.delete_at(i).delete(@enclitics_marker)
    original << enclitic_part
  end
end

#setup(text, options = {}, worker = []) ⇒ Object



45
46
47
48
49
50
51
52
53
54
# File 'lib/llt/tokenizer.rb', line 45

# Prepares the tokenizer for a run: stores the text, detects
# metrical markup, resolves all options and builds the worker.
def setup(text, options = {}, worker = [])
  @text = text
  evaluate_metrical_presence(@text)
  # Resolve every supported option against the defaults.
  %i{ enclitics_marker merging shifting indexing }.each do |opt|
    instance_variable_set("@#{opt}", parse_option(opt, options))
  end
  @worker = setup_worker(worker)
  @shift_range = shift_range(@shifting)
end

#setup_worker(worker) ⇒ Object

This is here for two reasons:

1) easier test setup, when a preliminary result shall be further evaluated

2) more importantly adding a level of indirection, when
   the given text holds metrical information. It adds a
   substitute implementation for the worker array, but only
   if it's needed - which should perform better, when there
   are no metrics involved (the default case)


66
67
68
69
70
71
72
73
74
75
76
77
78
# File 'lib/llt/tokenizer.rb', line 66

# This is here for two reasons:
#
# 1) easier test setup, when a preliminary result shall be further
#    evaluated
#
# 2) more importantly it adds a level of indirection when the given
#    text holds metrical information: a substitute implementation
#    for the worker array is used, but only when it's needed --
#    which performs better in the default case without metrics.
def setup_worker(worker)
  return worker if worker.any?

  elements = @text.gsub(PUNCTUATION, ' \0 ').split
  put_xml_attributes_back_together(elements)
  metrical? ? Worker.new(elements, @enclitics_marker) : elements
end

#shift_range(shifting_enabled) ⇒ Object



80
81
82
# File 'lib/llt/tokenizer.rb', line 80

# Index offset used when inserting split enclitics: 0 when they
# are shifted in front of their word, 1 when they stay behind it.
def shift_range(shifting_enabled)
  return 0 if shifting_enabled
  1
end

#split_enklitika_and_change_their_positionObject



142
143
144
145
146
# File 'lib/llt/tokenizer.rb', line 142

# Splits enclitics off and repositions them: brute-force splitting
# first, then the correction passes undo what went too far.
def split_enklitika_and_change_their_position
  %i{ split_with_force split_nec make_frequent_corrections }.each do |step|
    send(step)
  end
end

#split_enklitikon(encl, restrictors) ⇒ Object



158
159
160
161
162
163
164
165
166
167
168
169
170
171
# File 'lib/llt/tokenizer.rb', line 158

# Splits the given enclitic off every worker token it is attached
# to, unless the token matches the restrictor regexp (a known word
# that merely ends in the enclitic's letters).
def split_enklitikon(encl, restrictors)
  # A word character is needed in front -- "ne" itself should not
  # be split.
  regexp = /(?<=\w)#{encl}$/

  insertion_points = []
  @worker.each_with_index do |token, i|
    next unless token.match(regexp) && restrictors !~ token
    token.slice!(regexp)
    # Every insertion shifts later indices by one, hence the offset;
    # @shift_range places the enclitic before or after its word.
    insertion_points << (i + insertion_points.size + @shift_range)
  end

  insertion_points.each { |i| @worker.insert(i, enclitic(encl)) }
end

#split_necObject



177
178
179
180
181
182
183
184
185
186
187
# File 'lib/llt/tokenizer.rb', line 177

# Splits "nec" into "ne" plus the enclitic -c (short for -que).
def split_nec
  insertion_points = []
  @worker.each_with_index do |token, i|
    next unless token == 'nec'
    # Drop the trailing 'c' in place, leaving 'ne'.
    token.slice!(-1)
    insertion_points << (i + insertion_points.size + @shift_range)
  end

  insertion_points.each { |i| @worker.insert(i, enclitic('c')) }
end

#split_with_forceObject



148
149
150
151
152
153
154
155
156
# File 'lib/llt/tokenizer.rb', line 148

# Splits all regular enclitics by brute force; the restrictor
# regexps handle only the obvious false positives.
def split_with_force
  # 'c' (from nec) is deliberately excluded here for now.
  ENCLITICS[0..-2].each do |encl|
    restrictors = self.class.const_get("WORDS_ENDING_WITH_#{encl.upcase}")
    split_enklitikon(encl, restrictors)
  end
end

#to_be_shifted_que_indicesObject



208
209
210
211
212
213
# File 'lib/llt/tokenizer.rb', line 208

# Indices of split -que elements that follow a preposition and
# therefore need shifting (e.g. "in eoque").
# Double shifts would properly fail, but they might never happen.
def to_be_shifted_que_indices
  indices = []
  @worker.each_with_index do |element, index|
    indices << index if is_que?(element) && led_by_preposition?(index)
  end
  indices
end

#tokenize(text, add_to: nil, **options) ⇒ Object

Raises:

  • (ArgumentError)


30
31
32
33
34
35
36
37
38
39
40
41
42
43
# File 'lib/llt/tokenizer.rb', line 30

# Tokenizes a piece of Latin text into Token objects.
#
#   text    - the String to tokenize
#   add_to  - optional container responding to #<< that the result
#             is appended to
#   options - see .default_options (shifting, enclitics_marker,
#             merging, indexing)
#
# Returns an Array of tokens ([] for empty input).
# Raises ArgumentError when text is not a String.
# Fix: use the idiomatic `raise ArgumentError, msg` form instead of
# `raise ArgumentError.new(msg)`.
def tokenize(text, add_to: nil, **options)
  raise ArgumentError, "The argument passed must be a String" unless text.is_a?(String)
  return [] if text.empty?

  setup(text, options)

  find_abbreviations_and_join_strings
  split_enklitika_and_change_their_position
  merge_what_needs_merging if @merging # quam diu => quamdiu
  tokens = create_tokens

  add_to << tokens if add_to.respond_to?(:<<)
  tokens
end

#ve_correctionsObject



245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
# File 'lib/llt/tokenizer.rb', line 245

# Reverses -ve splits that were too aggressive: when the database
# knows a stem whose form legitimately ends in -ve, the split is
# undone via #reverse_splittings.
def ve_corrections
  corrections = []
  @worker.each_with_index do |w, i|
    if w == enclitic('ve')
      # The word the -ve was split from (position depends on @shifting).
      orig_el = original_word(i)

      # Probe several stem types / inflection classes for a match.
      entries = []
      entries += lookup(orig_el + 'v',  :adjective, :stem, 1)
      entries += lookup(orig_el + 'v',  :adjective, :stem, 3)
      entries += lookup(orig_el + 'v',  :noun,      :stem, [2, 5])
      entries += lookup(orig_el + 've', :verb,      :pr,   2)
      entries += lookup(orig_el + 'v',  :verb,      :pr,   [3, 5]) # not sure if such a word of 5 exists


      if entries.any?
        # Offset by the corrections already queued: each reversal
        # deletes one element from the worker.
        corrections << i - corrections.size
      end
    end
  end

  reverse_splittings(corrections)
end