Class: LLT::Tokenizer

Inherits:
Object
  • Object
show all
Includes:
Constants::Abbreviations, Core::Serviceable, Helpers::Metrical
Defined in:
lib/llt/tokenizer.rb,
lib/llt/tokenizer/worker.rb,
lib/llt/tokenizer/version.rb

Defined Under Namespace

Classes: Worker

Constant Summary collapse

PUNCTUATION =
/([\.\?,!;\-:"'”\(\)\[\]†]|<\/?.+?>)\1*/
ABBREVIATIONS =

Covers abbreviated Roman praenomina like Ti. in Ti. Claudius Nero; also covers Roman date expressions like a. d. V. Kal. Apr.

/^(#{ALL_ABBRS_PIPED})$/
WORDS_ENDING_WITH_QUE =

neque taken out!

/^([qc]u[ei].*que|qu[ao]que|itaque|atque|ut[er].*que|utcumque|plerumque|denique|undique)$/i
WORDS_ENDING_WITH_NE =
/^(omne|sine|bene|paene)$/i
WORDS_ENDING_WITH_VE =
/^(sive|neve)$/i
ENCLITICS =

laetusque to -que laetus; in eoque to -que in eo; honestumne to -ne honestum

but

uterque, institutione, sive et al. remain

%w{ que ne ve c }
MERGE_WORDS =
[ %w{ quam diu }, ['non', /null.{1,4}$/] ]
ABBR_NAME_WITH_DOT =
/^(#{NAMES_PIPED})\.$/
ROMAN_DATE_EXPR_WITH_DOT =
/^(#{DATES_PIPED})\.$/
PUNCT_ITSELF =
Regexp.new(PUNCTUATION.source + '$')
XML_TAG =
/<\/?.+?>/
VERSION =
"0.0.2"

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Instance Attribute Details

#default_optionsObject (readonly)

Returns the value of attribute default_options.



19
20
21
# File 'lib/llt/tokenizer.rb', line 19

# Reader for the instance's default options hash.
# NOTE(review): presumably populated from the class-level defaults
# (see .default_options) -- confirm in the full source.
def default_options
  @default_options
end

Class Method Details

.default_optionsObject



21
22
23
24
25
26
27
28
# File 'lib/llt/tokenizer.rb', line 21

# Baseline configuration used when the caller passes no overrides.
#
#   shifting:         move split enclitics in front of their word
#   enclitics_marker: string prepended to split-off enclitics
#   merging:          join word pairs like "quam diu" -> "quamdiu"
#   indexing:         assign incrementing ids to created tokens
def self.default_options
  defaults = {}
  defaults[:shifting]         = true
  defaults[:enclitics_marker] = '-'
  defaults[:merging]          = true
  defaults[:indexing]         = true
  defaults
end

Instance Method Details

#create_tokensObject



330
331
332
333
334
335
336
337
338
339
340
341
342
# File 'lib/llt/tokenizer.rb', line 330

# Maps every element of the worker to a concrete Token object.
# XML tags receive no id; everything else gets the next id
# (when indexing is enabled).
def create_tokens
  # #to_a on the worker retrieves (and aligns) optional metrical data.
  reset_id
  @worker.to_a.map! do |el|
    if el =~ XML_TAG
      Token::XmlTag.new(el)
    elsif el =~ ABBR_NAME_WITH_DOT || el =~ ROMAN_DATE_EXPR_WITH_DOT
      raise_id and Token::Filler.new(el, @id)
    elsif el =~ PUNCT_ITSELF
      raise_id and Token::Punctuation.new(el, @id)
    else
      raise_id and Token::Word.new(el, @id)
    end
  end
end

#enclitic(val) ⇒ Object



173
174
175
# File 'lib/llt/tokenizer.rb', line 173

# Prefixes a split-off enclitic with the configured marker,
# e.g. "que" -> "-que" with the default marker.
def enclitic(val)
  [@enclitics_marker, val].join
end

#find_abbreviations_and_join_stringsObject

%w{ Atque M . Cicero mittit } to %w{ Atque M. Cicero mittit }



114
115
116
117
118
119
120
121
122
123
124
125
# File 'lib/llt/tokenizer.rb', line 114

# Rejoins an abbreviation with its trailing dot, e.g.
# %w{ Atque M . Cicero mittit } becomes %w{ Atque M. Cicero mittit }.
def find_abbreviations_and_join_strings
  doomed = []
  @worker.each_with_index do |element, i|
    nxt = @worker[i + 1]
    next unless nxt == "." && element =~ ABBREVIATIONS
    # Mutates the dot in place so it carries the abbreviation...
    @worker[i + 1] = nxt.prepend(element)
    # ...and queue the now-redundant element for deletion. The index
    # is shifted by the number of pending deletions so it stays valid.
    doomed << (i - doomed.size)
  end

  doomed.each { |i| @worker.delete_at(i) }
end

#is_a_mergable_pair?(x, y) ⇒ Boolean

Returns:

  • (Boolean)


312
313
314
315
316
# File 'lib/llt/tokenizer.rb', line 312

# True when the two neighbouring words form one of the known
# mergable pairs in MERGE_WORDS (matched via #===, so entries may
# be plain strings or regexps).
def is_a_mergable_pair?(x, y)
  # x (e.g. "Quam" in "Quam diu") is downcased, as it could stand
  # in a sentence's first position.
  head = x.downcase
  MERGE_WORDS.any? { |first, second| first === head && second === y }
end

#is_que?(element) ⇒ Boolean

Returns:

  • (Boolean)


215
216
217
# File 'lib/llt/tokenizer.rb', line 215

# True when the element is exactly the marked -que enclitic.
def is_que?(element)
  enclitic('que') == element
end

#led_by_preposition?(index) ⇒ Boolean

Returns:

  • (Boolean)


219
220
221
# File 'lib/llt/tokenizer.rb', line 219

# True when the worker element in front of the given index is a
# (common) preposition.
def led_by_preposition?(index)
  preceding = @worker[index - 1]
  preceding =~ /^(in|ad|ob)$/i # and others
end

#lookup(string, type, column, inflection_class = 3) ⇒ Object



279
280
281
282
283
284
285
286
# File 'lib/llt/tokenizer.rb', line 279

# Queries the stem database for the given string.
#
#   string           - the stem to look up
#   type             - e.g. :noun, :adjective, :verb, :persona
#   column           - the stem column to search, e.g. :nom, :stem, :pr
#   inflection_class - one class or an array of classes (default 3)
def lookup(string, type, column, inflection_class = 3)
  # Personal names keep their capitalization, everything else is
  # queried in lower case.
  stem = type == :persona ? string : string.downcase
  query = {
    type: type,
    stem_type: column,
    stem: stem,
    restrictions: {
      type: :inflection_class,
      values: Array(inflection_class)
    }
  }
  @db.look_up_stem(query)
end

#make_frequent_correctionsObject



189
190
191
192
193
194
195
196
# File 'lib/llt/tokenizer.rb', line 189

# Runs the enclitic correction passes (they rely on db lookups).
# TODO 27.11.13 14:15 by LFDM
# Implement caching here.
def make_frequent_corrections
  %w{ ne que ve }.each { |encl| send("#{encl}_corrections") }
end

#merge_what_needs_mergingObject

quam diu to quamdiu



304
305
306
307
308
309
310
# File 'lib/llt/tokenizer.rb', line 304

# Merges word pairs like "quam diu" into "quamdiu".
def merge_what_needs_merging
  doomed = []
  @worker.each_overlapping_pair.each_with_index do |pair, index|
    next unless is_a_mergable_pair?(*pair)
    merge_words(pair, index, doomed)
  end
  doomed.each { |index| @worker.delete_at(index) }
end

#merge_words(pair, i, to_delete) ⇒ Object



318
319
320
321
# File 'lib/llt/tokenizer.rb', line 318

# Appends the second word of the pair onto the first (in place, so
# the worker sees the merged form) and queues the second word's
# index for deletion, compensating for deletions already queued.
def merge_words(pair, i, to_delete)
  first, second = pair
  first << second
  to_delete << (i + 1 - to_delete.size)
end

#ne_correctionsObject



223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
# File 'lib/llt/tokenizer.rb', line 223

# Reverses -ne splits that were too aggressive: when the word left
# over after splitting looks like a third-declension form whose
# ablative legitimately ends in -ne (e.g. ratione, Platone), the
# split is undone via #reverse_splittings.
def ne_corrections
  corrections = []
  @worker.each_with_index do |w, i|
    if w == enclitic('ne')
      # The word the -ne was split from (position depends on @shifting).
      orig_el = original_word(i)

      entries = []
      entries += lookup(orig_el, :noun, :nom)           if orig_el =~ /io$/   # actio-ne ratio-ne
      entries += lookup(orig_el + "n", :persona, :stem) if orig_el =~ /o$/    # Plato-ne Cicero-ne Solo-ne
      entries += lookup(orig_el + "n", :noun, :stem)    if orig_el =~ /d?i$/  # fortitudi-ne ratio-ne libidi-ne homi-ne
      # NOTE(review): /d?i$/ above already matches every word /mi$/
      # matches, so the next lookup duplicates the previous one --
      # confirm whether this is intended.
      entries += lookup(orig_el + "n", :noun, :stem)    if orig_el =~ /mi$/   # flumi-ne agmi-ne
      entries += lookup(orig_el + "n", :adjective, :stem)                     # communis commune

      if entries.any?(&:third_decl_with_possible_ne_abl?)
        # Offset by the corrections already queued: each reversal
        # deletes one element from the worker.
        corrections << i - corrections.size
      end
    end
  end

  reverse_splittings(corrections)
end

#original_word(i) ⇒ Object



268
269
270
271
272
273
274
275
276
277
# File 'lib/llt/tokenizer.rb', line 268

# Returns the word a split-off enclitic at index i belongs to.
def original_word(i)
  # Two scenarios are possible at this point.
  # With shifting enabled the enclitic precedes its word:
  #         i  i + 1
  #   arma que virum
  # With shifting disabled it follows it:
  #        i - 1  i
  #   arma virum que
  offset = @shifting ? 1 : -1
  @worker[i + offset]
end

#preliminaryObject



358
359
360
# File 'lib/llt/tokenizer.rb', line 358

# Exposes the current state of the worker as a plain array,
# useful for inspecting intermediate tokenization results.
def preliminary
  @worker.to_a
end

#put_xml_attributes_back_together(elements) ⇒ Object



84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# File 'lib/llt/tokenizer.rb', line 84

# Rejoins XML tags that the whitespace split tore apart.
#
# elements could be like this
# ['<tag', 'attr1="val"', 'attr1="val>']
# and we want the complete xml tag back together
#
# Mutates +elements+ in place. Relies on ArrayScanner's cursor API
# (#look_behind, #current, #pos, #forward, #eoa?) -- semantics
# assumed from usage here, confirm against its implementation.
def put_xml_attributes_back_together(elements)
  # elements could be like this
  # ['<tag', 'attr1="val"', 'attr1="val>']
  # and we want the complete xml tag back together
  as = ArrayScanner.new(elements)
  loop do
    last = as.look_behind
    # NOTE: `&!` is the non-short-circuit `&` plus `!`, i.e.
    # "starts with '<' AND does not end with '>'" (an open tag).
    if last && last.start_with?('<') &! last.end_with?('>')
      if as.current.match(/\w+=".*"$|>/)
        # Fold the current fragment back into the open tag.
        last << ' ' << as.current
        elements.delete_at(as.pos)
        # we don't need to forward, as we delete an element anyway
        next
      end
    else
      as.forward(1)
    end
    break if as.eoa?
  end
end

#que_correctionsObject



198
199
200
201
202
203
204
205
206
# File 'lib/llt/tokenizer.rb', line 198

# Used only in rare cases like "in eoque", where the split -que has
# to be shifted in front of the preposition: -que in eo.
# Does nothing when shifting is disabled.
def que_corrections
  return unless @shifting
  to_be_shifted_que_indices.each do |index|
    @worker.insert(index - 1, @worker.delete_at(index))
  end
end

#raise_idObject



348
349
350
351
352
353
354
355
356
# File 'lib/llt/tokenizer.rb', line 348

# Increments the running token id when indexing is enabled.
def raise_id
  # Without indexing this still has to return a truthy value, as
  # callers use it as the first operand of an `and` construction.
  return true unless @indexing
  @id += 1
end

#reset_idObject



344
345
346
# File 'lib/llt/tokenizer.rb', line 344

# Resets the token id counter: 0 when indexing is enabled, nil
# (ids disabled) otherwise.
# Fix: the original wrote `@id = (@indexing ? @id = 0 : nil)`,
# assigning @id twice -- the inner assignment was redundant.
def reset_id
  @id = @indexing ? 0 : nil
end

#reverse_splittings(indices) ⇒ Object



288
289
290
291
292
293
294
295
296
# File 'lib/llt/tokenizer.rb', line 288

# Undoes enclitic splits at the given worker indices: removes the
# marked enclitic element and glues its text (without the marker)
# back onto the original word.
def reverse_splittings(indices)
  indices.each do |i|
    # The original word has to be retrieved BEFORE the enclitic is
    # deleted, as #delete_at shifts the worker's indices.
    original = original_word(i)
    enclitic_part = @worker.delete_at(i).delete(@enclitics_marker)
    original << enclitic_part
  end
end

#setup(text, options = {}, worker = []) ⇒ Object



45
46
47
48
49
50
51
52
53
54
# File 'lib/llt/tokenizer.rb', line 45

# Prepares the tokenizer for a run: stores the text, detects
# metrical markup, resolves all options and builds the worker.
def setup(text, options = {}, worker = [])
  @text = text
  evaluate_metrical_presence(@text)
  # Resolve every supported option against the defaults.
  %i{ enclitics_marker merging shifting indexing }.each do |opt|
    instance_variable_set("@#{opt}", parse_option(opt, options))
  end
  @worker = setup_worker(worker)
  @shift_range = shift_range(@shifting)
end

#setup_worker(worker) ⇒ Object

This is here for two reasons:

1) easier test setup, when a preliminary result shall be further evaluated

2) more importantly adding a level of indirection, when
   the given text holds metrical information. It adds a
   substitute implementation for the worker array, but only
   if it's needed - which should perform better, when there
   are no metrics involved (the default case)


66
67
68
69
70
71
72
73
74
75
76
77
78
# File 'lib/llt/tokenizer.rb', line 66

# This is here for two reasons:
#
# 1) easier test setup, when a preliminary result shall be further
#    evaluated
#
# 2) more importantly it adds a level of indirection when the given
#    text holds metrical information: a substitute implementation
#    for the worker array is used, but only when it's needed --
#    which performs better in the default case without metrics.
def setup_worker(worker)
  return worker if worker.any?

  elements = @text.gsub(PUNCTUATION, ' \0 ').split
  put_xml_attributes_back_together(elements)
  metrical? ? Worker.new(elements, @enclitics_marker) : elements
end

#shift_range(shifting_enabled) ⇒ Object



80
81
82
# File 'lib/llt/tokenizer.rb', line 80

# Index offset used when inserting split enclitics: 0 when they
# are shifted in front of their word, 1 when they stay behind it.
def shift_range(shifting_enabled)
  return 0 if shifting_enabled
  1
end

#split_enklitika_and_change_their_positionObject



142
143
144
145
146
# File 'lib/llt/tokenizer.rb', line 142

# Splits enclitics off and repositions them: brute-force splitting
# first, then the correction passes undo what went too far.
def split_enklitika_and_change_their_position
  %i{ split_with_force split_nec make_frequent_corrections }.each do |step|
    send(step)
  end
end

#split_enklitikon(encl, restrictors) ⇒ Object



158
159
160
161
162
163
164
165
166
167
168
169
170
171
# File 'lib/llt/tokenizer.rb', line 158

# Splits the given enclitic off every worker token it is attached
# to, unless the token matches the restrictor regexp (a known word
# that merely ends in the enclitic's letters).
def split_enklitikon(encl, restrictors)
  # A word character is needed in front -- "ne" itself should not
  # be split.
  regexp = /(?<=\w)#{encl}$/

  insertion_points = []
  @worker.each_with_index do |token, i|
    next unless token.match(regexp) && restrictors !~ token
    token.slice!(regexp)
    # Every insertion shifts later indices by one, hence the offset;
    # @shift_range places the enclitic before or after its word.
    insertion_points << (i + insertion_points.size + @shift_range)
  end

  insertion_points.each { |i| @worker.insert(i, enclitic(encl)) }
end

#split_necObject



177
178
179
180
181
182
183
184
185
186
187
# File 'lib/llt/tokenizer.rb', line 177

# Splits "nec" into "ne" plus the enclitic -c (short for -que).
def split_nec
  insertion_points = []
  @worker.each_with_index do |token, i|
    next unless token == 'nec'
    # Drop the trailing 'c' in place, leaving 'ne'.
    token.slice!(-1)
    insertion_points << (i + insertion_points.size + @shift_range)
  end

  insertion_points.each { |i| @worker.insert(i, enclitic('c')) }
end

#split_with_forceObject



148
149
150
151
152
153
154
155
156
# File 'lib/llt/tokenizer.rb', line 148

# Splits all regular enclitics by brute force; the restrictor
# regexps handle only the obvious false positives.
def split_with_force
  # 'c' (from nec) is deliberately excluded here for now.
  ENCLITICS[0..-2].each do |encl|
    restrictors = self.class.const_get("WORDS_ENDING_WITH_#{encl.upcase}")
    split_enklitikon(encl, restrictors)
  end
end

#to_be_shifted_que_indicesObject



208
209
210
211
212
213
# File 'lib/llt/tokenizer.rb', line 208

# Indices of split -que elements that follow a preposition and
# therefore need shifting (e.g. "in eoque").
# Double shifts would properly fail, but they might never happen.
def to_be_shifted_que_indices
  indices = []
  @worker.each_with_index do |element, index|
    indices << index if is_que?(element) && led_by_preposition?(index)
  end
  indices
end

#tokenize(text, add_to: nil, **options) ⇒ Object

Raises:

  • (ArgumentError)


30
31
32
33
34
35
36
37
38
39
40
41
42
43
# File 'lib/llt/tokenizer.rb', line 30

# Tokenizes a piece of Latin text into Token objects.
#
#   text    - the String to tokenize
#   add_to  - optional container responding to #<< that the result
#             is appended to
#   options - see .default_options (shifting, enclitics_marker,
#             merging, indexing)
#
# Returns an Array of tokens ([] for empty input).
# Raises ArgumentError when text is not a String.
# Fix: use the idiomatic `raise ArgumentError, msg` form instead of
# `raise ArgumentError.new(msg)`.
def tokenize(text, add_to: nil, **options)
  raise ArgumentError, "The argument passed must be a String" unless text.is_a?(String)
  return [] if text.empty?

  setup(text, options)

  find_abbreviations_and_join_strings
  split_enklitika_and_change_their_position
  merge_what_needs_merging if @merging # quam diu => quamdiu
  tokens = create_tokens

  add_to << tokens if add_to.respond_to?(:<<)
  tokens
end

#ve_correctionsObject



245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
# File 'lib/llt/tokenizer.rb', line 245

# Reverses -ve splits that were too aggressive: when the database
# knows a stem whose form legitimately ends in -ve, the split is
# undone via #reverse_splittings.
def ve_corrections
  corrections = []
  @worker.each_with_index do |w, i|
    if w == enclitic('ve')
      # The word the -ve was split from (position depends on @shifting).
      orig_el = original_word(i)

      # Probe several stem types / inflection classes for a match.
      entries = []
      entries += lookup(orig_el + 'v',  :adjective, :stem, 1)
      entries += lookup(orig_el + 'v',  :adjective, :stem, 3)
      entries += lookup(orig_el + 'v',  :noun,      :stem, [2, 5])
      entries += lookup(orig_el + 've', :verb,      :pr,   2)
      entries += lookup(orig_el + 'v',  :verb,      :pr,   [3, 5]) # not sure if such a word of 5 exists


      if entries.any?
        # Offset by the corrections already queued: each reversal
        # deletes one element from the worker.
        corrections << i - corrections.size
      end
    end
  end

  reverse_splittings(corrections)
end