Class: LLT::Tokenizer
- Inherits:
-
Object
- Object
- LLT::Tokenizer
- Includes:
- Constants::Abbreviations, Core::Serviceable, Helpers::Metrical
- Defined in:
- lib/llt/tokenizer.rb,
lib/llt/tokenizer/worker.rb,
lib/llt/tokenizer/version.rb
Defined Under Namespace
Classes: Worker
Constant Summary collapse
- PUNCTUATION =
/([\.\?,!;\-:"'”\(\)\[\]†]|<\/?.+?>)\1*/
- ABBREVIATIONS =
Covers abbreviated Roman praenomina, like Ti. in Ti. Claudius Nero, and Roman date expressions, like a. d. V. Kal. Apr.
/^(#{ALL_ABBRS_PIPED})$/
- WORDS_ENDING_WITH_QUE =
neque taken out!
/^([qc]u[ei].*que|qu[ao]que|itaque|atque|ut[er].*que|utcumque|plerumque|denique|undique)$/i
- WORDS_ENDING_WITH_NE =
/^(omne|sine|bene|paene)$/i
- WORDS_ENDING_WITH_VE =
/^(sive|neve)$/i
- ENCLITICS =
laetusque to -que laetus; in eoque to -que in eo; honestumne to -ne honestum
but
uterque, institutione, sive et al. remain as they are
%w{ que ne ve c }
- MERGE_WORDS =
[ %w{ quam diu }, ['non', /null.{1,4}$/] ]
- ABBR_NAME_WITH_DOT =
/^(#{NAMES_PIPED})\.$/
- ROMAN_DATE_EXPR_WITH_DOT =
/^(#{DATES_PIPED})\.$/
- PUNCT_ITSELF =
Regexp.new(PUNCTUATION.source + '$')
- XML_TAG =
/<\/?.+?>/
- VERSION =
"0.0.2"
Instance Attribute Summary collapse
-
#default_options ⇒ Object
readonly
Returns the value of attribute default_options.
Class Method Summary collapse
Instance Method Summary collapse
- #create_tokens ⇒ Object
- #enclitic(val) ⇒ Object
-
#find_abbreviations_and_join_strings ⇒ Object
%w{ Atque M . Cicero mittit } to %w{ Atque M. Cicero mittit }.
- #is_a_mergable_pair?(x, y) ⇒ Boolean
- #is_que?(element) ⇒ Boolean
- #led_by_preposition?(index) ⇒ Boolean
- #lookup(string, type, column, inflection_class = 3) ⇒ Object
- #make_frequent_corrections ⇒ Object
-
#merge_what_needs_merging ⇒ Object
quam diu to quamdiu.
- #merge_words(pair, i, to_delete) ⇒ Object
- #ne_corrections ⇒ Object
- #original_word(i) ⇒ Object
- #preliminary ⇒ Object
- #put_xml_attributes_back_together(elements) ⇒ Object
- #que_corrections ⇒ Object
- #raise_id ⇒ Object
- #reset_id ⇒ Object
- #reverse_splittings(indices) ⇒ Object
- #setup(text, options = {}, worker = []) ⇒ Object
-
#setup_worker(worker) ⇒ Object
This is here for two reasons: 1) easier test setup, when a preliminary result shall be further evaluated; 2) adding a level of indirection when the given text holds metrical information.
- #shift_range(shifting_enabled) ⇒ Object
- #split_enklitika_and_change_their_position ⇒ Object
- #split_enklitikon(encl, restrictors) ⇒ Object
- #split_nec ⇒ Object
- #split_with_force ⇒ Object
- #to_be_shifted_que_indices ⇒ Object
- #tokenize(text, add_to: nil, **options) ⇒ Object
- #ve_corrections ⇒ Object
Instance Attribute Details
#default_options ⇒ Object (readonly)
Returns the value of attribute default_options.
19 20 21 |
# File 'lib/llt/tokenizer.rb', line 19 def @default_options end |
Class Method Details
.default_options ⇒ Object
21 22 23 24 25 26 27 28 |
# File 'lib/llt/tokenizer.rb', line 21 def self. { shifting: true, enclitics_marker: '-', merging: true, indexing: true, } end |
Instance Method Details
#create_tokens ⇒ Object
330 331 332 333 334 335 336 337 338 339 340 341 342 |
# File 'lib/llt/tokenizer.rb', line 330 def create_tokens # call #to_a is to retrieve (and align) optional metrical data reset_id @worker.to_a.map! do |el| case el when XML_TAG then Token::XmlTag.new(el) when ABBR_NAME_WITH_DOT then raise_id and Token::Filler.new(el, @id) when ROMAN_DATE_EXPR_WITH_DOT then raise_id and Token::Filler.new(el, @id) when PUNCT_ITSELF then raise_id and Token::Punctuation.new(el, @id) else raise_id and Token::Word.new(el, @id) end end end |
#enclitic(val) ⇒ Object
173 174 175 |
# File 'lib/llt/tokenizer.rb', line 173 def enclitic(val) "#{@enclitics_marker}#{val}" end |
#find_abbreviations_and_join_strings ⇒ Object
%w{ Atque M . Cicero mittit } to %w{ Atque M. Cicero mittit }
114 115 116 117 118 119 120 121 122 123 124 125 |
# File 'lib/llt/tokenizer.rb', line 114 def find_abbreviations_and_join_strings arr = [] @worker.each_with_index do |e, i| n = @worker[i + 1] if e =~ ABBREVIATIONS && n == "." @worker[i + 1] = n.prepend(e) arr << (i - arr.size) end end arr.each { |i| @worker.delete_at(i) } end |
#is_a_mergable_pair?(x, y) ⇒ Boolean
312 313 314 315 316 |
# File 'lib/llt/tokenizer.rb', line 312 def is_a_mergable_pair?(x, y) # x, i.e. quam in quamdiu, needs to be downcased, as it could be in a # sentence's first position MERGE_WORDS.any? { |a, b| a === x.downcase && b === y } end |
#is_que?(element) ⇒ Boolean
215 216 217 |
# File 'lib/llt/tokenizer.rb', line 215 def is_que?(element) element == enclitic('que') end |
#led_by_preposition?(index) ⇒ Boolean
219 220 221 |
# File 'lib/llt/tokenizer.rb', line 219 def led_by_preposition?(index) @worker[index - 1] =~ /^(in|ad|ob)$/i # and others end |
#lookup(string, type, column, inflection_class = 3) ⇒ Object
279 280 281 282 283 284 285 286 |
# File 'lib/llt/tokenizer.rb', line 279 def lookup(string, type, column, inflection_class = 3) string = (type == :persona ? string : string.downcase) query = { type: type, stem_type: column, stem: string, restrictions: { type: :inflection_class, values: Array(inflection_class) } } @db.look_up_stem(query) end |
#make_frequent_corrections ⇒ Object
189 190 191 192 193 194 195 196 |
# File 'lib/llt/tokenizer.rb', line 189 def make_frequent_corrections # uses db lookups # # TODO 27.11.13 14:15 by LFDM # Implement caching here ne_corrections que_corrections ve_corrections end |
#merge_what_needs_merging ⇒ Object
quam diu to quamdiu
304 305 306 307 308 309 310 |
# File 'lib/llt/tokenizer.rb', line 304 def merge_what_needs_merging to_delete = [] @worker.each_overlapping_pair.each_with_index do |pair, i| merge_words(pair, i, to_delete) if is_a_mergable_pair?(*pair) end to_delete.each { |i| @worker.delete_at(i) } end |
#merge_words(pair, i, to_delete) ⇒ Object
318 319 320 321 |
# File 'lib/llt/tokenizer.rb', line 318 def merge_words(pair, i, to_delete) pair.first << pair.last to_delete << (i + 1 - to_delete.size) end |
#ne_corrections ⇒ Object
223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 |
# File 'lib/llt/tokenizer.rb', line 223 def ne_corrections corrections = [] @worker.each_with_index do |w, i| if w == enclitic('ne') orig_el = original_word(i) entries = [] entries += lookup(orig_el, :noun, :nom) if orig_el =~ /io$/ # actio-ne ratio-ne entries += lookup(orig_el + "n", :persona, :stem) if orig_el =~ /o$/ # Plato-ne Cicero-ne Solo-ne entries += lookup(orig_el + "n", :noun, :stem) if orig_el =~ /d?i$/ # fortitudi-ne ratio-ne libidi-ne homi-ne entries += lookup(orig_el + "n", :noun, :stem) if orig_el =~ /mi$/ # flumi-ne agmi-ne entries += lookup(orig_el + "n", :adjective, :stem) # communis commune if entries.any?(&:third_decl_with_possible_ne_abl?) corrections << i - corrections.size end end end reverse_splittings(corrections) end |
#original_word(i) ⇒ Object
268 269 270 271 272 273 274 275 276 277 |
# File 'lib/llt/tokenizer.rb', line 268 def original_word(i) # there are two possible scenarios at this point # with shifting enabled: # i i + 1 # arma que virum # with shifting disabled: # i - 1 i # arma virum que @worker[i + (@shifting ? 1 : -1)] end |
#preliminary ⇒ Object
358 359 360 |
# File 'lib/llt/tokenizer.rb', line 358 def preliminary @worker.to_a end |
#put_xml_attributes_back_together(elements) ⇒ Object
84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 |
# File 'lib/llt/tokenizer.rb', line 84 def put_xml_attributes_back_together(elements) # elements could be like this # ['<tag', 'attr1="val"', 'attr1="val>'] # and we want the complete xml tag back together as = ArrayScanner.new(elements) loop do last = as.look_behind if last && last.start_with?('<') &! last.end_with?('>') if as.current.match(/\w+=".*"$|>/) last << ' ' << as.current elements.delete_at(as.pos) # we don't need to forward, as we delete an element anyway next end else as.forward(1) end break if as.eoa? end end |
#que_corrections ⇒ Object
198 199 200 201 202 203 204 205 206 |
# File 'lib/llt/tokenizer.rb', line 198 def que_corrections # this is used in rare only in cases like in eoque # which needs a shift to -que in eo if @shifting to_be_shifted_que_indices.each do |i| @worker.insert(i - 1, @worker.delete_at(i)) end end end |
#raise_id ⇒ Object
348 349 350 351 352 353 354 355 356 |
# File 'lib/llt/tokenizer.rb', line 348 def raise_id if @indexing @id += 1 else # need to return true because this is used as first part # of an and construction true end end |
#reset_id ⇒ Object
344 345 346 |
# File 'lib/llt/tokenizer.rb', line 344 def reset_id @id = (@indexing ? @id = 0 : nil) end |
#reverse_splittings(indices) ⇒ Object
288 289 290 291 292 293 294 295 296 |
# File 'lib/llt/tokenizer.rb', line 288 def reverse_splittings(indices) indices.each do |i| # need to retrieve the orig word before the splitted var is # assigned, as it deletes something in the worker ow = original_word(i) splitted = @worker.delete_at(i).delete(@enclitics_marker) ow << splitted end end |
#setup(text, options = {}, worker = []) ⇒ Object
45 46 47 48 49 50 51 52 53 54 |
# File 'lib/llt/tokenizer.rb', line 45 def setup(text, = {}, worker = []) @text = text evaluate_metrical_presence(@text) @enclitics_marker = parse_option(:enclitics_marker, ) @merging = parse_option(:merging, ) @shifting = parse_option(:shifting, ) @indexing = parse_option(:indexing, ) @worker = setup_worker(worker) @shift_range = shift_range(@shifting) end |
#setup_worker(worker) ⇒ Object
This is here for two reasons:
1) easier test setup, when a preliminary result shall be further evaluated
2) more importantly adding a level of indirection, when
the given text holds metrical information. It adds a
substitute implementation for the worker array, but only
if it's needed - which should perform better, when there
are no metrics involved (the default case)
66 67 68 69 70 71 72 73 74 75 76 77 78 |
# File 'lib/llt/tokenizer.rb', line 66 def setup_worker(worker) if worker.any? worker else elements = @text.gsub(PUNCTUATION, ' \0 ').split put_xml_attributes_back_together(elements) if metrical? Worker.new(elements, @enclitics_marker) else elements end end end |
#shift_range(shifting_enabled) ⇒ Object
80 81 82 |
# File 'lib/llt/tokenizer.rb', line 80 def shift_range(shifting_enabled) shifting_enabled ? 0 : 1 end |
#split_enklitika_and_change_their_position ⇒ Object
142 143 144 145 146 |
# File 'lib/llt/tokenizer.rb', line 142 def split_enklitika_and_change_their_position split_with_force split_nec make_frequent_corrections end |
#split_enklitikon(encl, restrictors) ⇒ Object
158 159 160 161 162 163 164 165 166 167 168 169 170 171 |
# File 'lib/llt/tokenizer.rb', line 158 def split_enklitikon(encl, restrictors) # needs a word character in front - ne itself should be contained regexp = /(?<=\w)#{encl}$/ indices = [] @worker.each_with_index do |token, i| if token.match(regexp) && restrictors !~ token token.slice!(regexp) indices << (i + indices.size + @shift_range) end end indices.each { |i| @worker.insert(i, enclitic(encl)) } end |
#split_nec ⇒ Object
177 178 179 180 181 182 183 184 185 186 187 |
# File 'lib/llt/tokenizer.rb', line 177 def split_nec indices = [] @worker.each_with_index do |token, i| if token == 'nec' token.slice!(-1) indices << (i + indices.size + @shift_range) end end indices.each { |i| @worker.insert(i, enclitic('c')) } end |
#split_with_force ⇒ Object
148 149 150 151 152 153 154 155 156 |
# File 'lib/llt/tokenizer.rb', line 148 def split_with_force # uses brute force at first # the restrictor regexps handle only obvious cases # don't use c here atm ENCLITICS[0..-2].each do |encl| split_enklitikon(encl, self.class.const_get("WORDS_ENDING_WITH_#{encl.upcase}")) end end |
#to_be_shifted_que_indices ⇒ Object
208 209 210 211 212 213 |
# File 'lib/llt/tokenizer.rb', line 208 def to_be_shifted_que_indices # double shifts would properly fail, but they might never happen @worker.each_with_index.each_with_object([]) do |(element, index), accumulator| accumulator << index if is_que?(element) && led_by_preposition?(index) end end |
#tokenize(text, add_to: nil, **options) ⇒ Object
30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
# File 'lib/llt/tokenizer.rb', line 30 def tokenize(text, add_to: nil, **) raise ArgumentError.new("The argument passed must be a String") unless text.is_a?(String) return [] if text.empty? setup(text, ) find_abbreviations_and_join_strings split_enklitika_and_change_their_position merge_what_needs_merging if @merging # quam diu => quamdiu tokens = create_tokens add_to << tokens if add_to.respond_to?(:<<) tokens end |
#ve_corrections ⇒ Object
245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 |
# File 'lib/llt/tokenizer.rb', line 245 def ve_corrections corrections = [] @worker.each_with_index do |w, i| if w == enclitic('ve') orig_el = original_word(i) entries = [] entries += lookup(orig_el + 'v', :adjective, :stem, 1) entries += lookup(orig_el + 'v', :adjective, :stem, 3) entries += lookup(orig_el + 'v', :noun, :stem, [2, 5]) entries += lookup(orig_el + 've', :verb, :pr, 2) entries += lookup(orig_el + 'v', :verb, :pr, [3, 5]) # not sure if such a word of 5 exists if entries.any? corrections << i - corrections.size end end end reverse_splittings(corrections) end |