Class: TextSentencer

Inherits:
Object
  • Object
show all
Defined in:
lib/text_sentencer_c.rb,
ext/text_sentencer_c/text_sentencer.c

Constant Summary collapse

# Base rule set; .update_rules merges caller-supplied keys over it.
# The two top-level patterns are double-quoted, so "\t" and "\n" are real
# tab/newline characters by the time the pattern is compiled. The rule pairs
# are single-quoted, so their backslash sequences (\., \b) reach the regex
# engine verbatim.
DEFAULT_RULES =
{
  break_pattern: "([ \t]*\n+)+[ \t]*", # a run of one or more newlines, with optional spaces/tabs around and between them
  # A run of spaces/tabs. Presumably the places where a sentence boundary
  # may be considered -- confirm against the segmenter.
  candidate_pattern: "[ \t]+",
  # Each rule is a [pre, post] pair of patterns: pre is anchored at the end
  # ($), post at the start (^). Presumably pre is matched against the text
  # before a candidate and post against the text after it, and a matching
  # pair marks a boundary -- confirm against the segmenter.
  positive_rules: [
    ["[.!?]$", "^[0-9A-Z]"],
    [":$", "^[0-9]"],
    [":$", "^[A-Z][a-z]"]
  ],
  # Same [pre, post] shape. Presumably a matching pair vetoes a boundary
  # (titles, initials, common abbreviations) -- confirm against the segmenter.
  negative_rules: [
    ['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.$', '^[A-Z][a-z]'],
    ['(Sr|Jr)\.$', '^[A-Z][a-z]'],
    ['\b[A-Z][a-z]*\.$', '^[0-9A-Z]'],
    ['(cf|vs)\.$', '^.'],
    ['e\.g\.$', '^.'],
    ['i\.e\.$', '^.'],
    ['(Sec|Chap|Fig|Eq)\.$', '^[0-9A-Z]']
  ]
}

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(rules) ⇒ Object



526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
# File 'ext/text_sentencer_c/text_sentencer.c', line 526

/*
 * Returns rb_rules[idx] after checking that it is an array with at least two
 * elements (the pre and post patterns). Raises ArgumentError otherwise;
 * set_name is only used to build the error message.
 */
static VALUE text_sentencer_rule_pair(VALUE rb_rules, long idx, const char *set_name) {
  VALUE rb_rule = rb_ary_entry(rb_rules, idx);
  if (TYPE(rb_rule) != T_ARRAY || RARRAY_LEN(rb_rule) < 2) {
    rb_raise(rb_eArgError, "%s[%ld] must be an array of two patterns", set_name, idx);
  }
  return rb_rule;
}

/*
 * TextSentencer#initialize(rules)
 *
 * Compiles every pattern found in the rules hash and stores the compiled
 * expressions in the TextSentencer struct wrapped by self.
 *
 * rules must be a Hash with these symbol keys:
 *   :break_pattern      - required pattern
 *   :candidate_pattern  - required pattern
 *   :positive_rules     - Array of [pre, post] pattern pairs
 *   :negative_rules     - Array of [pre, post] pattern pairs
 *
 * Raises TypeError when rules is not a Hash, ArgumentError when a key is
 * missing or malformed, and NoMemoryError when the rule tables cannot be
 * allocated. Whatever compile_pattern() raises for a bad pattern propagates
 * unchanged. Returns self.
 */
VALUE text_sentencer_initialize(VALUE self, VALUE rules) {
  TextSentencer *ts;
  TypedData_Get_Struct(self, TextSentencer, &text_sentencer_type, ts);

  // rb_hash_aref() assumes its receiver is a Hash, so reject anything else up front.
  Check_Type(rules, T_HASH);

  // Extract and compile the break pattern from the rules hash
  VALUE rb_break_pattern = rb_hash_aref(rules, ID2SYM(rb_intern("break_pattern")));
  if (NIL_P(rb_break_pattern)) {
    rb_raise(rb_eArgError, "break_pattern is required in rules");
  }
  ts->break_pattern = compile_pattern(rb_break_pattern);

  // Extract and compile the candidate pattern from the rules hash
  VALUE rb_candidate_pattern = rb_hash_aref(rules, ID2SYM(rb_intern("candidate_pattern")));
  if (NIL_P(rb_candidate_pattern)) {
    rb_raise(rb_eArgError, "candidate_pattern is required in rules");
  }
  ts->candidate_pattern = compile_pattern(rb_candidate_pattern);

  // Compile positive rules
  VALUE rb_positive_rules = rb_hash_aref(rules, ID2SYM(rb_intern("positive_rules")));
  if (NIL_P(rb_positive_rules) || TYPE(rb_positive_rules) != T_ARRAY) {
    rb_raise(rb_eArgError, "positive_rules must be an array");
  }

  // The tables are zero-filled and the count is published only once both
  // exist, so a raise part-way through compilation leaves NULL slots instead
  // of indeterminate pointers.
  // NOTE(review): confirm the struct's free function tolerates NULL entries.
  long num_positive = RARRAY_LEN(rb_positive_rules);
  ts->num_positive_rules = 0;
  ts->positive_rules_pre = calloc((size_t)num_positive, sizeof(URegularExpression *));
  ts->positive_rules_post = calloc((size_t)num_positive, sizeof(URegularExpression *));
  // A zero-length allocation may legitimately return NULL; only a non-empty
  // request that comes back NULL is an allocation failure.
  if (num_positive > 0 &&
      ((ts->positive_rules_pre == NULL) || (ts->positive_rules_post == NULL))) {
    rb_raise(rb_eNoMemError, "Failed to allocate memory for positive rules");
  }
  ts->num_positive_rules = num_positive;

  for (size_t i = 0; i < ts->num_positive_rules; i++) {
    VALUE rb_rule = text_sentencer_rule_pair(rb_positive_rules, (long)i, "positive_rules");
    ts->positive_rules_pre[i] = compile_pattern(rb_ary_entry(rb_rule, 0));
    ts->positive_rules_post[i] = compile_pattern(rb_ary_entry(rb_rule, 1));
  }

  // Compile negative rules (same layout and handling as the positive rules)
  VALUE rb_negative_rules = rb_hash_aref(rules, ID2SYM(rb_intern("negative_rules")));
  if (NIL_P(rb_negative_rules) || TYPE(rb_negative_rules) != T_ARRAY) {
    rb_raise(rb_eArgError, "negative_rules must be an array");
  }

  long num_negative = RARRAY_LEN(rb_negative_rules);
  ts->num_negative_rules = 0;
  ts->negative_rules_pre = calloc((size_t)num_negative, sizeof(URegularExpression *));
  ts->negative_rules_post = calloc((size_t)num_negative, sizeof(URegularExpression *));
  if (num_negative > 0 &&
      ((ts->negative_rules_pre == NULL) || (ts->negative_rules_post == NULL))) {
    rb_raise(rb_eNoMemError, "Failed to allocate memory for negative rules");
  }
  ts->num_negative_rules = num_negative;

  for (size_t i = 0; i < ts->num_negative_rules; i++) {
    VALUE rb_rule = text_sentencer_rule_pair(rb_negative_rules, (long)i, "negative_rules");
    ts->negative_rules_pre[i] = compile_pattern(rb_ary_entry(rb_rule, 0));
    ts->negative_rules_post[i] = compile_pattern(rb_ary_entry(rb_rule, 1));
  }

  return self;
}

Class Method Details

.update_rules(rules) ⇒ Object



23
24
25
# File 'lib/text_sentencer_c.rb', line 23

# Builds a rule set by laying the entries of +rules+ over DEFAULT_RULES.
# A key present in +rules+ replaces the default value wholesale (the merge is
# shallow); DEFAULT_RULES itself is not modified. Returns the new Hash.
def self.update_rules(rules)
  merged = DEFAULT_RULES.dup
  merged.update(rules)
end

Instance Method Details

#annotate(rb_text) ⇒ Object

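Segments the given text and returns a hash with the original text under :text and, under :blocks, one entry per segment holding its :span (:begin/:end offsets) and the :obj label "Sentence".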



445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
# File 'ext/text_sentencer_c/text_sentencer.c', line 445

/*
 * TextSentencer#annotate(text)
 *
 * Runs text_sentencer_segment() on rb_text and wraps the result in a Hash:
 *
 *   { text: rb_text,
 *     blocks: [ { span: { begin: <start>, end: <end> }, obj: "Sentence" }, ... ] }
 *
 * One block is produced per [start, end] pair returned by the segmenter, in
 * the same order. Raises TypeError when rb_text is not a String.
 */
VALUE text_sentencer_annotate(VALUE self, VALUE rb_text) {
  Check_Type(rb_text, T_STRING);

  // Hash keys used below, looked up once instead of once per segment.
  const VALUE key_begin = ID2SYM(rb_intern("begin"));
  const VALUE key_end = ID2SYM(rb_intern("end"));
  const VALUE key_span = ID2SYM(rb_intern("span"));
  const VALUE key_obj = ID2SYM(rb_intern("obj"));

  VALUE pairs = text_sentencer_segment(self, rb_text);
  VALUE blocks = rb_ary_new();

  long idx = 0;
  while (idx < RARRAY_LEN(pairs)) {
    VALUE pair = rb_ary_entry(pairs, idx);

    // Round-trip through long so non-integer entries are rejected as before.
    long first = NUM2LONG(rb_ary_entry(pair, 0));
    long last = NUM2LONG(rb_ary_entry(pair, 1));

    VALUE span = rb_hash_new();
    rb_hash_aset(span, key_begin, LONG2NUM(first));
    rb_hash_aset(span, key_end, LONG2NUM(last));

    VALUE block = rb_hash_new();
    rb_hash_aset(block, key_span, span);
    rb_hash_aset(block, key_obj, rb_str_new_cstr("Sentence"));

    rb_ary_push(blocks, block);
    idx++;
  }

  VALUE annotation = rb_hash_new();
  rb_hash_aset(annotation, ID2SYM(rb_intern("text")), rb_text);
  rb_hash_aset(annotation, ID2SYM(rb_intern("blocks")), blocks);

  return annotation;
}

#segment(rb_text) ⇒ Object



416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
# File 'ext/text_sentencer_c/text_sentencer.c', line 416

/*
 * TextSentencer#segment(text)
 *
 * Runs segment_without_gvl() over rb_text with the GVL released and returns
 * its result as an Array of [start, end] integer pairs, one per segment.
 *
 * Raises TypeError when rb_text is not a String. StringValueCStr() also
 * raises when the string contains an embedded NUL byte.
 */
VALUE text_sentencer_segment(VALUE self, VALUE rb_text) {
  TextSentencer *ts;
  TypedData_Get_Struct(self, TextSentencer, &text_sentencer_type, ts);

  Check_Type(rb_text, T_STRING);

  // Segment a frozen snapshot rather than the caller's string: while the GVL
  // is released another thread may mutate or reallocate rb_text, which would
  // leave the raw pointer below dangling.
  VALUE rb_snapshot = rb_str_new_frozen(rb_text);
  const char *text = StringValueCStr(rb_snapshot);

  // Output fields start empty; segment_without_gvl() fills them in.
  struct segment_args args = {ts, text, NULL, NULL, 0};
  rb_thread_call_without_gvl(segment_without_gvl, &args, RUBY_UBF_IO, NULL);

  // Keep the snapshot (and therefore `text`) alive until the worker is done.
  RB_GC_GUARD(rb_snapshot);

  VALUE segments = rb_ary_new();
  for (long i = 0; i < args.num_segments; ++i) {
    VALUE segment = rb_ary_new();
    rb_ary_push(segment, LONG2NUM(args.segment_starts[i]));
    rb_ary_push(segment, LONG2NUM(args.segment_ends[i]));
    rb_ary_push(segments, segment);
  }

  // The offset arrays are released here with free().
  free(args.segment_starts);
  free(args.segment_ends);

  return segments;
}