Class: TextSentencer
- Inherits:
-
Object
- Object
- TextSentencer
- Defined in:
- lib/text_sentencer_c.rb,
ext/text_sentencer_c/text_sentencer.c
Constant Summary collapse
- DEFAULT_RULES =
{
  break_pattern: "([ \t]*\n+)+[ \t]*", # one or more consecutive blank lines
  candidate_pattern: "[ \t]+",
  positive_rules: [
    ["[.!?]$", "^[0-9A-Z]"],
    [":$", "^[0-9]"],
    [":$", "^[A-Z][a-z]"]
  ],
  negative_rules: [
    ['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.$', '^[A-Z][a-z]'],
    ['(Sr|Jr)\.$', '^[A-Z][a-z]'],
    ['\b[A-Z][a-z]*\.$', '^[0-9A-Z]'],
    ['(cf|vs)\.$', '^.'],
    ['e\.g\.$', '^.'],
    ['i\.e\.$', '^.'],
    ['(Sec|Chap|Fig|Eq)\.$', '^[0-9A-Z]']
  ]
}
Class Method Summary collapse
Instance Method Summary collapse
-
#annotate(rb_text) ⇒ Object
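Segments rb_text and returns a Hash with :text (the input string) and :blocks (one { span: { begin:, end: }, obj: "Sentence" } entry per segment).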
- #initialize(rules) ⇒ Object constructor
- #segment(rb_text) ⇒ Object
Constructor Details
#initialize(rules) ⇒ Object
526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 |
# File 'ext/text_sentencer_c/text_sentencer.c', line 526 VALUE text_sentencer_initialize(VALUE self, VALUE rules) { TextSentencer *ts; TypedData_Get_Struct(self, TextSentencer, &text_sentencer_type, ts); // Extract and compile the break pattern from the rules hash VALUE rb_break_pattern = rb_hash_aref(rules, ID2SYM(rb_intern("break_pattern"))); if (!NIL_P(rb_break_pattern)) { ts->break_pattern = compile_pattern(rb_break_pattern); } else { rb_raise(rb_eArgError, "break_pattern is required in rules"); } // Extract and compile the candidate pattern from the rules hash VALUE rb_candidate_pattern = rb_hash_aref(rules, ID2SYM(rb_intern("candidate_pattern"))); if (!NIL_P(rb_candidate_pattern)) { ts->candidate_pattern = compile_pattern(rb_candidate_pattern); } else { rb_raise(rb_eArgError, "candidate_pattern is required in rules"); } // Compile positive rules VALUE rb_positive_rules = rb_hash_aref(rules, ID2SYM(rb_intern("positive_rules"))); if (NIL_P(rb_positive_rules) || TYPE(rb_positive_rules) != T_ARRAY) { rb_raise(rb_eArgError, "positive_rules must be an array"); } ts->num_positive_rules = RARRAY_LEN(rb_positive_rules); ts->positive_rules_pre = malloc(ts->num_positive_rules * sizeof(URegularExpression *)); ts->positive_rules_post = malloc(ts->num_positive_rules * sizeof(URegularExpression *)); if ((ts->positive_rules_pre == NULL) || (ts->positive_rules_post == NULL)) { rb_raise(rb_eNoMemError, "Failed to allocate memory for positive rules"); } for (size_t i = 0; i < ts->num_positive_rules; i++) { VALUE rb_rule = rb_ary_entry(rb_positive_rules, i); VALUE rb_rule_pre = rb_ary_entry(rb_rule, 0); VALUE rb_rule_post = rb_ary_entry(rb_rule, 1); ts->positive_rules_pre[i] = compile_pattern(rb_rule_pre); ts->positive_rules_post[i] = compile_pattern(rb_rule_post); } // Compile negative rules VALUE rb_negative_rules = rb_hash_aref(rules, ID2SYM(rb_intern("negative_rules"))); if (NIL_P(rb_negative_rules) || TYPE(rb_negative_rules) != T_ARRAY) { rb_raise(rb_eArgError, 
"negative_rules must be an array"); } ts->num_negative_rules = RARRAY_LEN(rb_negative_rules); ts->negative_rules_pre = malloc(ts->num_negative_rules * sizeof(URegularExpression *)); ts->negative_rules_post = malloc(ts->num_negative_rules * sizeof(URegularExpression *)); if ((ts->negative_rules_pre == NULL) || (ts->negative_rules_post == NULL)) { rb_raise(rb_eNoMemError, "Failed to allocate memory for negative rules"); } for (size_t i = 0; i < ts->num_negative_rules; i++) { VALUE rb_rule = rb_ary_entry(rb_negative_rules, i); VALUE rb_rule_pre = rb_ary_entry(rb_rule, 0); VALUE rb_rule_post = rb_ary_entry(rb_rule, 1); ts->negative_rules_pre[i] = compile_pattern(rb_rule_pre); ts->negative_rules_post[i] = compile_pattern(rb_rule_post); } return self; } |
Class Method Details
.update_rules(rules) ⇒ Object
23 24 25 |
# File 'lib/text_sentencer_c.rb', line 23

# Builds a rule set from DEFAULT_RULES with the entries of +rules+ laid on top:
# any key present in +rules+ replaces the default for that key.
# DEFAULT_RULES itself is left unmodified; a new Hash is returned.
def self.update_rules(rules)
  combined = DEFAULT_RULES.dup
  combined.update(rules)
end
Instance Method Details
#annotate(rb_text) ⇒ Object
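Segments rb_text and returns a Hash with :text (the input string) and :blocks (one { span: { begin:, end: }, obj: "Sentence" } entry per segment).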
445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 |
# File 'ext/text_sentencer_c/text_sentencer.c', line 445 VALUE text_sentencer_annotate(VALUE self, VALUE rb_text) { Check_Type(rb_text, T_STRING); VALUE segments = text_sentencer_segment(self, rb_text); VALUE blocks = rb_ary_new(); for (long i = 0; i < RARRAY_LEN(segments); i++) { VALUE span = rb_hash_new(); VALUE block = rb_hash_new(); VALUE segment = rb_ary_entry(segments, i); long start = NUM2LONG(rb_ary_entry(segment, 0)); long end = NUM2LONG(rb_ary_entry(segment, 1)); rb_hash_aset(span, ID2SYM(rb_intern("begin")), LONG2NUM(start)); rb_hash_aset(span, ID2SYM(rb_intern("end")), LONG2NUM(end)); rb_hash_aset(block, ID2SYM(rb_intern("span")), span); rb_hash_aset(block, ID2SYM(rb_intern("obj")), rb_str_new_cstr("Sentence")); rb_ary_push(blocks, block); } VALUE result = rb_hash_new(); rb_hash_aset(result, ID2SYM(rb_intern("text")), rb_text); rb_hash_aset(result, ID2SYM(rb_intern("blocks")), blocks); return result; } |
#segment(rb_text) ⇒ Object
416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 |
# File 'ext/text_sentencer_c/text_sentencer.c', line 416 VALUE text_sentencer_segment(VALUE self, VALUE rb_text) { TextSentencer *ts; TypedData_Get_Struct(self, TextSentencer, &text_sentencer_type, ts); Check_Type(rb_text, T_STRING); const char *text = StringValueCStr(rb_text); long *segment_starts = NULL; long *segment_ends = NULL; long num_segments = 0; struct segment_args args = {ts, text, segment_starts, segment_ends, num_segments}; rb_thread_call_without_gvl(segment_without_gvl, &args, RUBY_UBF_IO, NULL); VALUE segments = rb_ary_new(); for (long i = 0; i < args.num_segments; ++i) { VALUE segment = rb_ary_new(); rb_ary_push(segment, LONG2NUM(args.segment_starts[i])); rb_ary_push(segment, LONG2NUM(args.segment_ends[i])); rb_ary_push(segments, segment); } free(args.segment_starts); free(args.segment_ends); return segments; } |