Class: Gliner::StructuredExtractor
- Inherits: Object
- Inheritance chain: Object → Gliner::StructuredExtractor
- Defined in:
- lib/gliner/structured_extractor.rb
Instance Method Summary
- #apply_choice_filters(spans_by_label, parsed_fields) ⇒ Object
- #build_structure_instances(parsed_fields, spans_by_label, opts) ⇒ Object
- #filter_spans_by_choices(spans, choices) ⇒ Object
- #format_structure_object(parsed_fields, spans_by_label, opts) ⇒ Object
- #initialize(span_extractor) ⇒ StructuredExtractor (constructor): Returns a new instance of StructuredExtractor.
Constructor Details
#initialize(span_extractor) ⇒ StructuredExtractor
Returns a new instance of StructuredExtractor.
5 6 7 |
# File 'lib/gliner/structured_extractor.rb', line 5
#
# Stores the collaborator that the formatting methods delegate to.
#
# @param span_extractor [Object] kept as-is in @span_extractor;
#   #format_structure_object calls choose_best_span, format_single_span and
#   format_spans on it
# @return [StructuredExtractor] a new instance of StructuredExtractor
def initialize(span_extractor)
  @span_extractor = span_extractor
end
Instance Method Details
#apply_choice_filters(spans_by_label, parsed_fields) ⇒ Object
9 10 11 12 13 14 15 16 17 18 19 20 21 |
# File 'lib/gliner/structured_extractor.rb', line 9
#
# Narrows the spans of every field that declares a non-empty :choices list.
#
# Works on a copy: each value of +spans_by_label+ is duplicated first, so the
# caller's hash and its span collections are left untouched. Fields without
# choices are skipped. A field with choices but no entry in +spans_by_label+
# is filtered from an empty list and still gets a key in the result.
#
# @param spans_by_label [Hash] span collections keyed by field name
# @param parsed_fields [Enumerable<Hash>] field descriptors read via
#   field[:name] and field[:choices]
# @return [Hash] the copied hash with choice-restricted entries replaced by
#   the result of #filter_spans_by_choices
def apply_choice_filters(spans_by_label, parsed_fields)
  constrained_fields = parsed_fields.select { |field| field[:choices]&.any? }

  constrained_fields.each_with_object(spans_by_label.transform_values(&:dup)) do |field, result|
    name = field[:name]
    result[name] = filter_spans_by_choices(result.fetch(name, []), field[:choices])
  end
end
#build_structure_instances(parsed_fields, spans_by_label, opts) ⇒ Object
34 35 36 37 38 39 40 41 42 43 44 |
# File 'lib/gliner/structured_extractor.rb', line 34
#
# Builds the list of structure instances for a set of parsed fields.
#
# Three outcomes, in order:
# * no anchor field can be determined -> a list holding one empty hash
# * the anchor field has no spans     -> a one-element list produced by
#   #format_structure_object over all spans
# * otherwise                         -> whatever format_instances returns
#   for the per-anchor span groups
#
# NOTE(review): FormatOptions, anchor_field_for, build_instance_spans and
# format_instances are defined outside this excerpt; their exact contracts
# should be confirmed there.
#
# @param parsed_fields [Enumerable<Hash>] field descriptors
# @param spans_by_label [Hash] span collections keyed by field name
# @param opts [Object] raw options, converted with FormatOptions.from
# @return [Array] list of formatted structure instances
def build_structure_instances(parsed_fields, spans_by_label, opts)
  # Options are converted before the anchor lookup, as in the original order.
  formatting = FormatOptions.from(opts)
  anchor = anchor_field_for(parsed_fields)
  return [{}] unless anchor

  anchor_spans = spans_by_label.fetch(anchor[:name], [])

  if anchor_spans.empty?
    [format_structure_object(parsed_fields, spans_by_label, formatting)]
  else
    grouped_spans = build_instance_spans(anchor_spans, spans_by_label)
    format_instances(parsed_fields, grouped_spans, formatting)
  end
end
#filter_spans_by_choices(spans, choices) ⇒ Object
23 24 25 26 27 28 29 30 31 32 |
# File 'lib/gliner/structured_extractor.rb', line 23
#
# Keeps only the spans whose text matches one of the allowed choices.
#
# Both the choices and each span's text go through normalize_choice before
# comparison (NOTE(review): normalize_choice is defined outside this excerpt).
# The original +spans+ object is returned unchanged when there is nothing to
# filter (no spans, or nil/empty choices) and also when no span matches, so a
# choice list never empties a non-empty result.
#
# @param spans [Array<#text>] candidate spans
# @param choices [Array, nil] allowed values
# @return [Array] matching spans, or +spans+ itself as the fallback
def filter_spans_by_choices(spans, choices)
  nothing_to_filter = spans.empty? || choices.nil? || choices.empty?
  return spans if nothing_to_filter

  allowed = choices.map { |choice| normalize_choice(choice) }
  hits = spans.select { |span| allowed.include?(normalize_choice(span.text)) }

  hits.empty? ? spans : hits
end
#format_structure_object(parsed_fields, spans_by_label, opts) ⇒ Object
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
# File 'lib/gliner/structured_extractor.rb', line 46
#
# Formats one structure object: a hash with one entry per parsed field.
#
# For a field whose :dtype is :str, the span extractor picks a best span via
# choose_best_span and formats it with format_single_span. For any other
# dtype, all spans for the field go to format_spans. Fields with no entry in
# +spans_by_label+ are processed with an empty list.
#
# @param parsed_fields [Enumerable<Hash>] field descriptors read via
#   field[:name] and field[:dtype]
# @param spans_by_label [Hash] span collections keyed by field name
# @param opts [Object] formatting options passed through to the span extractor
# @return [Hash] formatted values keyed by field name, in field order
def format_structure_object(parsed_fields, spans_by_label, opts)
  parsed_fields.each_with_object({}) do |field, result|
    name = field[:name]
    candidates = spans_by_label.fetch(name, [])

    result[name] =
      if field[:dtype] == :str
        top_span = @span_extractor.choose_best_span(candidates)
        @span_extractor.format_single_span(top_span, opts)
      else
        @span_extractor.format_spans(candidates, opts)
      end
  end
end