Class: Gliner::StructuredExtractor

Inherits:
Object
  • Object
show all
Defined in:
lib/gliner/structured_extractor.rb

Instance Method Summary collapse

Constructor Details

#initialize(span_extractor) ⇒ StructuredExtractor

Returns a new instance of StructuredExtractor.



5
6
7
# File 'lib/gliner/structured_extractor.rb', line 5

# Builds an extractor that delegates span selection and formatting to
# +span_extractor+.
#
# @param span_extractor [#choose_best_span, #format_single_span, #format_spans]
#   collaborator stored in +@span_extractor+; these are the three methods
#   called on it by #format_structure_object.
def initialize(span_extractor)
  @span_extractor = span_extractor
end

Instance Method Details

#apply_choice_filters(spans_by_label, parsed_fields) ⇒ Object



9
10
11
12
13
14
15
16
17
18
19
20
21
# File 'lib/gliner/structured_extractor.rb', line 9

# Restricts the spans of every choice-constrained field to its declared choices.
#
# Works on a shallow copy: a new hash is built and each span list is +dup+ed,
# so the +spans_by_label+ hash and its lists are not reassigned here. Fields
# whose +:choices+ is missing, or for which +any?+ is false, are skipped. A
# constrained field with no entry in +spans_by_label+ is filtered from an empty
# list and therefore gains a key in the result.
#
# @param spans_by_label [Hash] span lists keyed by field name
# @param parsed_fields [Enumerable<Hash>] field specs read via +:name+ and +:choices+
# @return [Hash] the copied hash with constrained labels replaced by the
#   result of #filter_spans_by_choices
def apply_choice_filters(spans_by_label, parsed_fields)
  copy = spans_by_label.transform_values(&:dup)

  parsed_fields.each_with_object(copy) do |field, result|
    allowed = field[:choices]
    next unless allowed&.any?

    name = field[:name]
    candidates = result.fetch(name, [])
    result[name] = filter_spans_by_choices(candidates, allowed)
  end
end

#build_structure_instances(parsed_fields, spans_by_label, opts) ⇒ Object



34
35
36
37
38
39
40
41
42
43
44
# File 'lib/gliner/structured_extractor.rb', line 34

# Builds the list of structure instances for a set of parsed fields.
#
# Three outcomes, decided in order:
# * no anchor field (per #anchor_field_for): returns +[{}]+;
# * an anchor field with no spans under its name: returns a one-element array
#   holding the object built by #format_structure_object from all spans;
# * otherwise: the anchor spans are grouped by #build_instance_spans and the
#   result of #format_instances is returned as-is.
#
# @param parsed_fields [Enumerable<Hash>] field specs; the anchor's +:name+ is read
# @param spans_by_label [Hash] span lists keyed by field name
# @param opts [Object] raw options, converted with +FormatOptions.from+
# @return [Array, Object] see the three outcomes above
def build_structure_instances(parsed_fields, spans_by_label, opts)
  # Options are converted before any early return, so conversion errors surface
  # even when there is no anchor field.
  formatting = FormatOptions.from(opts)
  anchor = anchor_field_for(parsed_fields)
  return [{}] unless anchor

  anchor_spans = spans_by_label.fetch(anchor[:name], [])

  if anchor_spans.empty?
    [format_structure_object(parsed_fields, spans_by_label, formatting)]
  else
    grouped = build_instance_spans(anchor_spans, spans_by_label)
    format_instances(parsed_fields, grouped, formatting)
  end
end

#filter_spans_by_choices(spans, choices) ⇒ Object



23
24
25
26
27
28
29
30
31
32
# File 'lib/gliner/structured_extractor.rb', line 23

# Keeps the spans whose text matches one of the allowed choices.
#
# Both the choices and each span's +text+ are passed through #normalize_choice
# before comparison. The filter is lenient: when +spans+ is empty, +choices+ is
# nil or empty, or no span matches at all, the original +spans+ object is
# returned unchanged rather than an empty list.
#
# @param spans [Array<#text>] candidate spans
# @param choices [Array, nil] allowed values
# @return [Array] the matching spans, or +spans+ itself in the fallback cases
def filter_spans_by_choices(spans, choices)
  return spans if spans.empty? || choices.nil? || choices.empty?

  allowed = choices.map { |option| normalize_choice(option) }
  kept = spans.select do |candidate|
    allowed.include?(normalize_choice(candidate.text))
  end

  # Nothing matched: fall back to the unfiltered spans.
  kept.empty? ? spans : kept
end

#format_structure_object(parsed_fields, spans_by_label, opts) ⇒ Object



46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/gliner/structured_extractor.rb', line 46

# Builds one structure object (a Hash keyed by field name) from the given spans.
#
# For each field, the spans stored under its name are looked up (an empty list
# when absent). Fields whose +:dtype+ is +:str+ get a single value: the span
# picked by +choose_best_span+, formatted with +format_single_span+. Every
# other field gets the result of +format_spans+ over all of its spans.
#
# @param parsed_fields [Enumerable<Hash>] field specs read via +:name+ and +:dtype+
# @param spans_by_label [Hash] span lists keyed by field name
# @param opts [Object] formatting options forwarded to the span extractor
# @return [Hash] formatted value per field name, in field order
def format_structure_object(parsed_fields, spans_by_label, opts)
  parsed_fields.each_with_object({}) do |field, result|
    name = field[:name]
    candidates = spans_by_label.fetch(name, [])
    single_valued = field[:dtype] == :str

    result[name] =
      if single_valued
        @span_extractor.format_single_span(@span_extractor.choose_best_span(candidates), opts)
      else
        @span_extractor.format_spans(candidates, opts)
      end
  end
end