Class: TextExtractor::Record

Inherits:
Object
  • Object
show all
Defined in:
lib/text_extractor/record.rb

Direct Known Subclasses

Filldown, Guard, Skip

Defined Under Namespace

Classes: FactoryAnalyzer

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(regexp, factory: nil, values: [], fill: [], directives: true, inline: [], extractor_values: {}, strip: nil, **_kwargs) ⇒ Record

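Builds a record from a regexp and its keyword options. (In the source this method carries a `rubocop:disable Metrics/ParameterLists` directive, which was picked up here as the description.)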



8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# File 'lib/text_extractor/record.rb', line 8

# Sets up a record around +regexp+.
#
# @param regexp [Regexp] pattern for this record; it is run through
#   build_regexp before being stored
# @param factory [Object, nil] stored as-is and also handed to
#   FactoryAnalyzer, whose +to_proc+ result becomes the constructor
# @param values [Enumerable] value definitions; each must respond to +id+
# @param fill [Object] key or keys to copy from the fill hash, coerced
#   with Array()
# @param directives [Object] forwarded to build_regexp
# @param inline [Enumerable] ids registered via initialize_inline_values
# @param extractor_values [Hash] looked up by id before falling back to a
#   record-local value
# @param strip [Symbol, nil, false] forwarded to build_regexp
# @param _kwargs [Hash] any further keyword arguments are accepted and ignored
def initialize(regexp, factory: nil, values: [], fill: [], directives: true,
               inline: [], extractor_values: {}, strip: nil, **_kwargs)
  @factory = factory
  @constructor = FactoryAnalyzer.new(factory).to_proc
  @extractor_values = extractor_values

  # Explicit values keyed by id; inline values are added on top of these.
  @values = values.each_with_object({}) { |val, by_id| by_id[val.id] = val }
  initialize_inline_values(inline)

  # Defaults cover only the explicitly passed values, each starting as nil.
  @default_values = values.each_with_object({}) { |val, blank| blank[val.id] = nil }

  # Runs after @values and @extractor_values are set because expanding
  # directives reads the latter and writes into the former.
  @regexp = build_regexp(regexp, directives, strip)
  @fill = Array(fill)
end

Instance Attribute Details

#factory ⇒ Object (readonly)

Returns the value of attribute factory.



5
6
7
# File 'lib/text_extractor/record.rb', line 5

# Reader for the +factory+ keyword argument exactly as it was given to the
# constructor (nil when none was supplied).
#
# @return [Object, nil]
def factory
  @factory
end

#regexp ⇒ Object (readonly)

Returns the value of attribute regexp.



5
6
7
# File 'lib/text_extractor/record.rb', line 5

# Reader for the pattern produced by build_regexp during construction,
# i.e. the constructor's regexp after stripping, directive expansion and
# whitespace-ignoring have been applied.
#
# @return [Object] presumably a Regexp — confirm against Directives#expand
def regexp
  @regexp
end

#values ⇒ Object (readonly)

Returns the value of attribute values.



5
6
7
# File 'lib/text_extractor/record.rb', line 5

# Reader for the hash of value definitions keyed by id. It starts from the
# constructor's +values+ and is extended by inline values and by values
# discovered while expanding directives.
#
# @return [Hash]
def values
  @values
end

Instance Method Details

#build_extraction(extracted) ⇒ Object



38
39
40
41
# File 'lib/text_extractor/record.rb', line 38

# Turns the merged hash of extracted values into this record's result.
#
# @param extracted [Hash] values gathered for one match
# @return [Object] +extracted+ itself when no constructor is set, otherwise
#   whatever the constructor returns for it
def build_extraction(extracted)
  if @constructor
    @constructor.call(extracted)
  else
    extracted
  end
end

#build_regexp(regexp, directives, strip) ⇒ Object



43
44
45
46
47
# File 'lib/text_extractor/record.rb', line 43

# Runs the three preparation stages on +regexp+, in order: strip_regexp,
# expand_regexp, ignore_regexp.
#
# @param regexp [Regexp] the pattern as written by the caller
# @param directives [Object] passed to expand_regexp
# @param strip [Symbol, nil, false] passed to strip_regexp and ignore_regexp
# @return [Object] the result of ignore_regexp
def build_regexp(regexp, directives, strip)
  pattern = expand_regexp(strip_regexp(regexp, strip), directives)
  ignore_regexp(pattern, strip)
end

#expand_regexp(regexp, directives) ⇒ Object



70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/text_extractor/record.rb', line 70

# Expands directives in +regexp+ and registers every value the expansion
# reports.
#
# @param regexp [Regexp] pattern to expand
# @param directives [Object] when falsy, +regexp+ is returned untouched
# @return [Object] the result of Directives#expand, or +regexp+ itself when
#   directives are disabled
def expand_regexp(regexp, directives)
  return regexp unless directives

  expander = Directives.new(regexp)
  result = expander.expand
  # A value shared through @extractor_values wins over the one the
  # directive produced for the same id.
  expander.values.each do |found|
    values[found.id] = @extractor_values.fetch(found.id, found)
  end
  result
end

#extract_fills(fill) ⇒ Object



110
111
112
# File 'lib/text_extractor/record.rb', line 110

# Picks this record's fill keys out of +fill+.
#
# @param fill [#values_at] source of fill-down values
# @return [Hash] one entry per key in @fill, mapped to what +fill+ holds for
#   that key
def extract_fills(fill)
  picked = fill.values_at(*@fill)
  @fill.each_with_index.each_with_object({}) do |(key, idx), out|
    out[key] = picked[idx]
  end
end

#extract_values(match) ⇒ Object



114
115
116
# File 'lib/text_extractor/record.rb', line 114

# Converts each captured group through its value definition.
#
# @param match [#[]] match result, indexed by value id
# @return [Hash] value id => converted capture, for every id in +values+
def extract_values(match)
  values.keys.each_with_object({}) do |id, converted|
    converted[id] = values[id].convert(match[id])
  end
end

#extraction(match, fill) ⇒ Object

Returns a one-element Array holding the extraction built from the match.

Returns:

  • Array



31
32
33
34
35
36
# File 'lib/text_extractor/record.rb', line 31

# Builds the extraction for one match.
#
# Values are layered so that later sources override earlier ones:
# defaults, then fill-down values, then values captured by the match.
#
# @param match [Object] match result passed to extract_values
# @param fill [Object] fill source passed to extract_fills
# @return [Array] a single-element array wrapping the built extraction
def extraction(match, fill)
  merged = {}
  merged.update(@default_values)
  merged.update(extract_fills(fill))
  merged.update(extract_values(match))
  [build_extraction(merged)]
end

#ignore_regexp(regexp, strip) ⇒ Object



83
84
85
86
87
# File 'lib/text_extractor/record.rb', line 83

# Wraps every line of +regexp+ so it tolerates surrounding whitespace.
#
# @param regexp [Regexp] pattern to rewrite
# @param strip [Symbol, nil, false] which side(s) to pad; falsy leaves the
#   pattern alone
# @return [Regexp] +regexp+ itself when +strip+ is falsy, otherwise a new
#   Regexp with the same options
# @raise [RuntimeError] from regexp_line_ignorer for an unknown +strip+
def ignore_regexp(regexp, strip)
  return regexp unless strip

  source_lines = regexp.source.split("\n")
  pad = regexp_line_ignorer(strip)
  padded = source_lines.map { |line| pad.call(line) }
  Regexp.new(padded.join("\n"), regexp.options)
end

#initialize_inline_values(inline_values) ⇒ Object



118
119
120
121
122
123
# File 'lib/text_extractor/record.rb', line 118

# Registers each inline value id in @values.
#
# An entry already present in @extractor_values is reused; otherwise a new
# InlineValue is created for the id.
#
# @param inline_values [#each] ids of the inline values
# @return [Object] +inline_values+, as returned by +each+
def initialize_inline_values(inline_values)
  shared = @extractor_values
  inline_values.each do |id|
    @values[id] = shared.fetch(id) { InlineValue.new(id) }
  end
end

#match(string, pos = 0) ⇒ Object



98
99
100
# File 'lib/text_extractor/record.rb', line 98

# Matches this record's pattern against +string+.
#
# @param string [String] text to match
# @param pos [Integer] offset at which matching starts (defaults to 0)
# @return [Object] whatever @regexp.match returns — presumably MatchData or
#   nil, assuming @regexp is a Regexp
def match(string, pos = 0)
  @regexp.match(string, pos)
end

#options ⇒ Object



106
107
108
# File 'lib/text_extractor/record.rb', line 106

# Delegates to the built pattern's +options+.
#
# @return [Object] @regexp.options — presumably the Regexp option bits
def options
  @regexp.options
end

#regexp_line_ignorer(strip) ⇒ Object



89
90
91
92
93
94
95
96
# File 'lib/text_extractor/record.rb', line 89

# Returns a lambda that pads one regexp source line with an optional run of
# blanks (space, tab, carriage return, form feed) on the requested side(s).
# Newline is not in the class, so the padding cannot span lines.
#
# @param strip [Symbol] :left, :right or :both
# @return [Proc] lambda taking a line and returning the padded line
# @raise [RuntimeError] for any other +strip+ value
def regexp_line_ignorer(strip)
  blanks = '[ \t\r\f]*'
  lead, trail =
    case strip
    when :left  then [blanks, '']
    when :right then ['', blanks]
    when :both  then [blanks, blanks]
    else raise "Unknown ignore whitespace option: #{strip}"
    end
  ->(line) { "#{lead}#{line}#{trail}" }
end

#regexp_line_stripper(strip) ⇒ Object



60
61
62
63
64
65
66
67
68
# File 'lib/text_extractor/record.rb', line 60

# Returns a lambda that trims whitespace from one regexp source line.
#
# @param strip [Symbol, nil, false] :left, :right or :both; nil and false
#   give a lambda that returns the line unchanged
# @return [Proc] lambda taking a line and returning the trimmed line
# @raise [RuntimeError] for any other +strip+ value
def regexp_line_stripper(strip)
  return ->(line) { line } unless strip

  trimmer = { left: :lstrip, right: :rstrip, both: :strip }[strip]
  raise "Unknown strip option: #{strip}" unless trimmer

  ->(line) { line.public_send(trimmer) }
end

#source ⇒ Object



102
103
104
# File 'lib/text_extractor/record.rb', line 102

# Delegates to the built pattern's +source+.
#
# @return [Object] @regexp.source — presumably the pattern text as a String
def source
  @regexp.source
end

#strip_regexp(regexp, strip) ⇒ Object



49
50
51
52
53
54
55
56
57
58
# File 'lib/text_extractor/record.rb', line 49

# Dedents a multi-line regexp whose first and last source lines are blank.
#
# When the pattern starts with a blank line and ends with a whitespace-only
# line, the blank first line is dropped, the last line's whitespace is taken
# as the indentation prefix and removed from the start of every line, and
# each line is then passed through regexp_line_stripper. Otherwise the
# source is kept as it is.
#
# @param regexp [Regexp] pattern to dedent
# @param strip [Symbol, nil, false] option handed to regexp_line_stripper
# @return [Regexp] a new Regexp with the same options
# @raise [RuntimeError] from regexp_line_stripper for an unknown +strip+
#   (only when dedenting applies)
def strip_regexp(regexp, strip)
  # A negative limit keeps trailing empty fields. Without it, a closing
  # delimiter at column 0 makes the last content line look like the prefix
  # and the pattern's trailing newline is lost.
  lines = regexp.source.split("\n", -1)
  prefix = lines.last
  if lines.first =~ /\A\s*\z/ && prefix =~ /\A\s*\z/
    lines.shift
    # Remove the indentation only where it leads the line; whitespace runs
    # inside the pattern and deeper relative indentation are left intact.
    lines = lines.map do |line|
      line.start_with?(prefix) ? line[prefix.length..-1] : line
    end
    lines = lines.map(&regexp_line_stripper(strip))
  end
  Regexp.new(lines.join("\n"), regexp.options)
end