Class: TextExtractor::Record

Inherits:
Object
  • Object
show all
Defined in:
lib/text_extractor/record.rb

Direct Known Subclasses

Filldown

Defined Under Namespace

Classes: FactoryAnalyzer

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(regexp, factory: nil, values: [], fill: [], directives: true, inline: [], extractor_values: {}, strip: nil) ⇒ Record

Returns a new instance of Record.



7
8
9
10
11
12
13
14
15
16
17
# File 'lib/text_extractor/record.rb', line 7

# Sets up a record: stores the factory and its constructor proc, indexes the
# supplied values by id, registers inline values, and compiles the regexp.
#
# @param regexp [Regexp] pattern handed to build_regexp
# @param factory [Object, nil] kept as-is and also passed to FactoryAnalyzer
# @param values [Array] objects responding to +id+; indexed by that id
# @param fill [Object] coerced to an array with Array()
# @param directives [Object] forwarded to build_regexp
# @param inline [Array] ids forwarded to initialize_inline_values
# @param extractor_values [Hash] lookup consulted for inline/directive values
# @param strip [Object] forwarded to build_regexp
def initialize(regexp, factory: nil, values: [], fill: [], directives: true,
               inline: [], extractor_values: {}, strip: nil)
  @factory = factory
  @constructor = FactoryAnalyzer.new(factory).to_proc
  @extractor_values = extractor_values
  @values = values.each_with_object({}) { |value, by_id| by_id[value.id] = value }
  # Inline values are registered after the explicit ones and before the
  # regexp is built, because both steps write into @values.
  initialize_inline_values(inline)
  @default_values = values.each_with_object({}) { |value, blanks| blanks[value.id] = nil }
  @regexp = build_regexp(regexp, directives, strip)
  @fill = Array(fill)
end

Instance Attribute Details

#factory ⇒ Object (readonly)

Returns the value of attribute factory.



5
6
7
# File 'lib/text_extractor/record.rb', line 5

# Reader for the +@factory+ instance variable.
#
# @return [Object] whatever is currently stored in +@factory+
def factory
  @factory
end

#regexp ⇒ Object (readonly)

Returns the value of attribute regexp.



5
6
7
# File 'lib/text_extractor/record.rb', line 5

# Reader for the +@regexp+ instance variable.
#
# @return [Object] whatever is currently stored in +@regexp+
def regexp
  @regexp
end

#values ⇒ Object (readonly)

Returns the value of attribute values.



5
6
7
# File 'lib/text_extractor/record.rb', line 5

# Reader for the +@values+ instance variable.
#
# @return [Object] whatever is currently stored in +@values+
def values
  @values
end

Instance Method Details

#build_extraction(extracted) ⇒ Object



26
27
28
29
# File 'lib/text_extractor/record.rb', line 26

# Produces the final result for a set of extracted values.
#
# @param extracted [Object] the extracted data
# @return [Object] +extracted+ itself when +@constructor+ is nil or false,
#   otherwise the result of calling +@constructor+ with +extracted+
def build_extraction(extracted)
  @constructor ? @constructor.call(extracted) : extracted
end

#build_regexp(regexp, directives, strip) ⇒ Object



31
32
33
34
35
# File 'lib/text_extractor/record.rb', line 31

# Runs the regexp through the three preparation steps in order:
# strip_regexp, then expand_regexp, then ignore_regexp.
#
# @param regexp [Regexp] the pattern to prepare
# @param directives [Object] forwarded to expand_regexp
# @param strip [Object] forwarded to strip_regexp and ignore_regexp
# @return [Object] the result of ignore_regexp
def build_regexp(regexp, directives, strip)
  without_indent = strip_regexp(regexp, strip)
  ignore_regexp(expand_regexp(without_indent, directives), strip)
end

#expand_regexp(regexp, directives) ⇒ Object



58
59
60
61
62
63
64
65
66
67
68
69
# File 'lib/text_extractor/record.rb', line 58

# Expands directives in the regexp and registers the values they introduce.
#
# @param regexp [Regexp] the pattern to expand
# @param directives [Object] when nil or false the regexp is returned untouched
# @return [Object] the expanded regexp, or +regexp+ itself when expansion is off
def expand_regexp(regexp, directives)
  return regexp unless directives

  directive_set = Directives.new(regexp)
  # Expansion runs before the values are read, as in the original ordering.
  result = directive_set.expand
  directive_set.values.each do |found|
    # A value registered in @extractor_values under the same id wins.
    values[found.id] = @extractor_values.fetch(found.id, found)
  end
  result
end

#extract_fills(fill) ⇒ Object



98
99
100
# File 'lib/text_extractor/record.rb', line 98

# Picks the entries named in +@fill+ out of +fill+.
#
# @param fill [#values_at] source of fill values, queried with the +@fill+ keys
# @return [Hash] each key from +@fill+ mapped to the value +fill+ yields for it
def extract_fills(fill)
  picked = fill.values_at(*@fill)
  @fill.each_with_index.map { |key, index| [key, picked[index]] }.to_h
end

#extract_values(match) ⇒ Object



102
103
104
# File 'lib/text_extractor/record.rb', line 102

# Converts every registered value from the corresponding capture in +match+.
#
# @param match [#[]] match result indexed by value id
# @return [Hash] each value id mapped to +value.convert(match[id])+
def extract_values(match)
  values.each_with_object({}) do |(id, value), converted|
    converted[id] = value.convert(match[id])
  end
end

#extraction(match, fill) ⇒ Object



19
20
21
22
23
24
# File 'lib/text_extractor/record.rb', line 19

# Assembles the result for one match: defaults first, then fill values,
# then converted match values, each layer overriding the previous one.
#
# @param match [Object] forwarded to extract_values
# @param fill [Object] forwarded to extract_fills
# @return [Object] the result of build_extraction on the merged hash
def extraction(match, fill)
  merged = {}
  layers = [@default_values, extract_fills(fill), extract_values(match)]
  layers.each { |layer| merged.update(layer) }
  build_extraction(merged)
end

#ignore_regexp(regexp, strip) ⇒ Object



71
72
73
74
75
# File 'lib/text_extractor/record.rb', line 71

# Rewrites each line of the regexp source with the transformer returned by
# regexp_line_ignorer and rebuilds the regexp with the same options.
#
# Note: String#split drops trailing empty strings, so a source ending in
# newlines loses those trailing empty lines when it is rejoined.
#
# @param regexp [Regexp] the pattern to rewrite
# @param strip [Object] when nil or false the regexp is returned untouched
# @return [Regexp] the rewritten regexp, or +regexp+ itself
def ignore_regexp(regexp, strip)
  return regexp unless strip

  wrap = regexp_line_ignorer(strip)
  pattern = regexp.source.split("\n").map { |line| wrap.call(line) }.join("\n")
  Regexp.new(pattern, regexp.options)
end

#initialize_inline_values(inline_values) ⇒ Object



106
107
108
109
110
111
# File 'lib/text_extractor/record.rb', line 106

# Registers each inline value id in +@values+.
#
# @param inline_values [#each] ids to register
# @return [Object] +inline_values+ (the receiver of +each+)
def initialize_inline_values(inline_values)
  inline_values.each do |id|
    # Prefer an entry from @extractor_values; otherwise build an InlineValue.
    @values[id] = @extractor_values.fetch(id) { |missing| InlineValue.new(missing) }
  end
end

#match(string, pos = 0) ⇒ Object



86
87
88
# File 'lib/text_extractor/record.rb', line 86

# Delegates to +@regexp.match+.
#
# @param string [Object] passed as the first argument to +@regexp.match+
# @param pos [Object] passed as the second argument; defaults to 0
# @return [Object] whatever +@regexp.match(string, pos)+ returns
def match(string, pos = 0)
  @regexp.match(string, pos)
end

#options ⇒ Object



94
95
96
# File 'lib/text_extractor/record.rb', line 94

# Delegates to +@regexp.options+.
#
# @return [Object] whatever +@regexp.options+ returns
def options
  @regexp.options
end

#regexp_line_ignorer(strip) ⇒ Object



77
78
79
80
81
82
83
84
# File 'lib/text_extractor/record.rb', line 77

# Builds a lambda that adds an optional-blanks pattern to one line of regexp
# source, on the side(s) selected by +strip+.
#
# @param strip [Symbol] one of :left, :right or :both
# @return [Proc] lambda taking a line and returning the decorated line
# @raise [RuntimeError] for any other +strip+ value
def regexp_line_ignorer(strip)
  blanks = "\[ \\t\\r\\f]*"
  sides = {
    left: [blanks, ''],
    right: ['', blanks],
    both: [blanks, blanks]
  }
  before, after = sides.fetch(strip) do
    raise "Unknown ignore whitespace option: #{strip}"
  end
  ->(line) { "#{before}#{line}#{after}" }
end

#regexp_line_stripper(strip) ⇒ Object



48
49
50
51
52
53
54
55
56
# File 'lib/text_extractor/record.rb', line 48

# Builds a lambda that trims whitespace from one line of regexp source.
#
# @param strip [Symbol, nil, false] :left, :right or :both select lstrip,
#   rstrip or strip; nil/false yield an identity lambda
# @return [Proc] lambda taking a line and returning the trimmed line
# @raise [RuntimeError] for any other +strip+ value
def regexp_line_stripper(strip)
  return ->(line) { line } unless strip

  trimmer = { left: :lstrip, right: :rstrip, both: :strip }.fetch(strip) do
    raise "Unknown strip option: #{strip}"
  end
  ->(line) { line.public_send(trimmer) }
end

#source ⇒ Object



90
91
92
# File 'lib/text_extractor/record.rb', line 90

# Delegates to +@regexp.source+.
#
# @return [Object] whatever +@regexp.source+ returns
def source
  @regexp.source
end

#strip_regexp(regexp, strip) ⇒ Object



37
38
39
40
41
42
43
44
45
46
# File 'lib/text_extractor/record.rb', line 37

# Removes the layout indentation from a multi-line regexp.
#
# When the first and last lines of the source are whitespace-only, the first
# line is dropped and the last line is used as the indentation prefix. That
# prefix is removed from the start of every remaining line, and each line is
# then passed through the transformer from regexp_line_stripper. Any other
# source is rebuilt unchanged.
#
# The prefix is removed only where it leads a line. The previous
# implementation used gsub, which also deleted matching whitespace runs
# inside a line (turning "name  value" into "namevalue") and collapsed
# deeper relative indentation.
#
# @param regexp [Regexp] the pattern whose source is cleaned up
# @param strip [Object] forwarded to regexp_line_stripper
# @return [Regexp] a new regexp with the same options
def strip_regexp(regexp, strip)
  lines = regexp.source.split("\n")
  prefix = lines.last
  if lines.first =~ /\A\s*\z/ && prefix =~ /\A\s*\z/
    lines.shift
    stripper = regexp_line_stripper(strip)
    lines = lines.map do |line|
      # Only a leading occurrence counts as indentation.
      unindented = line.start_with?(prefix) ? line[prefix.length..-1] : line
      stripper.call(unindented)
    end
  end
  Regexp.new(lines.join("\n"), regexp.options)
end