Class: TextExtractor::Record

Inherits:

Object

Object
TextExtractor::Record

show all

Defined in: lib/text_extractor/record.rb

Direct Known Subclasses

Filldown, Guard, Skip

Defined Under Namespace

Classes: FactoryAnalyzer

Instance Attribute Summary collapse

#factory ⇒ Object readonly

Returns the value of attribute factory.
#regexp ⇒ Object readonly

Returns the value of attribute regexp.
#values ⇒ Object readonly

Returns the value of attribute values.

Instance Method Summary collapse

Constructor Details

#initialize(regexp, factory: nil, values: [], fill: [], directives: true, inline: [], extractor_values: {}, strip: nil, **_kwargs) ⇒ `Record`

rubocop: disable Metrics/ParameterLists

# File 'lib/text_extractor/record.rb', line 8

# Builds a record from a pattern and its extraction options.
#
# The assignments are order-sensitive: @extractor_values and @values must be
# set before initialize_inline_values and build_regexp run, because both of
# them register further entries in the values table.
#
# @param regexp the pattern, run through build_regexp before being stored
# @param factory handed to FactoryAnalyzer to derive the constructor proc
# @param values objects responding to +id+; indexed by that id
# @param fill keys to carry over from fill data (coerced with Array())
# @param directives truthy to expand directives found in the pattern
# @param inline ids to register as inline values
# @param extractor_values id => value table consulted before creating
#   record-local values
# @param strip whitespace mode forwarded to build_regexp
# @param _kwargs any further keyword options; accepted and ignored
def initialize(regexp, factory: nil, values: [], fill: [], directives: true,
               inline: [], extractor_values: {}, strip: nil, **_kwargs)
  @factory = factory
  @constructor = FactoryAnalyzer.new(factory).to_proc
  @extractor_values = extractor_values
  @values = values.each_with_object({}) { |val, index| index[val.id] = val }
  initialize_inline_values(inline)
  @default_values = values.each_with_object({}) { |val, blanks| blanks[val.id] = nil }
  @regexp = build_regexp(regexp, directives, strip)
  @fill = Array(fill)
end

Instance Attribute Details

#factory ⇒ `Object` (readonly)

Returns the value of attribute factory.



5
6
7

# File 'lib/text_extractor/record.rb', line 5

# Reader for the +factory+ option exactly as it was given to the constructor.
def factory
  @factory
end

#regexp ⇒ `Object` (readonly)

Returns the value of attribute regexp.



5
6
7

# File 'lib/text_extractor/record.rb', line 5

# Reader for the pattern that the constructor stored after running the
# original regexp through build_regexp.
def regexp
  @regexp
end

#values ⇒ `Object` (readonly)

Returns the value of attribute values.



5
6
7

# File 'lib/text_extractor/record.rb', line 5

# Reader for the id => value table. The constructor seeds it from the
# +values+ option; initialize_inline_values and expand_regexp add to it.
def values
  @values
end

Instance Method Details

#build_extraction(extracted) ⇒ `Object`

# File 'lib/text_extractor/record.rb', line 38

# Passes the extracted data through the record's constructor when one is set.
#
# @param extracted the merged extraction data
# @return the result of calling @constructor with +extracted+, or
#   +extracted+ itself when @constructor is nil/false
def build_extraction(extracted)
  @constructor ? @constructor.call(extracted) : extracted
end

#build_regexp(regexp, directives, strip) ⇒ `Object`

# File 'lib/text_extractor/record.rb', line 43

# Runs the pattern through the three preparation stages, in this order:
# strip_regexp, then expand_regexp, then ignore_regexp.
#
# @param regexp the pattern to prepare
# @param directives forwarded to expand_regexp
# @param strip forwarded to both strip_regexp and ignore_regexp
# @return the result of the final ignore_regexp stage
def build_regexp(regexp, directives, strip)
  ignore_regexp(expand_regexp(strip_regexp(regexp, strip), directives), strip)
end

#expand_regexp(regexp, directives) ⇒ `Object`

# File 'lib/text_extractor/record.rb', line 70

# Expands directives in the pattern and registers the values they introduce.
#
# When +directives+ is falsy the pattern is returned untouched. Otherwise a
# Directives expander is built for it; the expansion is performed first, and
# only then are the expander's values recorded in the values table, each one
# replaced by the entry of the same id in @extractor_values when present.
#
# @param regexp the pattern to expand
# @param directives truthy to enable expansion
# @return the expanded pattern, or +regexp+ itself when expansion is disabled
def expand_regexp(regexp, directives)
  return regexp unless directives

  expander = Directives.new(regexp)
  result = expander.expand
  expander.values.each do |found|
    values[found.id] = @extractor_values.fetch(found.id, found)
  end
  result
end

#extract_fills(fill) ⇒ `Object`



110
111
112

# File 'lib/text_extractor/record.rb', line 110

# Picks this record's fill keys out of the supplied fill data.
#
# @param fill object responding to +values_at+ (presumably a Hash of
#   carried-over values -- confirm against callers)
# @return [Hash] one entry per key in @fill, paired with whatever
#   +fill.values_at+ yields for it
def extract_fills(fill)
  carried = fill.values_at(*@fill)
  Hash[@fill.zip(carried)]
end

#extract_values(match) ⇒ `Object`



114
115
116

# File 'lib/text_extractor/record.rb', line 114

# Converts the captured text for every registered value.
#
# @param match object indexed by value id (e.g. MatchData with named groups)
# @return [Hash] value id => result of that value's +convert+ applied to
#   +match[id]+
def extract_values(match)
  values.each_with_object({}) do |(id, value), converted|
    converted[id] = value.convert(match[id])
  end
end

#extraction(match, fill) ⇒ `Object`

Returns a one-element Array containing the built extraction.

Returns:

Array

# File 'lib/text_extractor/record.rb', line 31

# Assembles the extraction for one match.
#
# Three layers are merged into a fresh Hash, later layers overriding earlier
# ones: the nil defaults, then the fill values, then the converted captures.
# The merged Hash is handed to build_extraction.
#
# @param match passed to extract_values
# @param fill passed to extract_fills
# @return [Array] a single-element array holding the built extraction
def extraction(match, fill)
  layers = [@default_values, extract_fills(fill), extract_values(match)]
  combined = layers.each_with_object({}) { |layer, acc| acc.update(layer) }
  [build_extraction(combined)]
end

#ignore_regexp(regexp, strip) ⇒ `Object`

# File 'lib/text_extractor/record.rb', line 83

# Rewrites each line of the pattern with the line transformer chosen by
# regexp_line_ignorer for the given +strip+ mode.
#
# @param regexp the pattern to rewrite
# @param strip mode for regexp_line_ignorer; nil/false leaves the pattern as is
# @return [Regexp] a new pattern built from the rewritten lines with the
#   original options, or +regexp+ itself when +strip+ is falsy
def ignore_regexp(regexp, strip)
  if strip
    wrap = regexp_line_ignorer(strip)
    wrapped = regexp.source.split("\n").map { |line| wrap.call(line) }
    Regexp.new(wrapped.join("\n"), regexp.options)
  else
    regexp
  end
end

#initialize_inline_values(inline_values) ⇒ `Object`

# File 'lib/text_extractor/record.rb', line 118

# Registers one value per inline id in the values table.
#
# An id already present in @extractor_values reuses that entry; any other id
# gets a new InlineValue built from the id.
#
# @param inline_values the ids to register
def initialize_inline_values(inline_values)
  inline_values.each do |id|
    @values[id] = @extractor_values.fetch(id) { InlineValue.new(id) }
  end
end

#match(string, pos = 0) ⇒ `Object`



98
99
100

# File 'lib/text_extractor/record.rb', line 98

# Matches the record's pattern against +string+ by delegating to
# +@regexp.match+.
#
# @param string the text to match against
# @param pos forwarded as the second argument to +@regexp.match+
#   (defaults to 0)
# @return whatever +@regexp.match+ returns
def match(string, pos = 0)
  @regexp.match(string, pos)
end

#options ⇒ `Object`



106
107
108

# File 'lib/text_extractor/record.rb', line 106

# Returns the options of the record's pattern (delegates to
# +@regexp.options+).
def options
  @regexp.options
end

#regexp_line_ignorer(strip) ⇒ `Object`

# File 'lib/text_extractor/record.rb', line 89

def regexp_line_ignorer(strip)
  case strip
  when :left  then ->(s) { "\[ \\t\\r\\f]*#{s}" }
  when :right then ->(s) { "#{s}\[ \\t\\r\\f]*" }
  when :both  then ->(s) { "\[ \\t\\r\\f]*#{s}\[ \\t\\r\\f]*" }
  else raise "Unknown ignore whitespace option: #{strip}"
  end
end

#regexp_line_stripper(strip) ⇒ `Object`

# File 'lib/text_extractor/record.rb', line 60

# Builds the per-line transformer used by strip_regexp.
#
# @param strip :left (lstrip), :right (rstrip), :both (strip), or nil/false
#   for a transformer that returns the line unchanged
# @return [Proc] lambda taking one line and returning the trimmed line
# @raise [RuntimeError] for any other +strip+ value
def regexp_line_stripper(strip)
  return ->(text) { text } unless strip

  trimmer = { left: :lstrip, right: :rstrip, both: :strip }[strip]
  raise "Unknown strip option: #{strip}" unless trimmer

  ->(text) { text.public_send(trimmer) }
end

#source ⇒ `Object`



102
103
104

# File 'lib/text_extractor/record.rb', line 102

# Returns the source text of the record's pattern (delegates to
# +@regexp.source+).
def source
  @regexp.source
end

#strip_regexp(regexp, strip) ⇒ `Object`

# File 'lib/text_extractor/record.rb', line 49

# Removes layout indentation from a multi-line pattern.
#
# The pattern is only touched when both the first and the last line of its
# source are blank (whitespace only). In that case the last line is taken as
# the indentation prefix, the blank first line is dropped, the prefix is
# removed from the start of every remaining line, and each line is then
# passed through the transformer chosen by regexp_line_stripper. In every
# other case the lines are rejoined without modification.
#
# The prefix is removed only where it leads a line. Whitespace inside a line
# that happens to equal the prefix, and indentation beyond the prefix, are
# preserved -- they are significant in a non-extended pattern.
#
# @param regexp [Regexp] the pattern as written
# @param strip mode forwarded to regexp_line_stripper
# @return [Regexp] a new pattern with the same options
def strip_regexp(regexp, strip)
  lines = regexp.source.split("\n")
  prefix = lines.last
  if lines.first =~ /\A\s*\z/ && prefix =~ /\A\s*\z/
    lines.shift
    lines = lines.map do |line|
      line.start_with?(prefix) ? line[prefix.length..-1] : line
    end
    lines = lines.map(&regexp_line_stripper(strip))
  end
  Regexp.new(lines.join("\n"), regexp.options)
end

Class: TextExtractor::Record

Direct Known Subclasses

Defined Under Namespace

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(regexp, factory: nil, values: [], fill: [], directives: true, inline: [], extractor_values: {}, strip: nil, **_kwargs) ⇒ Record

Instance Attribute Details

#factory ⇒ Object (readonly)

#regexp ⇒ Object (readonly)

#values ⇒ Object (readonly)

Instance Method Details

#build_extraction(extracted) ⇒ Object

#build_regexp(regexp, directives, strip) ⇒ Object

#expand_regexp(regexp, directives) ⇒ Object

#extract_fills(fill) ⇒ Object

#extract_values(match) ⇒ Object

#extraction(match, fill) ⇒ Object

#ignore_regexp(regexp, strip) ⇒ Object

#initialize_inline_values(inline_values) ⇒ Object

#match(string, pos = 0) ⇒ Object

#options ⇒ Object

#regexp_line_ignorer(strip) ⇒ Object

#regexp_line_stripper(strip) ⇒ Object

#source ⇒ Object

#strip_regexp(regexp, strip) ⇒ Object