Class: TextExtractor::Record

Inherits: Object

Inheritance chain: Object → TextExtractor::Record

Defined in: lib/text_extractor/record.rb

Direct Known Subclasses

Filldown

Defined Under Namespace

Classes: FactoryAnalyzer

Instance Attribute Summary

#factory ⇒ Object readonly

Returns the value of attribute factory.
#regexp ⇒ Object readonly

Returns the value of attribute regexp.
#values ⇒ Object readonly

Returns the value of attribute values.

Instance Method Summary

Constructor Details

#initialize(regexp, factory: nil, values: [], fill: [], directives: true, inline: [], extractor_values: {}, strip: nil) ⇒ `Record`

Returns a new instance of Record.

# File 'lib/text_extractor/record.rb', line 7

# Builds a record from a pattern plus the options that shape its extraction.
#
# @param regexp [Regexp] pattern handed to build_regexp together with
#   +directives+ and +strip+; the result is stored as @regexp
# @param factory [Object, nil] stored as-is and also passed to
#   FactoryAnalyzer, whose +to_proc+ result is kept as the constructor
# @param values [Array] objects responding to +id+; indexed by id into
#   @values, and each id also receives a nil entry in @default_values
# @param fill [Object] coerced with Array() and stored as @fill
# @param directives [Boolean] forwarded to build_regexp
# @param inline [Array] names forwarded to initialize_inline_values
# @param extractor_values [Hash] lookup table consulted before a value is
#   created or taken from directive expansion
# @param strip [Symbol, nil] forwarded to build_regexp
def initialize(regexp, factory: nil, values: [], fill: [], directives: true,
               inline: [], extractor_values: {}, strip: nil)
  @factory = factory
  @constructor = FactoryAnalyzer.new(factory).to_proc
  @fill = Array(fill)
  @extractor_values = extractor_values

  # Explicit values go in first; inline values are layered on afterwards.
  @values = values.each_with_object({}) { |val, index| index[val.id] = val }
  initialize_inline_values(inline)

  # Only the explicitly passed values contribute nil defaults.
  @default_values = values.each_with_object({}) { |val, blank| blank[val.id] = nil }

  # Runs last: building the pattern reads @extractor_values and may add
  # further entries to @values while expanding directives.
  @regexp = build_regexp(regexp, directives, strip)
end

Instance Attribute Details

#factory ⇒ `Object` (readonly)

Returns the value of attribute factory.



5
6
7

# File 'lib/text_extractor/record.rb', line 5

# Reader for the +factory+ attribute.
#
# @return [Object] the value currently held in @factory
def factory
  @factory
end

#regexp ⇒ `Object` (readonly)

Returns the value of attribute regexp.



5
6
7

# File 'lib/text_extractor/record.rb', line 5

# Reader for the +regexp+ attribute.
#
# @return [Object] the value currently held in @regexp
def regexp
  @regexp
end

#values ⇒ `Object` (readonly)

Returns the value of attribute values.



5
6
7

# File 'lib/text_extractor/record.rb', line 5

# Reader for the +values+ attribute.
#
# @return [Object] the value currently held in @values
def values
  @values
end

Instance Method Details

#build_extraction(extracted) ⇒ `Object`

# File 'lib/text_extractor/record.rb', line 26

# Turns the hash of extracted values into the record's final result.
#
# @param extracted [Hash] values gathered for one match
# @return [Object] +extracted+ itself when no constructor is set, otherwise
#   whatever the constructor returns when called with it
def build_extraction(extracted)
  @constructor ? @constructor.call(extracted) : extracted
end

#build_regexp(regexp, directives, strip) ⇒ `Object`

# File 'lib/text_extractor/record.rb', line 31

# Prepares a pattern by passing it through three stages, in this order:
# strip_regexp, expand_regexp, ignore_regexp.
#
# @param regexp [Regexp] the pattern as supplied by the caller
# @param directives [Boolean] passed to expand_regexp
# @param strip [Symbol, nil] passed to both strip_regexp and ignore_regexp
# @return [Object] the result of the final ignore_regexp stage
def build_regexp(regexp, directives, strip)
  ignore_regexp(
    expand_regexp(strip_regexp(regexp, strip), directives),
    strip
  )
end

#expand_regexp(regexp, directives) ⇒ `Object`

# File 'lib/text_extractor/record.rb', line 58

# Expands directives in the pattern and registers the values they define.
#
# @param regexp [Regexp] pattern to expand
# @param directives [Boolean] when falsy the pattern is returned untouched
# @return [Object] +regexp+ itself when directives are off, otherwise the
#   result of Directives#expand
def expand_regexp(regexp, directives)
  return regexp unless directives

  expander = Directives.new(regexp)
  result = expander.expand
  # A value registered under the same id in @extractor_values takes
  # precedence over the one the expander produced.
  expander.values.each do |value|
    key = value.id
    values[key] = @extractor_values.fetch(key, value)
  end
  result
end

#extract_fills(fill) ⇒ `Object`



98
99
100

# File 'lib/text_extractor/record.rb', line 98

# Picks out of +fill+ the entries named by this record's @fill list.
#
# @param fill [#values_at] source of carried-over values, queried with the
#   names in @fill
# @return [Hash] each name in @fill mapped to what +fill+ returned for it
def extract_fills(fill)
  names = @fill
  carried = fill.values_at(*names)
  Hash[names.zip(carried)]
end

#extract_values(match) ⇒ `Object`



102
103
104

# File 'lib/text_extractor/record.rb', line 102

# Converts the captures of a match using the record's registered values.
#
# @param match [#[]] object indexed by value id (a MatchData in practice —
#   presumably; confirm against callers)
# @return [Hash] each value id mapped to +convert+ applied to match[id]
def extract_values(match)
  values.each_with_object({}) do |(id, value), converted|
    converted[id] = value.convert(match[id])
  end
end

#extraction(match, fill) ⇒ `Object`

# File 'lib/text_extractor/record.rb', line 19

# Assembles the result for one match.
#
# Entries are layered into a fresh hash so that later sources win:
# @default_values first, then extract_fills(fill), then
# extract_values(match). The merged hash is handed to build_extraction.
#
# @param match [Object] passed to extract_values
# @param fill [Object] passed to extract_fills
# @return [Object] whatever build_extraction returns
def extraction(match, fill)
  record = {}
  record.update(@default_values)
  record.update(extract_fills(fill))
  record.update(extract_values(match))
  build_extraction(record)
end

#ignore_regexp(regexp, strip) ⇒ `Object`

# File 'lib/text_extractor/record.rb', line 71

# Rewrites every line of the pattern with the transformation supplied by
# regexp_line_ignorer for the given strip mode.
#
# @param regexp [Regexp] pattern to rewrite
# @param strip [Symbol, nil] mode passed to regexp_line_ignorer; when falsy
#   the pattern is returned as-is
# @return [Regexp] +regexp+ itself, or a new Regexp with the same options
def ignore_regexp(regexp, strip)
  if strip
    source_lines = regexp.source.split("\n")
    wrapper = regexp_line_ignorer(strip)
    Regexp.new(source_lines.map(&wrapper).join("\n"), regexp.options)
  else
    regexp
  end
end

#initialize_inline_values(inline_values) ⇒ `Object`

# File 'lib/text_extractor/record.rb', line 106

# Registers a value in @values for every inline name.
#
# A value already present in @extractor_values under that name is reused;
# otherwise a new InlineValue is created for it.
#
# @param inline_values [Array] names to register
# @return [Array] +inline_values+, as returned by +each+
def initialize_inline_values(inline_values)
  inline_values.each do |name|
    resolved = @extractor_values.fetch(name) { InlineValue.new(name) }
    @values[name] = resolved
  end
end

#match(string, pos = 0) ⇒ `Object`



86
87
88

# File 'lib/text_extractor/record.rb', line 86

# Delegates to +match+ on the record's compiled pattern.
#
# @param string [String] text to search
# @param pos [Integer] second argument forwarded to @regexp.match
#   (defaults to 0)
# @return [Object] whatever @regexp.match returns (MatchData or nil when
#   @regexp is a Regexp)
def match(string, pos = 0)
  @regexp.match(string, pos)
end

#options ⇒ `Object`



94
95
96

# File 'lib/text_extractor/record.rb', line 94

# Delegates to +options+ on the record's compiled pattern.
#
# @return [Object] whatever @regexp.options returns
def options
  @regexp.options
end

#regexp_line_ignorer(strip) ⇒ `Object`

# File 'lib/text_extractor/record.rb', line 77

# Builds the per-line transformation used by ignore_regexp.
#
# The returned lambda surrounds a line of pattern source with the fragment
# "[ \t\r\f]*" (written with literal backslashes) on the side or sides
# selected by +strip+.
#
# @param strip [Symbol] :left, :right or :both
# @return [Proc] lambda taking one line and returning the padded line
# @raise [RuntimeError] for any other +strip+ value
def regexp_line_ignorer(strip)
  unless %i[left right both].include?(strip)
    raise "Unknown ignore whitespace option: #{strip}"
  end

  blank = '[ \t\r\f]*'
  lead = strip == :right ? '' : blank
  tail = strip == :left ? '' : blank
  ->(line) { "#{lead}#{line}#{tail}" }
end

#regexp_line_stripper(strip) ⇒ `Object`

# File 'lib/text_extractor/record.rb', line 48

# Builds the per-line whitespace trimmer used by strip_regexp.
#
# @param strip [Symbol, nil, false] :left, :right or :both select lstrip,
#   rstrip or strip; nil/false yield a lambda that returns the line unchanged
# @return [Proc] lambda taking one line and returning the trimmed line
# @raise [RuntimeError] for any other +strip+ value
def regexp_line_stripper(strip)
  return ->(line) { line } unless strip

  trimmer = { left: :lstrip, right: :rstrip, both: :strip }[strip]
  raise "Unknown strip option: #{strip}" unless trimmer

  ->(line) { line.public_send(trimmer) }
end

#source ⇒ `Object`



90
91
92

# File 'lib/text_extractor/record.rb', line 90

# Delegates to +source+ on the record's compiled pattern.
#
# @return [Object] whatever @regexp.source returns
def source
  @regexp.source
end

#strip_regexp(regexp, strip) ⇒ `Object`

# File 'lib/text_extractor/record.rb', line 37

# Removes layout scaffolding from a pattern written across several lines.
#
# When the first and the last line of the pattern source are both
# whitespace-only, the first line is dropped, the last line is taken as the
# indentation prefix and removed from the start of every remaining line, and
# each line is then passed through regexp_line_stripper(strip). Patterns
# without that blank first/last frame are rebuilt unchanged.
#
# @param regexp [Regexp] pattern to clean up
# @param strip [Symbol, nil] mode forwarded to regexp_line_stripper
# @return [Regexp] a new Regexp with the same options
def strip_regexp(regexp, strip)
  lines = regexp.source.split("\n")
  prefix = lines.last
  blank = /\A\s*\z/
  # `=~` (rather than match?) keeps this nil-safe for an empty source,
  # where split yields no lines at all.
  if lines.first =~ blank && prefix =~ blank
    lines.shift
    # Remove the indentation only where it leads the line. A global
    # substitution would also delete matching whitespace runs inside the
    # pattern (e.g. "Name:  value" with a two-space prefix) and flatten
    # deeper, intentional indentation.
    lines = lines.map { |s| s.start_with?(prefix) ? s[prefix.length..-1] : s }
    lines = lines.map(&regexp_line_stripper(strip))
  end
  Regexp.new(lines.join("\n"), regexp.options)
end

Class: TextExtractor::Record

Direct Known Subclasses

Defined Under Namespace

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(regexp, factory: nil, values: [], fill: [], directives: true, inline: [], extractor_values: {}, strip: nil) ⇒ Record

Instance Attribute Details

#factory ⇒ Object (readonly)

#regexp ⇒ Object (readonly)

#values ⇒ Object (readonly)

Instance Method Details

#build_extraction(extracted) ⇒ Object

#build_regexp(regexp, directives, strip) ⇒ Object

#expand_regexp(regexp, directives) ⇒ Object

#extract_fills(fill) ⇒ Object

#extract_values(match) ⇒ Object

#extraction(match, fill) ⇒ Object

#ignore_regexp(regexp, strip) ⇒ Object

#initialize_inline_values(inline_values) ⇒ Object

#match(string, pos = 0) ⇒ Object

#options ⇒ Object

#regexp_line_ignorer(strip) ⇒ Object

#regexp_line_stripper(strip) ⇒ Object

#source ⇒ Object

#strip_regexp(regexp, strip) ⇒ Object

#initialize(regexp, factory: nil, values: [], fill: [], directives: true, inline: [], extractor_values: {}, strip: nil) ⇒ `Record`

#factory ⇒ `Object` (readonly)

#regexp ⇒ `Object` (readonly)

#values ⇒ `Object` (readonly)

#build_extraction(extracted) ⇒ `Object`

#build_regexp(regexp, directives, strip) ⇒ `Object`

#expand_regexp(regexp, directives) ⇒ `Object`

#extract_fills(fill) ⇒ `Object`

#extract_values(match) ⇒ `Object`

#extraction(match, fill) ⇒ `Object`

#ignore_regexp(regexp, strip) ⇒ `Object`

#initialize_inline_values(inline_values) ⇒ `Object`

#match(string, pos = 0) ⇒ `Object`

#options ⇒ `Object`

#regexp_line_ignorer(strip) ⇒ `Object`

#regexp_line_stripper(strip) ⇒ `Object`

#source ⇒ `Object`

#strip_regexp(regexp, strip) ⇒ `Object`