Module: DarwinCore::Ingester

Included in:
Core, Extension
Defined in:
lib/dwc_archive/ingester.rb

Overview

This module encapsulates the logic for reading a CSV file so that it can be shared by the classes that need it.

Instance Attribute Summary collapse

Instance Method Summary collapse

Instance Attribute Details

#data ⇒ Object (readonly)

Returns the value of attribute data.



5
6
7
# File 'lib/dwc_archive/ingester.rb', line 5

# Reader for the +data+ attribute.
#
# @return [Object] the current value of the @data instance variable
def data
  @data
end

#encoding ⇒ Object (readonly)

Returns the value of attribute encoding.



5
6
7
# File 'lib/dwc_archive/ingester.rb', line 5

# Reader for the +encoding+ attribute.
#
# @return [Object] the current value of the @encoding instance variable
def encoding
  @encoding
end

#fields ⇒ Object (readonly)

Returns the value of attribute fields.



5
6
7
# File 'lib/dwc_archive/ingester.rb', line 5

# Reader for the +fields+ attribute.
#
# @return [Object] the current value of the @fields instance variable
def fields
  @fields
end

#fields_separator ⇒ Object (readonly)

Returns the value of attribute fields_separator.



5
6
7
# File 'lib/dwc_archive/ingester.rb', line 5

# Reader for the +fields_separator+ attribute.
#
# @return [Object] the current value of the @fields_separator instance
#   variable
def fields_separator
  @fields_separator
end

#file_path ⇒ Object (readonly)

Returns the value of attribute file_path.



5
6
7
# File 'lib/dwc_archive/ingester.rb', line 5

# Reader for the +file_path+ attribute.
#
# @return [Object] the current value of the @file_path instance variable
def file_path
  @file_path
end

#ignore_headers ⇒ Object (readonly)

Returns the value of attribute ignore_headers.



5
6
7
# File 'lib/dwc_archive/ingester.rb', line 5

# Reader for the +ignore_headers+ attribute.
#
# @return [Object] the current value of the @ignore_headers instance
#   variable
def ignore_headers
  @ignore_headers
end

#line_separator ⇒ Object (readonly)

Returns the value of attribute line_separator.



5
6
7
# File 'lib/dwc_archive/ingester.rb', line 5

# Reader for the +line_separator+ attribute.
#
# @return [Object] the current value of the @line_separator instance
#   variable
def line_separator
  @line_separator
end

#properties ⇒ Object (readonly)

Returns the value of attribute properties.



5
6
7
# File 'lib/dwc_archive/ingester.rb', line 5

# Reader for the +properties+ attribute.
#
# @return [Object] the current value of the @properties instance variable
def properties
  @properties
end

#quote_character ⇒ Object (readonly)

Returns the value of attribute quote_character.



5
6
7
# File 'lib/dwc_archive/ingester.rb', line 5

# Reader for the +quote_character+ attribute.
#
# @return [Object] the current value of the @quote_character instance
#   variable
def quote_character
  @quote_character
end

#size ⇒ Object (readonly)

Returns the value of attribute size.



5
6
7
# File 'lib/dwc_archive/ingester.rb', line 5

# Reader for the +size+ attribute.
#
# @return [Object] the current value of the @size instance variable
def size
  @size
end

Instance Method Details

#read(batch_size = 10_000) {|[res, errors]| ... } ⇒ Object

Yields:

  • ([res, errors])


12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# File 'lib/dwc_archive/ingester.rb', line 12

# Reads the CSV file at @file_path and collects its rows in batches.
#
# A row with fewer columns than the highest configured field index requires
# is appended to the error list as-is; every other row is handed to
# process_csv_row together with the result and error accumulators.
#
# When a block is given it receives [res, errors] each time the row index
# reaches a non-zero multiple of batch_size (the accumulators are reset
# afterwards), and once more after the last row with whatever is left.
#
# @param batch_size [Integer] row-index interval between progress log
#   messages and intermediate yields
# @yield [[res, errors]] the rows accumulated since the previous yield
# @return [Array(Array, Array)] the final [res, errors] pair; when a block
#   was given it only holds rows collected after the last intermediate yield
def read(batch_size = 10_000)
  DarwinCore.logger_write(@dwc.object_id, "Reading #{name} data")
  res = []
  errors = []
  args = define_csv_args
  # #to_i never returns nil, so no `|| 0` fallback is needed here.
  min_size = @fields.map { |f| f[:index].to_i }.max + 1
  # Block-form File.open closes the handle when reading finishes or when
  # parsing / the caller's block raises. Unlike Kernel#open it also never
  # interprets a path starting with "|" as a command to run.
  File.open(@file_path) do |file|
    CSV.new(file, **args).each_with_index do |r, i|
      next if @ignore_headers && i.zero?

      if r.size < min_size
        errors << r
      else
        process_csv_row(res, errors, r)
      end
      next if i.zero? || i % batch_size != 0

      DarwinCore.logger_write(@dwc.object_id,
                              format("Ingested %s records from %s",
                                     i, name))
      next unless block_given?

      yield [res, errors]
      res = []
      errors = []
    end
  end
  yield [res, errors] if block_given?
  [res, errors]
end