Class: Remi::DataSource::CsvFile

Inherits:
Remi::DataSource show all
Includes:
Remi::DataSubject::CsvFile
Defined in:
lib/remi/data_subject/csv_file.rb

Instance Attribute Summary collapse

Attributes inherited from Remi::DataSubject

#fields

Instance Method Summary collapse

Methods included from Remi::DataSubject::CsvFile

#field_symbolizer, included

Methods inherited from Remi::DataSource

#df, #extract

Methods included from Testing::DataStub

#empty_stub_df, #stub_boolean, #stub_date, #stub_datetime, #stub_decimal, #stub_df, #stub_float, #stub_integer, #stub_json, #stub_row_array, #stub_string, #stub_values

Methods inherited from Remi::DataSubject

#df, #df=, #enforce_types, #field_symbolizer

Constructor Details

#initialize(*args, **kargs, &block) ⇒ CsvFile

Returns a new instance of CsvFile.



32
33
34
35
# File 'lib/remi/data_subject/csv_file.rb', line 32

# Public: Builds a new CsvFile data source.
#
# args  - Positional arguments; not inspected here, only passed along.
# kargs - Keyword arguments; not inspected here, only passed along.
# block - Optional block; not inspected here, only passed along.
#
# Returns a new instance of CsvFile.
def initialize(*args, **kargs, &block)
  # Bare `super` hands this method's own arguments to the parent initializer.
  super
  # The same arguments then go to init_csv_file, which is defined outside
  # this excerpt (presumably in Remi::DataSubject::CsvFile -- confirm).
  init_csv_file(*args, **kargs, &block)
end

Instance Attribute Details

#csv_options ⇒ Object (readonly)

Returns the value of attribute csv_options.



38
39
40
# File 'lib/remi/data_subject/csv_file.rb', line 38

# Public: Reader for the @csv_options instance variable, the same value that
# to_dataframe passes as the options argument to Daru::DataFrame.from_csv.
#
# Returns whatever @csv_options currently holds (nil if it was never set).
def csv_options
  @csv_options
end

#extractor ⇒ Object

Returns the value of attribute extractor.



37
38
39
# File 'lib/remi/data_subject/csv_file.rb', line 37

# Public: Reader for the @extractor instance variable, the object whose
# #extract method is invoked by extract!.
#
# Returns whatever @extractor currently holds (nil if it was never set).
def extractor
  @extractor
end

Instance Method Details

#extract! ⇒ Object

Public: Called to extract data from the source.

Returns data in a format that can be used to create a dataframe.



43
44
45
# File 'lib/remi/data_subject/csv_file.rb', line 43

# Public: Runs the configured extractor and stores what it produces.
#
# The extractor's result is coerced with Kernel#Array, so nil becomes an
# empty array and a single value becomes a one-element array.
#
# Returns the Array that was stored in @extract.
def extract!
  extracted = @extractor.extract
  @extract = Array(extracted)
end

#first_line ⇒ Object



84
85
86
87
88
89
# File 'lib/remi/data_subject/csv_file.rb', line 84

# Public: The first line of the source file, memoized in @first_line.
#
# The line is read with readline, which splits on "\n". Every "\r" is then
# removed so that a DOS-style (CRLF) file yields a line ending in plain "\n".
#
# Returns the first line as a String.
def first_line
  return @first_line if @first_line

  raw_line = File.open(source_filename) { |io| io.readline }
  @first_line = raw_line.gsub(/\r/, '')
end

#headers ⇒ Object



91
92
93
# File 'lib/remi/data_subject/csv_file.rb', line 91

# Public: Reads the header names of the source CSV file.
#
# The file is opened with source_csv_options and the headers are taken from
# the first row that CSV yields. That row must respond to #headers, so the
# options are expected to enable header parsing. The result is memoized in
# @headers.
#
# Returns the headers reported by the first row.
def headers
  # Splat the options: on Ruby 3+, CSV.open takes its options as keywords
  # only, and a positional Hash raises ArgumentError.
  @headers ||= CSV.open(source_filename, 'r', **source_csv_options) { |csv| csv.first }.headers
end

#source_filename ⇒ Object

Only a single source file is supported for now.



79
80
81
82
# File 'lib/remi/data_subject/csv_file.rb', line 79

# Public: The single file this source reads from.
#
# Only one source file is supported for now. The first extracted entry is
# memoized in @source_filename, but the entry count is re-checked on every
# call.
#
# Returns the first element of extract.
# Raises RuntimeError if extract holds more than one entry.
def source_filename
  if extract.size > 1
    raise "Multiple source files detected"
  end

  return @source_filename if @source_filename

  @source_filename = extract.first
end

#to_dataframe ⇒ Object

Public: Converts extracted data to a dataframe. Currently only supports Daru DataFrames.

Returns a Remi::DataFrame



51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/remi/data_subject/csv_file.rb', line 51

# Public: Loads every extracted file and stacks the results into a single
# dataframe. Only Daru dataframes are produced.
#
# Every file is assumed to have exactly the same structure. Each one is
# logged, run through preprocess, and read with Daru::DataFrame.from_csv
# using @csv_options. When @filename_field is set, a column of that name
# holding the source filename is added before the file is appended.
#
# Returns the result of Remi::DataFrame.create(:daru, ...), which receives
# nil when extract yields no files.
def to_dataframe
  combined = extract.each_with_index.inject(nil) do |stacked, (source, position)|
    path = source.to_s
    @logger.info "Converting #{path} to a dataframe"

    frame = Daru::DataFrame.from_csv(preprocess(path), @csv_options)
    if @filename_field
      frame[@filename_field] = Daru::Vector.new([path] * frame.size, index: frame.index)
    end

    # The first file seeds the result; later files are appended to it.
    position.zero? ? frame : stacked.concat(frame)
  end

  Remi::DataFrame.create(:daru, combined)
end

#valid_headers? ⇒ Boolean

Returns:

  • (Boolean)


95
96
97
# File 'lib/remi/data_subject/csv_file.rb', line 95

# Public: Checks that every configured field has a matching CSV header.
#
# Returns true when no key of fields is absent from headers, false otherwise.
def valid_headers?
  missing = fields.keys - headers
  missing.empty?
end