Class: Dataflow::Nodes::Export::ToCsvNode

Inherits:
ComputeNode
  • Object
Defined in:
lib/dataflow/nodes/export/to_csv_node.rb

Overview

Export a dataset to CSV. The node reads the records of its first dependency, optionally filtered by a JSON query, and writes them in parallel batches to a single CSV file.
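
Examples:

A minimal usage sketch (hypothetical, not from the gem's documentation). It assumes the query and keys fields used by #compute_impl below, Mongoid-style .create/.find_by calls, and a pre-existing data node named 'users'; adapt names and lookups to your setup.

require 'dataflow-rb'

# The data node whose records will be exported (assumed to already exist).
data = Dataflow::Nodes::DataNode.find_by(name: 'users')

# The export node reads from its first dependency and writes the CSV.
export = Dataflow::Nodes::Export::ToCsvNode.create(
  name: 'users_to_csv',
  dependency_ids: [data.id],     # first dependency is the exported dataset
  query: '{}',                   # JSON filter applied to the dependency
  keys: %w(id email created_at)  # optional: export only these columns
)

export.recompute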

Constant Summary

Constants included from SchemaMixin

SchemaMixin::SAMPLE_DATA_OUTPUT, SchemaMixin::SEPARATOR

Instance Method Summary

Methods inherited from ComputeNode

#all_dependencies, #compute, #data_node, data_node_opts, #dependencies, dependency_opts, ensure_data_node_exists, ensure_dependencies, #explain_update, #force_computing_lock_release!, #locked_for_computing?, #needs_automatic_recomputing?, #recompute, #schema, #set_defaults, #updated?, #updated_at, #updated_at=, #valid_for_computation?

Methods included from SchemaMixin

#infer_partial_schema, #infer_schema, #sample_data, #schema_inferrer

Methods included from Dataflow::Node

find, #recompute, #required_by, #valid_for_computation?, #validate!

Instance Method Details

#compute_impl ⇒ Object



# File 'lib/dataflow/nodes/export/to_csv_node.rb', line 15

def compute_impl
  node = dependencies.first
  where = JSON.parse(query)

  # fetch the schema
  sch = if keys.present?
          keys.map { |k| [k, { type: 'string' }] }.to_h
        else
          node.infer_partial_schema(where: where, extended: true)
        end

  # create the dataset
  csv_adapter = Adapters::CsvAdapter.new(data_node: node)
  csv_adapter.set_schema(sch)
  csv_adapter.recreate_dataset

  # export in parallel
  max_per_process = 1000
  max_per_process = limit_per_process if limit_per_process > 0

  data_count = [node.count(where: where), 1].max
  equal_split_per_process = (data_count / Parallel.processor_count.to_f).ceil
  count_per_process = [max_per_process, equal_split_per_process].min

  queries = node.ordered_system_id_queries(batch_size: count_per_process)

  parallel_each(queries.each_with_index) do |query, _idx|
    # TODO: re-enable the on_export_progressed event
    # progress = (idx / queries.count.to_f * 100).ceil
    # on_export_progressed(pct_complete: progress)
    batch = node.all(where: query.merge(where), fields: sch.keys)
    csv_adapter.save(records: batch)
  end

  # needed by the CSV adapter to finalize the export into a single file
  csv_adapter.on_save_finished
end
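
The batch size above balances parallelism against memory: each worker handles at most max_per_process records per query, and [node.count(where: where), 1].max keeps the count positive even when nothing matches. A worked instance of the arithmetic, with illustrative numbers (assuming 4 processors):

data_count      = 10_000
max_per_process = 1_000

equal_split_per_process = (data_count / 4.to_f).ceil                # => 2500
count_per_process = [max_per_process, equal_split_per_process].min  # => 1000

So each of the id-ordered queries covers at most 1,000 records, while smaller datasets still split roughly evenly across cores.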