Class: Zillabyte::Command::Data

Inherits:
Base
  • Object
show all
Defined in:
lib/zillabyte/cli/data.rb

Overview

manage custom datasets

Constant Summary collapse

MAX_POLL_SECONDS =
60 * 5
POLL_SLEEP =
1
APPENDS_ROWS_SLICE =
5_000

Constants inherited from Base

Base::META_COLUMNS

Instance Attribute Summary

Attributes inherited from Base

#args, #options

Instance Method Summary collapse

Methods inherited from Base

#api, #initialize, namespace

Methods included from Helpers

#app, #ask, #command, #create_git_remote, #display, #error, #extract_app_from_git_config, #extract_app_in_dir, #format_with_bang, #friendly_dir, #get_flow_ui_link, #get_info, #get_rich_info, #git, #handle_downloading_manifest, #has_git?, #longest, #read_multiline, #truncate_message, #with_tty

Constructor Details

This class inherits a constructor from Zillabyte::Command::Base

Instance Method Details

#appendObject

data:append ID FILE

Adds data to an existing dataset.

--filetype FILETYPE        # Input File format type, defaults to csv
--output_type OUTPUT_TYPE  # Specify an output type i.e. json #HIDDEN



246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
# File 'lib/zillabyte/cli/data.rb', line 246

def append
  # Uploads the rows of a local file to an existing dataset.
  #
  # The dataset id and file path come from options[:id] / options[:file] or,
  # failing that, from the next two positional arguments (in that order).
  # Rows are validated by `sanity_check_file` against the dataset's column
  # types and then sent in slices of APPENDS_ROWS_SLICE rows, each slice being
  # JSON-encoded, passed through `gzip` and Base64-encoded.
  # Finishes by displaying the number of rows the API reported as appended.
  dataset_id = options[:id] || shift_argument
  path = options[:file] || shift_argument
  type = options[:output_type]

  # No explicit --filetype: fall back to the file extension without its dot.
  filetype = options[:filetype] || File.extname(path || "").gsub(".", "")

  error("no id given", type) if dataset_id.nil?
  error("no file given", type) if path.nil?

  dataset = self.api.data.get(dataset_id, options)
  column_types = dataset["columns"].map { |column| { column["index"] => column["type"] } }
  pending_rows = sanity_check_file(path, filetype, { "columns" => column_types }, type)

  appended = 0
  display("uploading content.", false)
  pending_rows.each_slice(APPENDS_ROWS_SLICE) do |batch|
    # TODO: post to direct signed s3 (http://docs.aws.amazon.com/AWSRubySDK/latest/AWS/S3/PresignedPost.html)
    display(".", false)
    require("base64")
    payload = Base64.encode64(gzip(batch.to_json))
    response = self.api.data.append(dataset_id, { :gzip_rows => payload })

    # Stop uploading as soon as a response carries no "size".
    accepted = response["size"]
    break unless accepted
    appended += accepted
  end

  summary =
    if type == "json"
      { :rows => appended }.to_json
    else
      "dataset ##{dataset_id} appended #{appended} rows"
    end
  display summary
end

#authorizeObject

data:authorize [ID] [SCOPE]

changes permission on the dataset

--id ID    # The dataset id
--public   # Makes the dataset public (default)
--private  # Makes the dataset private



183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
# File 'lib/zillabyte/cli/data.rb', line 183

def authorize
  # Changes the access scope of a dataset by POSTing it to
  # /relations/<id>/authorizations, then displays "Authorization updated".
  #
  # The dataset id comes from options[:id] or the next positional argument;
  # the scope from options[:scope], the following positional argument, or
  # "public" when neither is given. options[:private] forces the scope to
  # "private". Supplying both options[:public] and options[:private] is
  # reported through `error`.
  id = options[:id] || shift_argument
  scope = options[:scope] || shift_argument || "public"
  make_public = options[:public]
  make_private = options[:private]
  # Must be defined locally: the `error` calls below pass it along, and
  # referencing an undefined `type` would raise instead of reporting.
  type = options[:output_type]

  error("no id given", type) if id.nil?
  error("both --public and --private cannot be given", type) if make_public && make_private
  scope = "private" if make_private

  self.api.request(
    :expects  => 200,
    :method   => :post,
    :path     => "/relations/#{CGI.escape(id)}/authorizations",
    :body     => {:scope => scope}.to_json
  )

  display "Authorization updated"
end

#createObject

data:create NAME

Creates a new dataset.

--schema SCHEMA            # Column names and types in the format “field_1:output_type_1,field_2:output_type_2,…”
--public SCOPE             # Make the dataset public
--file FILE                # A data file
--filetype FILETYPE        # File format type, defaults to csv
--description DESCRIPTION  # Description of dataset contents
--aliases ALIASES          # Dataset name aliases in the format “alias_1,alias_2,…”
--output_type OUTPUT_TYPE  # Specify an output type i.e. json #HIDDEN



127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
# File 'lib/zillabyte/cli/data.rb', line 127

def create
  # Creates a dataset named by options[:name] or the next positional argument.
  #
  # Dataset properties (schema, visibility, description, aliases) are gathered
  # through `get_dataset_properties` when no output type is set, otherwise
  # through `hash_dataset_properties`. When options[:file] is given, its rows
  # are validated with `sanity_check_file` against the gathered schema and
  # sent along with the create request.
  name = options[:name] || shift_argument
  type = options[:output_type]

  error("no name given", type) if name.nil?

  schema = options[:schema] || nil
  is_public = options[:public] || false
  description = options[:description] || nil
  aliases = options[:aliases] || nil

  properties =
    if type.nil?
      get_dataset_properties(schema, is_public, description, aliases)
    else
      hash_dataset_properties(schema, is_public, description, aliases, type)
    end

  source_file = options[:file] || nil
  if source_file
    # Without an explicit --filetype, use the file extension minus its dot.
    filetype = options[:filetype] || File.extname(source_file).gsub(".", "")
    properties[:rows] = sanity_check_file(source_file, filetype, { "columns" => properties[:schema] }, type)
  end

  result = api.data.create(name, properties)
  if result['error']
    error(result['error_message'].to_s, type)
  elsif type == "json"
    display "{}"
  else
    display "dataset ##{result['id']} #{result['action']}. size: #{result['size'] || 0} rows."
  end
end

#deleteObject

data:delete ID

Deletes a dataset.

-f, --force                # Delete without asking for confirmation
--output_type OUTPUT_TYPE  # Specify an output type i.e. json #HIDDEN



75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# File 'lib/zillabyte/cli/data.rb', line 75

def delete
  # Deletes the dataset identified by options[:id] or the next positional
  # argument.
  #
  # Unless options[:force] is set, the user is prompted until they answer
  # exactly "yes" or "no"; when an output type is set, the missing --force is
  # reported through `error` instead. On confirmation the API is called and
  # either its error, "{}" (json output type) or the response's "body" is shown.
  id = options[:id] || shift_argument
  forced = options[:force]
  type = options[:output_type] || nil
  confirm = nil

  # Validate the id up front, so the user is never asked to confirm (and the
  # API is never asked to delete) a dataset that was not specified.
  error("no id given", type) if id.nil?

  unless forced
    error("specify -f, --force to confirm deletion", type) unless type.nil?

    loop do
      display "This operation cannot be undone. Are you sure you want to delete this dataset? (yes/no):", false
      confirm = ask
      break if confirm == "yes" || confirm == "no"
      display "Please enter 'yes' to delete the dataset or 'no' to exit"
    end
  end

  return unless forced || confirm == "yes"

  res = api.data.delete(id, options)
  if res['error']
    error(res['error'], type)
  elsif type == "json"
    display "{}"
  else
    display res["body"]
  end
end

#indexObject

data

Lists your custom datasets.

--output_type OUTPUT_TYPE  # Specify an output type i.e. json #HIDDEN



17
18
19
# File 'lib/zillabyte/cli/data.rb', line 17

def index
  # Delegates to `list` and returns whatever it returns.
  list
end

#listObject

data

Lists your custom datasets.

--output_type OUTPUT_TYPE  # Specify an output type i.e. json #HIDDEN



29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/zillabyte/cli/data.rb', line 29

def list
  # Fetches GET /relations and displays one table row per dataset with the
  # columns "id", "name" and "rows"; the "rows" value is formatted with
  # TableOutputBuilder.format_row_count.
  #
  # The "datasets" banner and the trailing total are only displayed when no
  # output type is set.
  require("zillabyte/cli/helpers/table_output_builder")
  type = options[:output_type] || nil

  response = api.request(
    :expects  => 200,
    :method   => :get,
    :path     => "/relations"
  )

  headings = ["id", "name", "rows"]
  rows = response.body.map do |row|
    # Build each row in heading order rather than in the response's key order,
    # so every value lands under its own heading. Keys missing from the
    # response are skipped.
    headings.select { |heading| row.key?(heading) }.map do |heading|
      value = row[heading]
      heading == "rows" ? TableOutputBuilder.format_row_count(value) : value
    end
  end

  display "datasets\n" if type.nil? && rows.size > 0
  display TableOutputBuilder.build_table(headings, rows, type)
  display "Total number of datasets: "+rows.length.to_s if type.nil?
end

#pullObject

data:pull ID OUTPUT

Pulls dataset into OUTPUT.gz.

--cycle_id [cycle_id]      # Retrieve data generated during specified cycle if dataset is associated with an app [default: last cycle]
--output_type OUTPUT_TYPE  # Specify an output type i.e. json #HIDDEN



297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
# File 'lib/zillabyte/cli/data.rb', line 297

def pull
  # Downloads a dataset into a local gzip file.
  #
  # The dataset id and output path come from options[:id] / options[:file] or
  # the next two positional arguments (in that order). ".gz" is appended to
  # the output path unless it already has that extension. The response of the
  # pull request is handed to `handle_downloading_manifest` together with the
  # output path.
  type = options[:output_type]
  id = options[:id] || shift_argument
  file = options[:file] || shift_argument

  error("no id given", type) if id.nil?
  error("no file given", type) if file.nil?

  target = File.extname(file) == ".gz" ? file : "#{file}.gz"

  manifest = self.api.data.pull(id, options)
  handle_downloading_manifest(target, manifest, type)

  display(type == "json" ? "{}" : "finished pulling dataset ##{id} to file")
end

#pull_to_s3Object

data:pull:s3 ID S3_PATH

Pulls dataset to s3_bucket/s3_key/part***.gz using the given s3_access and s3_secret credentials. S3_PATH may be given in the following forms:

1) s3://s3_access:s3_secret@s3_bucket/s3_key
2) s3://s3_bucket/s3_key: also supply --s3_access and --s3_secret OR set the environment variables S3_ACCESS and S3_SECRET
3) s3_key: also supply --s3_access, --s3_secret and --s3_bucket OR set the environment variables S3_ACCESS and S3_SECRET and supply --s3_bucket

--cycle_id [cycle_id]        # Retrieve data generated during specified cycle if dataset is associated with an app [default: last cycle]
--s3_access [s3_access_key]  # S3 access key
--s3_secret [s3_secret_key]  # S3 secret key
--s3_bucket [s3_bucket]      # S3 bucket to store data at
--s3_key [s3_file_key]       # S3 key to store data at
--output_type OUTPUT_TYPE    # Specify an output type i.e. json #HIDDEN



345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
# File 'lib/zillabyte/cli/data.rb', line 345

def pull_to_s3
  # Asks the API to export a dataset to S3.
  #
  # The dataset id comes from options[:id] or the next positional argument and
  # the S3 path from options[:s3_path] or the following positional argument.
  # The path is interpreted, in order, as:
  #   1. s3://ACCESS:SECRET@BUCKET/KEY
  #   2. s3://BUCKET/KEY
  #   3. anything else: the whole string is used as the key
  # Credentials not found in the path fall back to options[:s3_access] /
  # options[:s3_secret], then ENV["S3_ACCESS"] / ENV["S3_SECRET"]; a missing
  # bucket falls back to options[:s3_bucket]. Missing values are reported
  # through `error`.
  id = options[:id] || shift_argument
  type = options[:output_type]
  error("no id given", type) if id.nil?

  s3_path = options[:s3_path] || shift_argument
  error("no s3 path given", type) if s3_path.nil?

  s3_access = s3_secret = s3_bucket = s3_key = nil

  # Bucket names may contain hyphens as well as word characters and dots, so
  # the bucket class includes "-". \A/\z anchor the whole string (not a line).
  with_credentials = s3_path.match(/\As3:\/\/([A-Z0-9]{20}):([$-\/:-?{-~!"^_`\[\]\w]{40})@([-\w\.]*)\/([-\w\/]*)\z/)
  bucket_and_key = with_credentials ? nil : s3_path.match(/\As3:\/\/([-\w\.]*)\/([-\w\/]*)\z/)

  if with_credentials
    s3_access, s3_secret, s3_bucket, s3_key = with_credentials.captures
  elsif bucket_and_key
    s3_bucket, s3_key = bucket_and_key.captures
  else
    s3_key = s3_path
  end

  # Fill in whatever the path did not provide (nil or empty capture).
  s3_access = options[:s3_access] || ENV["S3_ACCESS"] if s3_access.nil? || s3_access == ""
  s3_secret = options[:s3_secret] || ENV["S3_SECRET"] if s3_secret.nil? || s3_secret == ""
  s3_bucket = options[:s3_bucket] if s3_bucket.nil? || s3_bucket == ""

  error("No s3 access key or invalid access key provided. Please check that you have entered the access key correctly.", type) if s3_access.nil?
  error("No s3 access secret key or invalid secret key provided. Please check that you have entered the secret key correctly.", type) if s3_secret.nil?
  error("No s3 access bucket or invalid bucket provided. Please check that you have entered the bucket correctly.", type) if s3_bucket.nil?
  error("No s3 file key provided. Please check that you have entered the file key correctly.", type) if s3_key.nil?

  s3_params = {:s3_access_key => s3_access, :s3_secret => s3_secret,
               :s3_bucket => s3_bucket, :s3_file_key => s3_key}
  s3_params[:cycle_id] = options[:cycle_id] if options[:cycle_id]

  res = self.api.data.pull_to_s3(id, s3_params)

  if type == "json"
    display "{}"
  else
    display "downloading dataset to s3://#{res["s3_bucket"]}/#{res["s3_file_key"]}/"
    display "if the dataset is large, this may take a while, please check your s3 account after a few minutes"
  end
end

#readmeObject

data:readme ID FILE

Attaches a README file to a dataset

#HIDDEN



217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
# File 'lib/zillabyte/cli/data.rb', line 217

def readme
  # Attaches a README to a dataset by POSTing the file's name and contents to
  # /relations/<id>/readme, then displays "README updated".
  #
  # The dataset id and file path come from options[:id] / options[:file] or
  # the next two positional arguments (in that order). A missing id, missing
  # path or nonexistent file is reported through `error`.
  id = options[:id] || shift_argument
  file = options[:file] || shift_argument
  # Must be defined locally: the `error` calls below pass it along, and
  # referencing an undefined `type` would raise instead of reporting.
  type = options[:output_type]

  error("no id given", type) if id.nil?
  error("no file given", type) if file.nil?
  # File.exist? rather than File.exists?, which newer Rubies no longer provide.
  error("file doesn't exist", type) unless File.exist?(file)
  content = File.read(file)

  self.api.request(
    :expects  => 200,
    :method   => :post,
    :path     => "/relations/#{CGI.escape(id)}/readme",
    :body     => {:filename => file, :content => content}.to_json
  )

  display "README updated"
end

#showObject

data:show ID

Shows a sample of the dataset. See ‘zillabyte queries’ for more elaborate functionality.

--cycle_id [cycle_id]      # Retrieve data generated during specified cycle if dataset is associated with an app [default: last cycle]
--no_truncation            # Don’t truncate long strings
--meta                     # Show metadata columns (since, confidence, source)
--output_type OUTPUT_TYPE  # Specify an output type i.e. json #HIDDEN



404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
# File 'lib/zillabyte/cli/data.rb', line 404

def show
  # Displays a sample of a dataset.
  #
  # The dataset name comes from options[:name] or the next positional
  # argument. An initial POST starts a job; the job is then polled with GET
  # every POLL_SLEEP seconds for at most MAX_POLL_SECONDS. Once the job is
  # "completed", its "return" payload is rendered as a table: the "id" column
  # is always dropped, META_COLUMNS are dropped unless options[:meta] is set,
  # columns are labelled with their alias when one exists, and values longer
  # than 30 characters are shortened unless options[:no_truncation] is set or
  # the output type is "json".
  #
  # Raises RuntimeError when the job reports an unknown status or completes
  # without a "return" payload.
  name = options[:name] || shift_argument
  type = options[:output_type]
  show_meta = options[:meta] || false
  error("no id given", type) if name.nil?

  # Initial request..
  res = self.api.data.show(name, :post, options)

  if res['job_id']
    options[:job_id] = res['job_id']

    # Poll until the results are ready...
    start = Time.now.utc
    completed = false

    display "Fetching your data, please wait..." if type.nil?

    while Time.now.utc < start + MAX_POLL_SECONDS
      res = self.api.data.show(name, :get, options)

      case res['status']
      when 'completed'
        raise "something is wrong: #{res}" unless res['return']
        res = res['return']
        completed = true
        break
      when 'running'
        sleep(POLL_SLEEP)
      else
        raise "unknown status: #{res}"
      end
    end

    # Without this check a job that never completes would fall through and be
    # reported as an empty dataset.
    error("timed out after #{MAX_POLL_SECONDS} seconds waiting for the dataset sample", type) unless completed
  else
    if res['error']
      error(res['error_message'] || res['error'], type)
    else
      error("remote server error (r256)", type)
    end
  end

  # We only reach here after polling is complete...
  sample = res["rows"]
  if sample && sample.size > 0
    concrete_headings = sample.first.keys
    concrete_headings.delete("id")
    META_COLUMNS.each { |c| concrete_headings.delete(c) } unless show_meta

    # Exactly one heading per column (the first matching alias, else the
    # concrete name), so headings always line up with the row values.
    aliases = res['column_aliases'] || []
    headings = concrete_headings.map do |ch|
      matching = aliases.find { |al| al["concrete_name"] == ch }
      matching ? matching["alias"] : ch
    end

    truncate = !options[:no_truncation] && type != "json"
    rows = sample.map do |obj|
      concrete_headings.map do |heading|
        value = obj[heading]
        if truncate && value.to_s.size > 30
          value.to_s[0..30] + "..."
        else
          value
        end
      end
    end

    require("colorize")
    require("zillabyte/cli/helpers/table_output_builder")
    if type.nil?
      display "Sampled output:"
      display TableOutputBuilder.build_terminal_table(headings, rows)
      display "To download your full dataset, type "+"`zillabyte data:pull [RELATION_NAME] [OUTPUT_PREFIX] [DIRECTORY]`".colorize(:green)+"."
    else
      display TableOutputBuilder.build_table(headings, rows, type)
    end
  else
    if type == "json"
      display "{}"
    else
      display "empty dataset"
    end
  end
end