Class: RemoteTable

Inherits:
Object
  • Object
show all
Includes:
Enumerable
Defined in:
lib/remote_table.rb,
lib/remote_table/ods.rb,
lib/remote_table/shp.rb,
lib/remote_table/xls.rb,
lib/remote_table/xml.rb,
lib/remote_table/html.rb,
lib/remote_table/xlsx.rb,
lib/remote_table/yaml.rb,
lib/remote_table/version.rb,
lib/remote_table/delimited.rb,
lib/remote_table/plaintext.rb,
lib/remote_table/local_copy.rb,
lib/remote_table/fixed_width.rb,
lib/remote_table/transformer.rb,
lib/remote_table/processed_by_roo.rb,
lib/remote_table/processed_by_nokogiri.rb

Overview

Open Google Docs spreadsheets, local or remote XLSX, XLS, ODS, CSV (comma separated), TSV (tab separated), other delimited, fixed-width files.

Defined Under Namespace

Modules: Delimited, FixedWidth, Html, Ods, Plaintext, ProcessedByNokogiri, ProcessedByRoo, Shp, Xls, Xlsx, Xml, Yaml Classes: Transformer

Constant Summary collapse

# NOTE(review): presumably the encoding rows are converted to — confirm against usage.
EXTERNAL_ENCODING =
'UTF-8'
# NOTE(review): presumably the Iconv form of the target encoding, with transliteration — confirm.
EXTERNAL_ENCODING_ICONV =
'UTF-8//TRANSLIT'
# URL patterns treated as Google Docs spreadsheets (see the constructor).
# Dots are escaped so that only the literal host names match; an unescaped
# "." would also accept e.g. "docs-google-com" anywhere in a URL.
GOOGLE_DOCS_SPREADSHEET =
[
  /docs\.google\.com/i,
  /spreadsheets\.google\.com/i
]
# Values listed for the :compression, :packing and :format settings.
VALID =
{
  :compression => [:gz, :zip, :bz2, :exe],
  :packing => [:tar],
  :format => [:xlsx, :xls, :delimited, :ods, :fixed_width, :html, :xml, :yaml, :csv, :shp],
}
# Default values for the settings of the same names.
DEFAULT =
{
  :streaming => false,
  :warn_on_multiple_downloads => true,
  :headers => :first_row,
  :keep_blank_rows => false,
  :skip => 0,
  :internal_encoding => 'UTF-8',
  :delimiter => ','
}
# Current setting name => names it was previously passed as.
OLD_SETTING_NAMES =
{
  :internal_encoding => [:encoding],
  :transform_settings => [:transform],
  :pre_select => [:select],
  :pre_reject => [:reject],
  :errata_settings => [:errata],
}
VERSION =
"2.1.0"

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(settings) ⇒ RemoteTable #initialize(url, settings) ⇒ RemoteTable

Create a new RemoteTable, which is an Enumerable.

Does not immediately download/parse… it’s lazy-loading.

Examples:

Open an XLSX

RemoteTable.new('http://www.customerreferenceprogram.org/uploads/CRP_RFP_template.xlsx')

Open a CSV inside a ZIP file

RemoteTable.new 'http://www.epa.gov/climatechange/emissions/downloads10/2010-Inventory-Annex-Tables.zip',
                :filename => 'Annex Tables/Annex 3/Table A-93.csv',
                :skip => 1,
                :pre_select => proc { |row| row['Vehicle Age'].strip =~ /^\d+$/ }

Overloads:

  • #initialize(settings) ⇒ RemoteTable

    Parameters:

    • settings (Hash)

      Settings including :url.

  • #initialize(url, settings) ⇒ RemoteTable

    Parameters:

    • url (String)

      The URL to the local or remote file.

    • settings (Hash)

      Settings.



352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
# File 'lib/remote_table.rb', line 352

# Build a new RemoteTable. Nothing is downloaded or parsed here; the
# constructor only records the settings and creates the helper objects.
#
# @overload initialize(settings)
#   @param settings [Hash] settings including :url
# @overload initialize(url, settings)
#   @param url [String] the URL to the local or remote file
#   @param settings [Hash] settings
# @raise [ArgumentError] if headers are given as an Array containing a blank entry
def initialize(*args)
  @download_count_mutex = ::Mutex.new
  @extend_bang_mutex = ::Mutex.new
  @errata_mutex = ::Mutex.new

  @cache = []
  @download_count = 0

  trailing = args.last
  settings = trailing.is_a?(::Hash) ? trailing.symbolize_keys : {}

  # A leading String is the URL; otherwise the URL comes from the settings.
  leading = args.first
  @url = leading.is_a?(::String) ? leading : grab(settings, :url)

  @format = RemoteTable.guess_format grab(settings, :format)
  # Google Docs spreadsheets are always fetched as CSV.
  if GOOGLE_DOCS_SPREADSHEET.any? { |regex| regex =~ url }
    @url = RemoteTable.google_spreadsheet_csv_url url
    @format = :delimited
  end

  @headers = grab settings, :headers
  if headers.is_a?(::Array) && headers.any?(&:blank?)
    raise ::ArgumentError, "[remote_table] If you specify headers, none of them can be blank"
  end

  # Explicit settings win; otherwise guess from the URL.
  @compression = grab(settings, :compression) || RemoteTable.guess_compression(url)
  @packing = grab(settings, :packing) || RemoteTable.guess_packing(url)

  # Each of these settings is stored in the instance variable of the same name.
  [
    :streaming,
    :warn_on_multiple_downloads,
    :delimiter,
    :sheet,
    :keep_blank_rows,
    :form_data,
    :skip,
    :internal_encoding,
    :row_xpath,
    :column_xpath,
    :row_css,
    :column_css,
    :glob,
    :filename,
    :transform_settings,
    :cut,
    :crop,
    :schema,
    :schema_name,
    :pre_select,
    :pre_reject,
    :errata_settings
  ].each do |setting_name|
    instance_variable_set "@#{setting_name}", grab(settings, setting_name)
  end

  @other_options = settings

  @transformer = Transformer.new self
  @local_copy = LocalCopy.new self
end

Instance Attribute Details

#column_cssString (readonly)

The CSS selector used to find columns in HTML or XML.

Returns:

  • (String)


231
232
233
# File 'lib/remote_table.rb', line 231

# The CSS selector used to find columns in HTML or XML, as given in the
# :column_css setting.
#
# @return [String]
def column_css
  @column_css
end

#column_xpathString (readonly)

The XPath used to find columns in HTML or XML.

Returns:

  • (String)


223
224
225
# File 'lib/remote_table.rb', line 223

# The XPath used to find columns in HTML or XML, as given in the
# :column_xpath setting.
#
# @return [String]
def column_xpath
  @column_xpath
end

#compressionSymbol (readonly)

The compression type. Guessed from URL if not provided. :gz, :zip, :bz2, and :exe (treated as :zip) are supported.

Returns:

  • (Symbol)


239
240
241
# File 'lib/remote_table.rb', line 239

# The compression type: the :compression setting if given, otherwise the
# result of guessing from the URL (which may be nil).
#
# @return [Symbol, nil]
def compression
  @compression
end

#cropRange (readonly)

Use a range of rows in a plaintext file.

Examples:

Only take rows 21 through 37

RemoteTable.new("http://www.eia.gov/emeu/cbecs/cbecs2003/detailed_tables_2003/2003set10/2003excel/C17.xls",
                :headers => false,
                :select => proc { |row| CbecsEnergyIntensity::NAICS_CODE_SYNTHESIZER.call(row) },
                :crop => (21..37))

Returns:

  • (Range)


280
281
282
# File 'lib/remote_table.rb', line 280

# The range of rows to use from a plaintext file, as given in the :crop
# setting.
#
# @return [Range]
def crop
  @crop
end

#cutString (readonly)

Pick specific columns out of a plaintext file using an argument to the UNIX [cut utility](en.wikipedia.org/wiki/Cut_%28Unix%29).

Examples:

Pick ALMOST out of ABCDEFGHIJKLMNOPQRSTUVWXYZ

# $ echo ABCDEFGHIJKLMNOPQRSTUVWXYZ | cut -c '1,12,13,15,19,20'
# ALMOST
RemoteTable.new 'file:///atoz.txt', :cut => '1,12,13,15,19,20'

Returns:

  • (String)


269
270
271
# File 'lib/remote_table.rb', line 269

# The column specification, in the form of an argument to the UNIX cut
# utility, used to pick columns out of a plaintext file (:cut setting).
#
# @return [String]
def cut
  @cut
end

#delimiterString (readonly)

The delimiter, a.k.a. column separator. Passed to Ruby CSV as :col_sep. Default is ','.

Returns:

  • (String)


215
216
217
# File 'lib/remote_table.rb', line 215

# The delimiter, a.k.a. column separator (:delimiter setting).
# The DEFAULT table lists ',' for it.
#
# @return [String]
def delimiter
  @delimiter
end

#errata_settingsHash (readonly)

A hash of settings to initialize an Errata instance to be used on every row. Previously passed as :errata.

See the Errata library at github.com/seamusabshere/errata

Returns:



318
319
320
# File 'lib/remote_table.rb', line 318

# Settings used to initialize an Errata instance (:errata_settings setting,
# previously passed as :errata).
#
# @return [Hash]
def errata_settings
  @errata_settings
end

#filenameString (readonly)

The filename, which can be used to pick a file out of an archive.

Examples:

Specify the filename to get out of a ZIP file

RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', :filename => '2008_FE_guide_ALL_rel_dates_-no sales-for DOE-5-1-08.csv'

Returns:

  • (String)


259
260
261
# File 'lib/remote_table.rb', line 259

# The filename used to pick a file out of an archive (:filename setting).
#
# @return [String]
def filename
  @filename
end

#form_dataString (readonly)

Form data to POST in the download request. It should probably be in application/x-www-form-urlencoded.

Returns:

  • (String)


203
204
205
# File 'lib/remote_table.rb', line 203

# Form data to POST in the download request (:form_data setting).
#
# @return [String]
def form_data
  @form_data
end

#formatHash (readonly)

The format of the source file. Can be specified as: :xlsx, :xls, :delimited (aka :csv), :ods, :fixed_width, :html, :xml, :yaml, :shp

Note: treats all docs.google.com and spreadsheets.google.com URLs as :delimited.

Default: guessed from file extension (which is usually the same as the URL, but sometimes not if you pick out a specific file from an archive)

Returns:



235
236
237
# File 'lib/remote_table.rb', line 235

# The format of the source file. The constructor derives it from the :format
# setting via RemoteTable.guess_format and forces :delimited for Google Docs
# spreadsheet URLs.
#
# @return [Symbol, nil]
def format
  @format
end

#globString (readonly)

The glob used to pick a file out of an archive.

Examples:

Pick out the only CSV in a ZIP file

RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', :glob => '/*.csv'

Returns:

  • (String)


251
252
253
# File 'lib/remote_table.rb', line 251

# The glob used to pick a file out of an archive (:glob setting).
#
# @return [String]
def glob
  @glob
end

#headers:first_row, ... (readonly)

Headers specified by the user: :first_row (the default), false, or a list of headers.

Returns:

  • (:first_row, false, Array<String>)


191
192
193
# File 'lib/remote_table.rb', line 191

# Headers specified by the user: :first_row, false, or a list of headers.
# The constructor rejects a list that contains a blank entry.
#
# @return [:first_row, false, Array<String>]
def headers
  @headers
end

#internal_encodingString (readonly)

The original encoding of the source file. Default is UTF-8. Previously passed as :encoding.

Returns:

  • (String)


211
212
213
# File 'lib/remote_table.rb', line 211

# The original encoding of the source file (:internal_encoding setting,
# previously passed as :encoding). The DEFAULT table lists 'UTF-8' for it.
#
# @return [String]
def internal_encoding
  @internal_encoding
end

#keep_blank_rowstrue, false (readonly)

Whether to keep blank rows. Default is false.

Returns:

  • (true, false)


199
200
201
# File 'lib/remote_table.rb', line 199

# Whether to keep blank rows (:keep_blank_rows setting).
# The DEFAULT table lists false for it.
#
# @return [true, false]
def keep_blank_rows
  @keep_blank_rows
end

#other_optionsHash (readonly)

Options passed by the user that may be passed through to the underlying parsing library.

Returns:



331
332
333
# File 'lib/remote_table.rb', line 331

# Options passed by the user that may be passed through to the underlying
# parsing library. The constructor stores the settings hash here after
# reading the settings it knows about.
#
# @return [Hash]
def other_options
  @other_options
end

#packingSymbol (readonly)

The packing type. Guessed from URL if not provided. Only :tar is supported.

Returns:

  • (Symbol)


243
244
245
# File 'lib/remote_table.rb', line 243

# The packing type: the :packing setting if given, otherwise the result of
# guessing from the URL (which may be nil).
#
# @return [Symbol, nil]
def packing
  @packing
end

#pre_rejectProc (readonly)

A proc that decides whether to exclude a row: rows for which it returns true are skipped. Previously passed as :reject.

Returns:

  • (Proc)


307
308
309
# File 'lib/remote_table.rb', line 307

# A proc called with each row during enumeration; rows for which it returns
# a truthy value are skipped (:pre_reject setting, previously passed as
# :reject).
#
# @return [Proc]
def pre_reject
  @pre_reject
end

#pre_selectProc (readonly)

A proc that decides whether to include a row. Previously passed as :select.

Returns:

  • (Proc)


303
304
305
# File 'lib/remote_table.rb', line 303

# A proc called with each row during enumeration; rows for which it returns
# a falsy value are skipped (:pre_select setting, previously passed as
# :select).
#
# @return [Proc]
def pre_select
  @pre_select
end

#row_cssString (readonly)

The CSS selector used to find rows in HTML or XML.

Returns:

  • (String)


227
228
229
# File 'lib/remote_table.rb', line 227

# The CSS selector used to find rows in HTML or XML (:row_css setting).
#
# @return [String]
def row_css
  @row_css
end

#row_xpathString (readonly)

The XPath used to find rows in HTML or XML.

Returns:

  • (String)


219
220
221
# File 'lib/remote_table.rb', line 219

# The XPath used to find rows in HTML or XML (:row_xpath setting).
#
# @return [String]
def row_xpath
  @row_xpath
end

#schemaArray<Array{String,Integer,Hash}> (readonly)

The fixed-width schema, given as a multi-dimensional array.

Examples:

From the tests

RemoteTable.new('http://cloud.github.com/downloads/seamusabshere/remote_table/test2.fixed_width.txt',
                 :format => :fixed_width,
                 :skip => 1,
                 :schema => [[ 'header4', 10, { :type => :string }  ],
                             [  'spacer',  1 ],
                             [  'header5', 10, { :type => :string } ],
                             [  'spacer',  12 ],
                             [  'header6', 10, { :type => :string } ]])

Returns:



295
296
297
# File 'lib/remote_table.rb', line 295

# The fixed-width schema, given as a multi-dimensional array (:schema
# setting).
#
# @return [Array<Array{String,Integer,Hash}>]
def schema
  @schema
end

#schema_nameString, Symbol (readonly)

If you somehow already defined a fixed-width schema (so you can re-use it?), specify it here.

Returns:

  • (String, Symbol)


299
300
301
# File 'lib/remote_table.rb', line 299

# The name of an already-defined fixed-width schema (:schema_name setting).
#
# @return [String, Symbol]
def schema_name
  @schema_name
end

#sheetObject (readonly)

The sheet specified by the user as a number or a string.



195
196
197
# File 'lib/remote_table.rb', line 195

# The sheet specified by the user as a number or a string (:sheet setting).
#
# @return [Object]
def sheet
  @sheet
end

#skipInteger (readonly)

How many rows to skip at the beginning of the file or table. Default is 0.

Returns:

  • (Integer)


207
208
209
# File 'lib/remote_table.rb', line 207

# How many rows to skip at the beginning of the file or table (:skip
# setting). The DEFAULT table lists 0 for it.
#
# @return [Integer]
def skip
  @skip
end

#streamingtrue, false (readonly)

Whether to stream the rows without caching them. Saves memory, but you have to re-download the file every time you enumerate its rows. Defaults to false.

Returns:

  • (true, false)


183
184
185
# File 'lib/remote_table.rb', line 183

# Whether to stream the rows without caching them (:streaming setting).
# The DEFAULT table lists false for it.
#
# @return [true, false]
def streaming
  @streaming
end

#transform_settingsHash (readonly)

Settings to create a transformer.

Returns:



311
312
313
# File 'lib/remote_table.rb', line 311

# Settings to create a transformer (:transform_settings setting, previously
# passed as :transform).
#
# @return [Hash]
def transform_settings
  @transform_settings
end

#urlString (readonly)

The URL of the local or remote file.

Examples:

Local

file:///Users/myuser/Desktop/holidays.csv

Local using an absolute path

/Users/myuser/Desktop/holidays.csv

Remote

http://data.brighterplanet.com/countries.csv

Returns:

  • (String)


160
161
162
# File 'lib/remote_table.rb', line 160

# The URL of the local or remote file. For Google Docs spreadsheet URLs the
# constructor replaces it with the CSV-output form of the URL.
#
# @return [String]
def url
  @url
end

#warn_on_multiple_downloadstrue, false (readonly)

Whether to warn the user on multiple downloads. Defaults to true.

Returns:

  • (true, false)


187
188
189
# File 'lib/remote_table.rb', line 187

# Whether to warn the user on multiple downloads
# (:warn_on_multiple_downloads setting). The DEFAULT table lists true for it.
#
# @return [true, false]
def warn_on_multiple_downloads
  @warn_on_multiple_downloads
end

Class Method Details

.google_spreadsheet_csv_url(url) ⇒ String

Given a Google Docs spreadsheet URL, make sure it uses CSV output.

Returns:

  • (String)


100
101
102
103
104
105
106
107
# File 'lib/remote_table.rb', line 100

# Given a Google Docs spreadsheet URL, make sure it uses CSV output.
#
# Any existing "output=..." query parameter is dropped and "output=csv" is
# appended; all other query parameters are kept in their original order.
#
# @param url [String] the spreadsheet URL
# @return [String] the URL with output=csv in its query string
def google_spreadsheet_csv_url(url)
  uri = ::URI.parse url
  # URI#query is nil when the URL has no query string at all; treat that as
  # an empty parameter list instead of raising NoMethodError.
  params = uri.query.to_s.split('&')
  params.delete_if { |param| param.start_with?('output=') }
  params << 'output=csv'
  uri.query = params.join('&')
  uri.to_s
end

.guess_compression(url) ⇒ Symbol?

Guess compression based on URL. Used internally.

Returns:

  • (Symbol, nil)


49
50
51
52
53
54
55
56
57
58
59
60
61
# File 'lib/remote_table.rb', line 49

# Guess compression based on the file extension of the URL's path.
# Used internally.
#
# @param url [String] the URL to inspect
# @return [Symbol, nil] :gz, :bz2, :zip or :exe; nil when nothing matches
def guess_compression(url)
  extname = ::File.extname(::URI.parse(url).path).downcase
  case extname
  when /gz/, /gunzip/
    :gz
  # Must be tested before /zip/: "bunzip2" contains "zip", so with the zip
  # branch first the bunzip2 pattern could never be reached.
  when /bz2/, /bunzip2/
    :bz2
  when /zip/
    :zip
  when /exe/
    :exe
  end
end

.guess_format(basename) ⇒ Symbol?

Guess file format from the basename. Since a file might be decompressed and/or pulled out of an archive with a glob, this usually can’t be called until a file is downloaded.

Returns:

  • (Symbol, nil)


74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# File 'lib/remote_table.rb', line 74

# Guess file format from a basename or from a format name such as :csv.
#
# The file extension is tried first, so a format token that merely appears
# inside the stem ("methods.csv" contains "ods") no longer decides the
# result. When the extension is missing or unrecognized, the whole name is
# matched as before, which keeps bare format names (:csv, "excel") and names
# like "test2.fixed_width.txt" working.
#
# @param basename [#to_s] a file basename or format name; nil is accepted
# @return [Symbol, nil] the guessed format, or nil when nothing matches
def guess_format(basename)
  name = basename.to_s.downcase

  # Order matters: more specific tokens (xlsx) come before the tokens they
  # contain (xls).
  classify = lambda do |text|
    case text
    when /ods/, /open_?office/
      :ods
    when /xlsx/, /excelx/
      :xlsx
    when /xls/, /excel/
      :xls
    when /csv/, /tsv/, /delimited/
      # note that there is no RemoteTable::Csv class - it's normalized to :delimited
      :delimited
    when /fixed_?width/
      :fixed_width
    when /htm/
      :html
    when /xml/
      :xml
    when /yaml/, /yml/
      :yaml
    when /shp/
      :shp
    end
  end

  extension = ::File.extname(name)
  by_extension = extension.empty? ? nil : classify.call(extension)
  by_extension || classify.call(name)
end

.guess_packing(url) ⇒ Symbol?

Guess packing from URL. Used internally.

Returns:

  • (Symbol, nil)


65
66
67
68
69
70
# File 'lib/remote_table.rb', line 65

# Guess packing from the basename of the URL's path. Used internally.
#
# @param url [String] the URL to inspect
# @return [Symbol, nil] :tar when the basename contains ".tar" or ".tgz", else nil
def guess_packing(url)
  path = ::URI.parse(url).path
  name = ::File.basename(path).downcase
  tarball = ['.tar', '.tgz'].any? { |marker| name.include?(marker) }
  tarball ? :tar : nil
end

Instance Method Details

#[](row_number) ⇒ Hash, Array

Get a row by row number. Zero-based.

Returns:



464
465
466
467
468
469
470
# File 'lib/remote_table.rb', line 464

# Get a row by row number. Zero-based.
#
# Reads straight from the cache when it is complete; otherwise goes through
# #to_a.
#
# @param row_number [Integer] zero-based index of the row
# @return [Hash, Array]
def [](row_number)
  rows = fully_cached? ? cache : to_a
  rows[row_number]
end

#each {|Hash, Array| ... } ⇒ nil Also known as: each_row

Yield each row.

Yields:

  • (Hash, Array)

    A hash or an array depending on whether the RemoteTable has named headers (column names).

Returns:

  • (nil)


415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
# File 'lib/remote_table.rb', line 415

# Yield each row.
#
# When the cache is complete, rows are replayed from it. Otherwise the source
# is enumerated via _each: every raw row is expanded by the transformer,
# stamped with a row hash, passed through errata (if any) and the
# pre_select / pre_reject procs, and then yielded.
#
# Unless streaming is on, the rows of a pass are collected in a local buffer
# and published to the cache only once the pass has finished. If the caller
# stops early (e.g. with +break+), the cache is left untouched, so a later
# full pass cannot end up with the early rows stored twice.
#
# @yield [Hash, Array] a hash or an array depending on whether the
#   RemoteTable has named headers (column names)
# @return [nil]
def each
  extend!
  if fully_cached?
    cache.each { |row| yield row }
  else
    mark_download!
    fresh_rows = []
    _each do |row|
      transformer.transform(row).each do |virtual_row|
        virtual_row.row_hash = ::HashDigest.hexdigest row
        if errata
          next if errata.rejects? virtual_row
          errata.correct! virtual_row
        end
        next if pre_select && !pre_select.call(virtual_row)
        next if pre_reject && pre_reject.call(virtual_row)
        fresh_rows.push virtual_row unless streaming
        yield virtual_row
      end
    end
    unless streaming
      # Only a complete pass reaches this point.
      cache.replace fresh_rows
      fully_cached!
    end
  end
  nil
end

#freenil

Clear the row cache in case it helps your GC.

Returns:

  • (nil)


475
476
477
478
479
480
# File 'lib/remote_table.rb', line 475

# Clear the row cache in case it helps your GC.
#
# Also resets the fully-cached flag and drops the memoized errata, so the
# next enumeration no longer reads from the cache.
#
# @return [nil]
def free
  # Reset the flag before emptying the cache.
  @fully_cached = false
  @errata = nil
  cache.clear
  nil
end

#to_aArray<Hash,Array> Also known as: rows

Returns all rows.

Returns:



450
451
452
453
454
455
456
# File 'lib/remote_table.rb', line 450

# All rows, as a new array.
#
# A complete cache is copied (so callers cannot mutate it); otherwise the
# rows are collected by enumerating the table.
#
# @return [Array<Hash,Array>]
def to_a
  return cache.dup if fully_cached?

  map { |row| row }
end