Class: Tapsoob::DataStream::Base
- Inherits: Object
- Defined in: lib/tapsoob/data_stream/base.rb
Constant Summary
- DEFAULT_CHUNKSIZE = 1000
Instance Attribute Summary
Class Method Summary
Instance Method Summary
Constructor Details
#initialize(db, state, opts = {}) ⇒ Base
Returns a new instance of Base.
# File 'lib/tapsoob/data_stream/base.rb', line 12
def initialize(db, state, opts = {})
@db = db
@state = {
:offset => 0,
:avg_chunksize => 0,
:num_chunksize => 0,
:total_chunksize => 0
}.merge(state)
@state[:chunksize] ||= DEFAULT_CHUNKSIZE
@options = opts
@complete = false
end
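Example (an illustrative sketch; the require path, connection URI and table name below are assumptions, not part of this API):

require 'sequel'
require 'tapsoob'   # assumed entry point for the gem

db     = Sequel.connect('sqlite://blog.db')              # hypothetical database
stream = Tapsoob::DataStream::Base.new(db, { :table_name => :posts })

stream.state[:chunksize]   # => 1000 (DEFAULT_CHUNKSIZE, since none was supplied)
stream.state[:offset]      # => 0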
Instance Attribute Details
#db ⇒ Object
Returns the value of attribute db.
# File 'lib/tapsoob/data_stream/base.rb', line 10
def db
@db
end
#options ⇒ Object
Returns the value of attribute options.
# File 'lib/tapsoob/data_stream/base.rb', line 10
def options
@options
end
#state ⇒ Object
Returns the value of attribute state.
# File 'lib/tapsoob/data_stream/base.rb', line 10
def state
@state
end
Class Method Details
.factory(db, state, opts) ⇒ Object
# File 'lib/tapsoob/data_stream/base.rb', line 298
def self.factory(db, state, opts)
if defined?(Sequel::MySQL) && Sequel::MySQL.respond_to?(:convert_invalid_date_time=)
Sequel::MySQL.convert_invalid_date_time = :nil
end
if state.has_key?(:klass)
return eval(state[:klass]).new(db, state, opts)
end
if Tapsoob::Utils.single_integer_primary_key(db, state[:table_name].to_sym)
Tapsoob::DataStream::Keyed.new(db, state, opts)
else
Tapsoob::DataStream::Base.new(db, state, opts)
end
end
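Example (a sketch; the :users table is illustrative, and db is set up as in the constructor example above):

state  = { :table_name => :users, :chunksize => 500 }
stream = Tapsoob::DataStream::Base.factory(db, state, {})
stream.class   # => Tapsoob::DataStream::Keyed when :users has a single integer primary key,
               #    Tapsoob::DataStream::Base otherwise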
.parse_json(json) ⇒ Object
# File 'lib/tapsoob/data_stream/base.rb', line 215
def self.parse_json(json)
hash = JSON.parse(json).symbolize_keys
hash[:state].symbolize_keys! if hash.has_key?(:state)
hash
end
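Example (the payload is illustrative):

payload = '{"state":{"table_name":"users","offset":100},"checksum":12345}'
hash    = Tapsoob::DataStream::Base.parse_json(payload)
hash[:state][:offset]   # => 100 -- the top-level keys and the nested :state keys are symbolized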
Instance Method Details
#complete? ⇒ Boolean
# File 'lib/tapsoob/data_stream/base.rb', line 179
def complete?
if state[:file_initialized]
result = state[:lines_read] >= state[:total_lines]
log.debug "DataStream::Base#complete? (file) lines_read=#{state[:lines_read]} total_lines=#{state[:total_lines]} result=#{result} table=#{table_name}"
result
else
result = state[:offset] >= state[:size]
log.debug "DataStream::Base#complete? (db) offset=#{state[:offset]} size=#{state[:size]} result=#{result} table=#{table_name}"
result
end
end
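In other words, a file-backed stream is complete once every dump line has been read, and a database-backed stream once the offset reaches the table size. A sketch of the database case (stream as built in the constructor example):

stream.state[:size]   = 2500    # normally set by #fetch_rows from table.count
stream.state[:offset] = 1000
stream.complete?                # => false
stream.state[:offset] = 2500
stream.complete?                # => true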
#encode_rows(rows) ⇒ Object
# File 'lib/tapsoob/data_stream/base.rb', line 155
def encode_rows(rows)
Tapsoob::Utils.base64encode(Marshal.dump(rows))
end
#error ⇒ Object
# File 'lib/tapsoob/data_stream/base.rb', line 34
def error
state[:error] || false
end
#error=(val) ⇒ Object
# File 'lib/tapsoob/data_stream/base.rb', line 30
def error=(val)
state[:error] = val
end
#fetch(opts = {}) ⇒ Object
# File 'lib/tapsoob/data_stream/base.rb', line 159
def fetch(opts = {})
opts = (opts.empty? ? { :type => "database", :source => db.uri } : opts)
log.debug "DataStream::Base#fetch state -> #{state.inspect}"
t1 = Time.now
rows = (opts[:type] == "file" ? fetch_file(opts[:source]) : fetch_rows)
encoded_data = encode_rows(rows)
t2 = Time.now
elapsed_time = t2 - t1
if opts[:type] != "file"
state[:offset] += (rows == {} ? 0 : rows[:data].size)
end
[encoded_data, (rows == {} ? 0 : rows[:data].size), elapsed_time]
end
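A typical extraction loop, sketched under the assumption that the stream reads from a database (the default when opts is empty):

until stream.complete?
  encoded, row_count, elapsed = stream.fetch
  # encoded is a base64-encoded Marshal dump of the rows hash; for database
  # sources, fetch has already advanced state[:offset] by row_count
  break if row_count.zero?      # defensive guard for this sketch
end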
#fetch_data_from_database(params) {|rows| ... } ⇒ Object
# File 'lib/tapsoob/data_stream/base.rb', line 193
def fetch_data_from_database(params)
encoded_data = params[:encoded_data]
rows = parse_encoded_data(encoded_data, params[:checksum])
state.merge!(params[:state].merge(:chunksize => state[:chunksize]))
yield rows if block_given?
(rows == {} ? 0 : rows[:data].size)
end
#fetch_data_to_database(params) ⇒ Object
# File 'lib/tapsoob/data_stream/base.rb', line 205
def fetch_data_to_database(params)
encoded_data = params[:encoded_data]
rows = parse_encoded_data(encoded_data, params[:checksum])
log.debug "DataStream::Base#fetch_data_to_database: importing #{rows[:data] ? rows[:data].size : 0} rows for table #{table_name rescue 'unknown'}"
import_rows(rows)
(rows == {} ? 0 : rows[:data].size)
end
#fetch_file(dump_path) ⇒ Object
# File 'lib/tapsoob/data_stream/base.rb', line 90
def fetch_file(dump_path)
file_path = File.join(dump_path, "data", "#{table_name}.json")
unless state[:file_initialized]
state[:file_initialized] = true
state[:lines_read] = 0
state[:total_lines] = File.foreach(file_path).count
end
table_name_val = nil
  header = nil
types_val = nil
data_batch = []
File.open(file_path, 'r') do |file|
state[:lines_read].times { file.gets }
state[:chunksize].times do
break if file.eof?
line = file.gets
next unless line
chunk = JSON.parse(line.strip)
table_name_val ||= chunk["table_name"]
      header ||= chunk["header"]
types_val ||= chunk["types"]
data_batch.concat(chunk["data"]) if chunk["data"]
state[:lines_read] += 1
end
end
data_batch = data_batch.uniq if @options[:"skip-duplicates"]
log.debug "DataStream::Base#fetch_file: read #{data_batch.size} rows from #{state[:lines_read]} lines (total #{state[:total_lines]} lines in file)"
rows = {
:table_name => table_name_val,
    :header => header,
:data => data_batch,
:types => types_val
}
update_chunksize_stats
rows
end
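The dump read here is assumed to be line-delimited JSON at <dump_path>/data/<table_name>.json, each line being a chunk carrying the keys consumed above ("table_name", "header", "types", "data"). An illustrative line and call:

# {"table_name":"users","header":["id","name"],"types":["integer","string"],"data":[[1,"Ada"],[2,"Grace"]]}
rows = stream.fetch_file("/path/to/dump")    # hypothetical dump directory
rows[:data].size                             # rows gathered from up to state[:chunksize] lines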
#fetch_rows ⇒ Object
keep a record of the average chunksize within the first few hundred thousand records, after chunksize goes below 100 or maybe if offset is > 1000
# File 'lib/tapsoob/data_stream/base.rb', line 75
def fetch_rows
state[:size] ||= table.count
ds = table.order(*order_by).limit(state[:chunksize], state[:offset])
log.debug "DataStream::Base#fetch_rows SQL -> #{ds.sql}"
rows = Tapsoob::Utils.format_data(db, ds.all,
:string_columns => string_columns,
:schema => db.schema(table_name),
:table => table_name
)
update_chunksize_stats
rows
end
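For a sense of the paging, a chunksize of 1000 and an offset of 2000 make the dataset built above equivalent to something like the following (:users and :id are illustrative; identifier quoting varies by adapter):

db[:users].order(:id).limit(1000, 2000).sql
# => SELECT * FROM "users" ORDER BY "id" LIMIT 1000 OFFSET 2000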
#import_rows(rows) ⇒ Object
# File 'lib/tapsoob/data_stream/base.rb', line 235
def import_rows(rows)
columns = rows[:header]
data = rows[:data]
if table.columns.size != columns.size
existing_columns = table.columns.map(&:to_s)
additional_columns = columns - existing_columns
additional_columns_idxs = additional_columns.map { |c| columns.index(c) }
additional_columns_idxs.reverse.each do |idx|
columns.delete_at(idx)
rows[:types].delete_at(idx)
end
data.each_index { |didx| additional_columns_idxs.reverse.each { |idx| data[didx].delete_at(idx) } }
end
if rows.has_key?(:types) && rows[:types].include?("blob")
blob_indices = rows[:types].each_index.select { |idx| rows[:types][idx] == "blob" }
data.each_index do |idx|
blob_indices.each do |bi|
data[idx][bi] = Sequel::SQL::Blob.new(Tapsoob::Utils.base64decode(data[idx][bi])) unless data[idx][bi].nil?
end
end
end
if rows.has_key?(:types)
%w(date datetime time).each do |type|
if rows[:types].include?(type)
type_indices = rows[:types].each_index.select { |idx| rows[:types][idx] == type }
data.each_index do |idx|
type_indices.each do |ti|
data[idx][ti] = Sequel.send("string_to_#{type}".to_sym, data[idx][ti]) unless data[idx][ti].nil?
end
end
end
end
end
if @options[:"discard-identity"] && rows[:header].include?("id")
columns = rows[:header] - ["id"]
data = data.map { |d| d[1..-1] }
end
table.import(columns, data, :commit_every => 100)
rescue Exception => ex
case ex.message
when /integer out of range/ then
    raise Tapsoob::InvalidData, <<-ERROR, []
\nDetected integer data that exceeds the maximum allowable size for an integer type.
This generally occurs when importing from SQLite due to the fact that SQLite does
not enforce maximum values on integer types.
    ERROR
  else raise ex
  end
end
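A sketch of the rows hash this method expects (column names, types and values are illustrative); :header, :types and each :data row must line up positionally:

rows = {
  :table_name => :users,
  :header     => ["id", "name", "created_at"],
  :types      => ["integer", "string", "datetime"],
  :data       => [[1, "Ada", "2024-01-01 12:00:00"]]
}
stream.import_rows(rows)   # datetime strings go through Sequel.string_to_datetime before the import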
#increment(row_count) ⇒ Object
# File 'lib/tapsoob/data_stream/base.rb', line 69
def increment(row_count)
state[:offset] += row_count
end
#log ⇒ Object
# File 'lib/tapsoob/data_stream/base.rb', line 25
def log
Tapsoob.log.level = Logger::DEBUG if state[:debug]
Tapsoob.log
end
#max_chunksize_training ⇒ Object
# File 'lib/tapsoob/data_stream/base.rb', line 144
def max_chunksize_training
20
end
#order_by(name = nil) ⇒ Object
# File 'lib/tapsoob/data_stream/base.rb', line 62
def order_by(name=nil)
@order_by ||= begin
name ||= table_name
Tapsoob::Utils.order_by(db, name)
end
end
#parse_encoded_data(encoded_data, checksum) ⇒ Object
# File 'lib/tapsoob/data_stream/base.rb', line 221
def parse_encoded_data(encoded_data, checksum)
raise Tapsoob::CorruptedData.new("Checksum Failed") unless Tapsoob::Utils.valid_data?(encoded_data, checksum)
begin
return Marshal.load(Tapsoob::Utils.base64decode(encoded_data))
rescue Object => e
unless ENV['NO_DUMP_MARSHAL_ERRORS']
puts "Error encountered loading data, wrote the data chunk to dump.#{Process.pid}.dat"
File.open("dump.#{Process.pid}.dat", "w") { |f| f.write(encoded_data) }
end
raise e
end
end
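A round trip with #encode_rows, assuming Tapsoob::Utils.checksum is the helper that pairs with Tapsoob::Utils.valid_data? (that name is an assumption):

encoded  = stream.encode_rows({ :header => ["id"], :data => [[1]] })
checksum = Tapsoob::Utils.checksum(encoded)     # assumed counterpart of Utils.valid_data?
rows     = stream.parse_encoded_data(encoded, checksum)
rows[:data]                                     # => [[1]]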
#string_columns ⇒ Object
# File 'lib/tapsoob/data_stream/base.rb', line 54
def string_columns
@string_columns ||= Tapsoob::Utils.incorrect_blobs(db, table_name)
end
#table ⇒ Object
# File 'lib/tapsoob/data_stream/base.rb', line 58
def table
@table ||= db[table_name_sql]
end
#table_name ⇒ Object
# File 'lib/tapsoob/data_stream/base.rb', line 38
def table_name
state[:table_name].to_sym
end
#table_name_sql ⇒ Object
# File 'lib/tapsoob/data_stream/base.rb', line 42
def table_name_sql
table_name
end
#to_hash ⇒ Object
# File 'lib/tapsoob/data_stream/base.rb', line 46
def to_hash
state.merge(:klass => self.class.to_s)
end
#to_json ⇒ Object
# File 'lib/tapsoob/data_stream/base.rb', line 50
def to_json
JSON.generate(to_hash)
end
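#to_json pairs with .parse_json and .factory to rebuild a stream from its serialized state; a sketch:

json    = stream.to_json    # the state plus :klass => "Tapsoob::DataStream::Base"
resumed = Tapsoob::DataStream::Base.factory(db, Tapsoob::DataStream::Base.parse_json(json), {})
resumed.state[:offset] == stream.state[:offset]    # => true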
#update_chunksize_stats ⇒ Object
# File 'lib/tapsoob/data_stream/base.rb', line 148
def update_chunksize_stats
return if state[:num_chunksize] >= max_chunksize_training
state[:total_chunksize] += state[:chunksize]
state[:num_chunksize] += 1
state[:avg_chunksize] = state[:total_chunksize] / state[:num_chunksize] rescue state[:chunksize]
end
#verify_stream ⇒ Object
# File 'lib/tapsoob/data_stream/base.rb', line 294
def verify_stream
state[:offset] = table.count
end