Module: ActiveSanitization

Defined in:
lib/tasks/rake_tasks.rb,
lib/active_sanitization.rb,
lib/active_sanitization/version.rb

Defined Under Namespace

Classes: Configuration, RakeTasks, TempDatabaseConnection

Constant Summary collapse

VERSION =
"0.2.0"

Class Attribute Summary collapse

Class Method Summary collapse

Class Attribute Details

.configuration ⇒ Object

Returns the value of attribute configuration.



9
10
11
# File 'lib/active_sanitization.rb', line 9

# Reader for the module-level configuration object.
#
# @return [Object, nil] whatever is currently stored in @configuration; nil
#   until something assigns it (presumably a Configuration instance created
#   by .configure — confirm against the attribute declaration in the library)
def configuration
  @configuration
end

Class Method Details

.clean_up_files(dump_file, compressed_dump_file) ⇒ Object



200
201
202
203
204
205
# File 'lib/active_sanitization.rb', line 200

# Removes the plain and the gzipped dump file from disk.
#
# A "Deleting ..." line is logged for each path whether or not the file is
# present; only files that actually exist are deleted.
#
# @param dump_file [String] path of the uncompressed dump
# @param compressed_dump_file [String] path of the gzipped dump
# @return [Integer, nil] result of the delete attempt for
#   +compressed_dump_file+ (nil when that file did not exist)
def self.clean_up_files(dump_file, compressed_dump_file)
  outcomes = [dump_file, compressed_dump_file].map do |path|
    self.log("Deleting #{path}")
    File.delete(path) if File.exist?(path)
  end
  outcomes.last
end

.clean_up_temp_db(temp_db) ⇒ Object



164
165
166
167
# File 'lib/active_sanitization.rb', line 164

# Drops the temporary database through the configured ActiveRecord connection.
#
# NOTE(review): +temp_db+ is interpolated into the statement unquoted, so it
# must be a trusted identifier.
#
# @param temp_db [String] name of the database to drop
# @return [Object] whatever the connection's +execute+ returns
def self.clean_up_temp_db(temp_db)
  self.log("Dropping #{temp_db}")

  drop_statement = "DROP DATABASE #{temp_db};"
  self.configuration.active_record_connection.execute(drop_statement)
end

.configure {|configuration| ... } ⇒ Object

Yields:



12
13
14
15
# File 'lib/active_sanitization.rb', line 12

# Yields the module configuration so the caller can set options on it.
# A Configuration is created on first use and reused on later calls.
#
# @yield [configuration] the current configuration object
# @return [Object] the value of the block
def self.configure
  self.configuration = Configuration.new unless self.configuration
  yield self.configuration
end

.create_files ⇒ Object



142
143
144
145
146
147
148
# File 'lib/active_sanitization.rb', line 142

# Creates (or truncates) the dump file and its gzipped companion under
# "<configuration.root>/tmp" and returns both paths.
#
# The files are written with File.write so no file handle stays open; the
# previous File.new calls left two descriptors open until garbage collection.
#
# @return [Array<String>] [dump_file, compressed_dump_file]
def self.create_files
  dump_file = File.join(self.configuration.root, "tmp", "data.dump")
  compressed_dump_file = "#{dump_file}.gz"

  # Writing an empty string creates the file when missing and empties it
  # otherwise, which matches what opening with "w+" did.
  [dump_file, compressed_dump_file].each { |path| File.write(path, "") }

  [dump_file, compressed_dump_file]
end

.duplicate_database ⇒ Object



95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# File 'lib/active_sanitization.rb', line 95

# Copies the configured database into "<database>_copy" and opens a
# connection to the copy.
#
# Steps:
# 1. drop any left-over copy and create a fresh, empty one;
# 2. pipe a full mysqldump of the tables_to_sanitize tables into the copy;
# 3. pipe a schema-only (--no-data) mysqldump of the tables_to_truncate
#    tables into the copy;
# 4. connect TempDatabaseConnection to the copy.
#
# The database password is never written to the log or to an exception
# message: logged commands use a masked password and the error names only
# the source database.
#
# NOTE(review): host, username, password and table names are interpolated
# into a shell command unescaped; they are assumed to come from trusted
# configuration.
#
# @return [Array] [temp_db name, temp DB connection, temp DB config hash]
# @raise [RuntimeError] when one of the mysqldump | mysql pipelines fails
def self.duplicate_database
  db_config = self.configuration.db_config
  connection = self.configuration.active_record_connection
  source_db = db_config['database']
  temp_db = "#{source_db}_copy"

  self.log("Deleting temp DB if exists")
  connection.execute("DROP DATABASE IF EXISTS #{temp_db};")
  self.log("Creating temp DB")
  connection.execute("CREATE DATABASE #{temp_db}")
  self.log("Copying #{self.configuration.env} DB to temp DB")

  # Builds the "mysqldump ... | mysql ..." pipeline for the given password so
  # the very same text can be logged (masked) and executed (real).
  pipeline = lambda do |password, dump_options, tables|
    login = "-h #{db_config['host']} -u #{db_config['username']} --password=#{password}"
    dump_args = (dump_options + [source_db] + tables).join(' ')
    "mysqldump #{login} #{dump_args} | mysql #{login} -D #{temp_db}"
  end

  copy_tables = lambda do |tables, dump_options|
    # With no table names mysqldump dumps *every* table of the database, which
    # would pull unlisted tables into the copy (and, for the schema-only pass,
    # presumably recreate the tables that were just populated — mysqldump
    # emits DROP TABLE statements by default). Skip the pass instead.
    next if tables.empty?

    self.log(pipeline.call('XXXXXXXXX', dump_options, tables))
    unless system(pipeline.call(db_config['password'], dump_options, tables))
      raise "Failed to load DB #{source_db} into temp DB #{temp_db}."
    end
    self.log("Temp DB created and populated")
  end

  copy_tables.call(self.configuration.tables_to_sanitize.keys, [])
  copy_tables.call(self.configuration.tables_to_truncate.keys, ['--no-data'])

  # Work on a copy so the caller's db_config keeps pointing at the source DB.
  temp_db_config = db_config.dup
  temp_db_config['database'] = temp_db
  TempDatabaseConnection.establish_connection(temp_db_config)
  temp_db_connection = TempDatabaseConnection.connection

  [temp_db, temp_db_connection, temp_db_config]
end

.export_temp_db_to_file(dump_file, temp_db_config, temp_db) ⇒ Object



207
208
209
210
211
212
213
214
215
216
# File 'lib/active_sanitization.rb', line 207

# Appends a mysqldump of +temp_db+ to +dump_file+.
#
# Failure is only logged, not raised.
#
# @param dump_file [String] path the dump is appended to (shell ">>")
# @param temp_db_config [Hash] connection settings; reads the 'host',
#   'username' and 'password' keys
# @param temp_db [String] name of the database to dump
# @return [Object, nil] the result of the "Dump created" log call on success,
#   nil when mysqldump exits with a non-zero status
def self.export_temp_db_to_file(dump_file, temp_db_config, temp_db)
  self.log("Dumping temp DB to #{dump_file}")

  login = "-h #{temp_db_config['host']} -u #{temp_db_config['username']} --password=#{temp_db_config['password']}"
  system("mysqldump #{login} #{temp_db} >> '#{dump_file}'")

  return self.log("Dump created") if $?.exitstatus == 0

  self.log("Failed to create dump")
  nil
end

.get_s3_bucket ⇒ Object



179
180
181
182
# File 'lib/active_sanitization.rb', line 179

# Looks up the configured S3 bucket through a resource built on top of
# get_s3_client.
#
# @return [Object] the bucket object for configuration.s3_bucket
def self.get_s3_bucket
  Aws::S3::Resource
    .new(client: get_s3_client)
    .bucket(self.configuration.s3_bucket)
end

.get_s3_client ⇒ Object



174
175
176
177
# File 'lib/active_sanitization.rb', line 174

# Builds an S3 client from the AWS key pair and bucket region held in the
# configuration.
#
# @return [Aws::S3::Client]
def self.get_s3_client
  settings = self.configuration
  Aws::S3::Client.new(
    credentials: Aws::Credentials.new(settings.aws_access_key_id, settings.aws_secret_access_key),
    region: settings.s3_bucket_region
  )
end

.gzip(dump_file) ⇒ Object



169
170
171
172
# File 'lib/active_sanitization.rb', line 169

# Compresses +dump_file+ by shelling out to the gzip command.
#
# @param dump_file [String] path of the file to compress (wrapped in single
#   quotes on the command line)
# @return [Boolean, nil] the result of Kernel#system
def self.gzip(dump_file)
  self.log("Gzipping #{dump_file}")

  command = "gzip '#{dump_file}'"
  system(command)
end

.hash_diff(hash1, hash2) ⇒ Object

Returns a hash that represents the difference between two hashes.

hash_diff({1 => 2}, {1 => 2})         # => {}
hash_diff({1 => 2}, {1 => 3})         # => {1 => 2}
hash_diff({}, {1 => 2})               # => {1 => 2}
hash_diff({1 => 2, 3 => 4}, {1 => 2}) # => {3 => 4}


47
48
49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/active_sanitization.rb', line 47

# Returns a hash that represents the difference between two hashes.
#
# The result holds every pair of +hash1+ whose value differs from (or is
# missing in) +hash2+, followed by every pair of +hash2+ whose key does not
# occur in +hash1+. Neither argument is modified.
#
#   hash_diff({1 => 2}, {1 => 2})         # => {}
#   hash_diff({1 => 2}, {1 => 3})         # => {1 => 2}
#   hash_diff({}, {1 => 2})               # => {1 => 2}
#   hash_diff({1 => 2, 3 => 4}, {1 => 2}) # => {3 => 4}
#
# @param hash1 [Hash]
# @param hash2 [Hash]
# @return [Hash]
def self.hash_diff(hash1, hash2)
  changed_or_missing = hash1.dup.keep_if { |key, value| hash2[key] != value }
  only_in_second = hash2.dup.keep_if { |key, _value| !hash1.has_key?(key) }

  changed_or_missing.merge(only_in_second)
end

.import_data(env = nil, timestamp = nil) ⇒ Object



256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
# File 'lib/active_sanitization.rb', line 256

# Downloads a sanitized MySQL snapshot from S3 and loads it into the local
# database, replacing its current contents.
#
# Steps: pick the snapshot timestamp (newest one under
# "<app_name>/<env>/mysql" unless one is given), dump the current local DB to
# tmp/local_data.dump as a safety copy, download the snapshot, recreate the
# DB through the db:drop / db:create Rake tasks and import the snapshot.
#
# NOTE(review): the import runs as "mysql -u root" without host or password,
# unlike the backup step which uses db_config — confirm this is intended.
# NOTE(review): a caller-supplied timestamp is not checked against the bucket
# before the local DB is dumped; a wrong value only fails at download time.
#
# @param env [String, nil] environment whose snapshot is imported; defaults
#   to "production"
# @param timestamp [String, nil] snapshot timestamp; defaults to the newest
#   one found in the bucket
# @return [Object, nil] nil when no snapshot could be found
# @raise [RuntimeError] when the local backup or the import command fails
def self.import_data(env = nil, timestamp = nil)
  env = "production" if env.nil?
  prefix = "#{self.configuration.app_name}/#{env}/mysql"

  if timestamp.nil?
    # The prefix is escaped so characters such as "." in the app name are
    # matched literally. Listing by prefix can also return keys that do not
    # have the "<prefix>/<timestamp>/" shape (e.g. "<prefix>-archive/..."); those
    # yield nil and are dropped, otherwise max would fail comparing nil with
    # a String.
    key_pattern = %r{#{Regexp.escape(prefix)}/(.*)/}
    timestamp = get_s3_bucket.objects(prefix: prefix).map { |object| object.key[key_pattern, 1] }.compact.max
  end

  # Nothing usable was found in the bucket.
  if timestamp.nil?
    self.log("No mysql snapshot found under #{prefix}")
    return
  end

  self.log('WARNING: this rake task will dump your MySQL DB to a file, then wipe your DB before importing a snapshot')
  db_config = self.configuration.db_config
  tmp_dir = File.join(self.configuration.root, "tmp")
  local_dump_file = File.join(tmp_dir, "local_data.dump")

  # Make copy of local DB just in case something goes wrong
  backup_command = "mysqldump -h #{db_config['host']} -u #{db_config['username']} --password=#{db_config['password']} #{db_config['database']} > '#{local_dump_file}'"
  unless system(backup_command)
    raise "Failed to create a local DB dump. If a previous local dump exists, please delete it and try again."
  end
  self.log("Local DB dump stored in #{local_dump_file}")

  compressed_dump_file = File.join(tmp_dir, "data.dump.gz")
  name = "#{prefix}/#{timestamp}/data.dump.gz"
  self.log("Downloading dump from bucket: #{self.configuration.s3_bucket}, path: #{name}")
  get_s3_client.get_object({ bucket: self.configuration.s3_bucket, key: name }, target: compressed_dump_file)

  # reset db
  self.log("Recreating your local DB")
  Rake::Task["db:drop"].invoke
  Rake::Task["db:create"].invoke

  # Import data; the path is quoted like every other shell path in this module.
  self.log("Unzipping and importing data...")
  import_command = "gunzip < '#{compressed_dump_file}' | mysql -u root #{db_config['database']}"
  self.log(import_command)
  unless system(import_command)
    # Name only the database: the full db_config would expose the password.
    raise "Could not load #{compressed_dump_file} into DB #{db_config['database']}"
  end
  File.delete(compressed_dump_file) if File.exist?(compressed_dump_file)
  self.log('-- DONE --')
end

.is_dev_or_integration_env? ⇒ Boolean

Returns:

  • (Boolean)


218
219
220
# File 'lib/active_sanitization.rb', line 218

# Tells whether the configured environment is 'development' or 'integration'.
#
# @return [Boolean]
def self.is_dev_or_integration_env?
  current_env = self.configuration.env
  ['development', 'integration'].include?(current_env)
end

.log(output) ⇒ Object



62
63
64
65
66
# File 'lib/active_sanitization.rb', line 62

# Sends +output+ to every configured logger at info level.
# Nothing is logged when the configured environment is 'test'.
#
# @param output [String] message to log
# @return [Object, nil] the loggers collection, or nil in the 'test' env
def self.log(output)
  return if self.configuration.env == 'test'

  self.configuration.loggers.each { |logger| logger.info(output) }
end

.pre_sanitization_checks ⇒ Object



68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# File 'lib/active_sanitization.rb', line 68

# Compares the live database schema with the configured table lists.
#
# Every table of the ActiveRecord connection (minus tables_to_ignore) is
# collected with its sorted column names and diffed, via hash_diff, against
# tables_to_sanitize merged with tables_to_truncate (columns sorted too).
#
# @return [Hash] { pass: true } when both sides agree; otherwise
#   { pass: false, error: String } where the message lists, per table, the
#   differing columns left after removing the configured ones
def self.pre_sanitization_checks
  connection = self.configuration.active_record_connection
  ignored_tables = self.configuration.tables_to_ignore

  db_tables = connection.tables.each_with_object({}) do |table_name, schema|
    next if ignored_tables.include?(table_name)

    schema[table_name] = connection.columns(table_name).map { |column| column.name }.sort
  end

  # diff will only work correctly if the columns are sorted the same
  configured_tables = self.configuration.tables_to_sanitize.merge(self.configuration.tables_to_truncate)
  tables_with_sorted_columns = configured_tables.each_with_object({}) do |(table_name, columns), sorted|
    sorted[table_name] = columns.sort
  end

  table_difference = hash_diff(db_tables, tables_with_sorted_columns)
  return { pass: true } if table_difference == {}

  column_difference = table_difference.each_with_object({}) do |(table_name, table_columns), unknown|
    unknown[table_name] = table_columns - configured_tables[table_name].to_a
  end

  {
    pass: false,
    error: "The following tables or columns have been found in the #{self.configuration.env} DB but are not known to this script (#{column_difference}).\n Please update the active_sanitization config!"
  }
end

.sanitize_and_export_data ⇒ Object



222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
# File 'lib/active_sanitization.rb', line 222

# Runs the whole export: schema checks, dump (sanitized unless running in a
# development/integration environment), gzip and optional S3 upload.
#
# When the pre-sanitization checks fail, only their error is logged and
# nothing is exported.
#
# The temporary database copy is dropped in an +ensure+ block, so a failure
# while sanitizing or exporting no longer leaves a (partly sanitized) copy of
# the source data behind; the original error is re-raised afterwards.
#
# @return [Object] the result of the final log call
def self.sanitize_and_export_data
  checks = self.pre_sanitization_checks
  return self.log(checks[:error]) unless checks[:pass]

  # create_files supplies the two paths; clean_up_files is then called on
  # them right away (presumably so the dump starts from a clean slate, since
  # the export appends to the dump file — confirm).
  dump_file, compressed_dump_file = self.create_files
  self.clean_up_files(dump_file, compressed_dump_file)

  # If in dev or integration env we don't need to sanitize the DB so we should
  # just dump it to a file and upload
  if self.is_dev_or_integration_env?
    self.export_temp_db_to_file(dump_file, self.configuration.db_config, self.configuration.db_config["database"])
  else
    temp_db, temp_db_connection, temp_db_config = self.duplicate_database
    begin
      self.sanitize_tables(temp_db_connection)
      self.export_temp_db_to_file(dump_file, temp_db_config, temp_db)
    ensure
      self.clean_up_temp_db(temp_db)
    end
  end

  self.gzip(dump_file)

  if self.configuration.s3_bucket && self.configuration.aws_access_key_id && self.configuration.aws_secret_access_key
    self.upload(compressed_dump_file)
  else
    self.clean_up_files(dump_file, compressed_dump_file)
  end

  self.log("-- DONE --")
end

.sanitize_table(table, temp_db_connection) ⇒ Object



127
128
129
130
131
132
133
134
135
136
137
138
139
140
# File 'lib/active_sanitization.rb', line 127

# Replaces the values of every configured sanitization column present in
# +table+, then runs the table's custom sanitization hook if one exists.
#
# For each distinct value of such a column, all rows holding that value are
# updated to one randomly sampled replacement from
# configuration.sanitization_columns[column], so equal source values stay
# equal after sanitization.
#
# Both the replacement and the matched value are quoted by the connection the
# statement runs on. Previously the replacement was placed between
# hand-written single quotes, so a replacement containing an apostrophe
# produced invalid SQL.
#
# NOTE(review): table and column names are interpolated unquoted and are
# assumed to come from trusted configuration.
#
# @param table [String] name of the table to sanitize
# @param temp_db_connection [Object] connection to the temp DB; must respond
#   to select_values, execute and quote
# @return [Object, nil] result of the custom "sanitize_<table>" hook, or nil
#   when configuration.custom_sanitization does not respond to it
def self.sanitize_table(table, temp_db_connection)
  table_columns = temp_db_connection.select_values("DESCRIBE #{table};")

  self.configuration.sanitization_columns.each do |column, replacements|
    next unless table_columns.include?(column)

    distinct_values = temp_db_connection.execute("SELECT DISTINCT(#{column}) FROM #{table};").map { |row| row.first }
    distinct_values.each do |value|
      replacement = temp_db_connection.quote(replacements.sample)
      current = temp_db_connection.quote(value)
      temp_db_connection.execute("UPDATE #{table} SET #{column}=#{replacement} WHERE #{column}=#{current};")
    end
  end

  # Run any custom sanitization for the table
  custom = self.configuration.custom_sanitization
  hook = "sanitize_#{table}"
  custom.send(hook, temp_db_connection) if custom.respond_to?(hook)
end

.sanitize_tables(temp_db_connection) ⇒ Object



150
151
152
153
154
155
156
157
158
159
160
161
162
# File 'lib/active_sanitization.rb', line 150

# Empties every table listed in tables_to_truncate, then sanitizes every
# table listed in tables_to_sanitize, logging each step.
#
# @param temp_db_connection [Object] connection the statements are run on
# @return [Array] the names of the sanitized tables
def self.sanitize_tables(temp_db_connection)
  self.log("Processing TABLES_TO_TRUNCATE...")
  self.configuration.tables_to_truncate.each_key do |table_name|
    self.log("Truncating #{table_name}")
    truncate_statement = "TRUNCATE #{table_name};"
    temp_db_connection.execute(truncate_statement)
  end

  self.log("Processing TABLES_TO_SANITIZE...")
  sanitize_targets = self.configuration.tables_to_sanitize.keys
  sanitize_targets.each do |table_name|
    self.log("Sanitizing #{table_name}")
    self.sanitize_table(table_name, temp_db_connection)
  end
end

.upload(compressed_dump_file) ⇒ Object



184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
# File 'lib/active_sanitization.rb', line 184

# Uploads the gzipped dump to the configured S3 bucket under
# "<app_name>/<env>/mysql/<timestamp>/<file name>" and removes the local file
# once the upload has succeeded.
#
# The file is opened in block form so its handle is closed even when the
# upload raises (it used to stay open in that case), and in binary mode since
# the content is a gzip archive. If the upload raises, the local file is
# kept.
#
# @param compressed_dump_file [String] path of the gzipped dump
# @return [Object] the S3 object the dump was written to
def self.upload(compressed_dump_file)
  # Time gives the same local-time stamp as DateTime for this format.
  timestamp = Time.now.strftime('%Y%m%d%H%M%S')
  name = "#{self.configuration.app_name}/#{self.configuration.env}/mysql/#{timestamp}/#{File.basename(compressed_dump_file)}"
  self.log("Uploading to bucket: #{self.configuration.s3_bucket}, path: #{name}")

  obj = File.open(compressed_dump_file, 'rb') do |file|
    remote = get_s3_bucket.object(name)
    remote.put(body: file)
    remote
  end

  File.unlink(compressed_dump_file)

  obj
end