Class: Witsec::Anonymizer

Inherits:
Object
  • Object
show all
Defined in:
lib/witsec/anonymizer.rb

Constant Summary collapse

BATCH_SIZE =
1000

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Anonymizer

Returns a new instance of Anonymizer.



5
6
7
8
9
# File 'lib/witsec/anonymizer.rb', line 5

# Builds an anonymizer from the schema file at "config/witsec/schema.rb".
#
# The path is relative, so it is resolved against the current working
# directory of the process. The file's contents are evaluated in the context
# of this instance and the result is stored as the schema.
#
# After loading, check_input_and_output_are_different is run (it is defined
# elsewhere in this class).
def initialize
  schema_path = "config/witsec/schema.rb"

  # Passing the path as the second argument makes syntax errors and exceptions
  # raised by the schema file report its real file name and line number
  # instead of an anonymous "(eval)" location.
  @schema = instance_eval(File.read(schema_path), schema_path)

  check_input_and_output_are_different
end

Instance Attribute Details

#schema ⇒ Object (readonly)

Returns the value of attribute schema.



11
12
13
# File 'lib/witsec/anonymizer.rb', line 11

# Read-only accessor for the @schema instance variable.
def schema = @schema

Instance Method Details

#anonymize ⇒ Object

TODO: Make the silencing of the ActiveRecord logger configurable.



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# File 'lib/witsec/anonymizer.rb', line 14

# Clears the output database, then copies every table the schema anonymizes
# from the input database to the output database, passing the rows through
# Witsec::Alias on the way. Tables the schema does not anonymize are skipped.
#
# Progress and the total elapsed time are written to standard output. The
# ActiveRecord logger is silenced for the duration of the run.
#
# TODO: Make silence configurable
def anonymize
  time = Benchmark.measure do
    ActiveRecord::Base.logger.silence do
      clear_output_database

      ActiveRecord::Base.connection.tables.each do |table_name|
        unless schema.anonymizes?(table_name)
          puts "Skipping #{table_name}"
          next
        end

        # A performance improvement could probably be found here, if we just passed along included tables
        # (as in tables, where no rows are anonymized) without querying etc.
        columns, rows = read_input_rows(table_name)
        anonymized_rows = Witsec::Alias.new(table_name, columns:, schema:).anonymize(rows)
        write_output_rows(table_name, columns, anonymized_rows, total_rows: rows.size)
      end
    end
  end
  puts "Anonymized all in #{time.real} seconds"
end

# Reads every row of +table_name+ from the input database.
#
# @param table_name [String] name of the table to read
# @return [Array] a two-element array: the column names (nil when the table
#   has no rows) and the rows as arrays of values
private def read_input_rows(table_name)
  connection = input_connection_pool.lease_connection
  # Quote the identifier so tables named after reserved words can be read.
  records = connection.execute("SELECT * FROM #{connection.quote_table_name(table_name)}").to_a
  columns = records.first&.keys
  rows = records.map(&:values)
  puts "Anonymizing #{table_name} (#{rows.size} rows)"
  [columns, rows]
ensure
  # Hand the connection back even when the query raises.
  input_connection_pool.release_connection
end

# Inserts +rows+ into +table_name+ on the output database, BATCH_SIZE rows
# per INSERT statement.
#
# @param table_name [String] name of the table to fill
# @param columns [Array<String>, nil] column names, in the order of each row's values
# @param rows [Array<Array>] rows to insert
# @param total_rows [Integer] row count shown in the progress line
private def write_output_rows(table_name, columns, rows, total_rows:)
  connection = output_connection_pool.lease_connection
  quoted_table = connection.quote_table_name(table_name)
  # columns is nil for an empty table; in that case there are no rows to insert either.
  column_list = Array(columns).map { |column| connection.quote_column_name(column) }.join(", ")

  # If referential integrity is not disabled, you have to create all rows in the correct order
  connection.disable_referential_integrity do
    inserted = 0
    # Use multi-row inserts for performance
    rows.each_slice(BATCH_SIZE) do |batch|
      inserted += batch.size
      print "Anonymizing up to row #{inserted} of #{total_rows}\r"

      # Quote with the connection that executes the statement, so the quoting
      # rules always match the target database.
      values = batch.map do |row|
        "(#{row.map { |value| connection.quote(value) }.join(", ")})"
      end.join(", ")

      connection.execute("INSERT INTO #{quoted_table} (#{column_list}) VALUES #{values}")
    end
  end
ensure
  # Hand the connection back even when an insert raises.
  output_connection_pool.release_connection
end

#clear_output_database ⇒ Object



61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/witsec/anonymizer.rb', line 61

# Empties every table of the output database.
#
# Each table reported by the output connection is truncated with
# "TRUNCATE TABLE ... CASCADE" while referential integrity is disabled.
# The ActiveRecord logger is silenced while this runs.
def clear_output_database
  puts "Clearing output database"

  ActiveRecord::Base.logger.silence do
    connection = output_connection_pool.lease_connection

    begin
      connection.disable_referential_integrity do
        connection.tables.each do |table_name|
          # Quote the identifier so tables named after reserved words can be truncated.
          connection.execute("TRUNCATE TABLE #{connection.quote_table_name(table_name)} CASCADE")
        end
      end
    ensure
      # Hand the connection back even when a TRUNCATE raises.
      output_connection_pool.release_connection
    end
  end
end