Class: NdrPseudonymise::PseudonymisationSpecification

Inherits:
Object
  • Object
show all
Defined in:
lib/ndr_pseudonymise/pseudonymisation_specification.rb

Overview

Pseudonymise CSV data for matching purposes.

Sample format spec:

{:core_demographics => [[[0, ' ']],
                        [[1, ' ', :upcase], [2, ' ', :upcase]]],
 :columns => [
   {:title => 'nhsnumber', :maxlength => 12, :format => '\A[0-9A-Z]*\Z',
    :format_msg => 'Must contain only numbers, or numbers and letters for old NHS numbers'},
   {:title => 'dob', :format => '\A([0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]|)\Z',
    :format_msg => 'Must have format YYYY-MM-DD, e.g. 2013-08-20',
    :canonical_title => 'birthdate'},
   {:title => 'postcode'},
   {:title => 'surname'},
   {:title => 'data1'},
   {:title => 'data2'},
 ],
 :demographics => [0, 1, 2, 3],
}

Meaning:
  – delete spaces, upcase, use columns 0+1, 0+2 as keys for core demographics
  – treat columns 0, 1, 2, 3 as demographics

Constant Summary collapse

KEY_BYTES =

length of randomly generated keys (32 bytes = 256 bits)

32
PREAMBLE_V1_STRIPED =
'Pseudonymised matching data v1.0-striped'.freeze
HEADER_ROW_PREFIX =
'HEADER_ROW'.freeze

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(format_spec, key_bundle) ⇒ PseudonymisationSpecification

Returns a new instance of PseudonymisationSpecification.

Raises:

  • (ArgumentError)


32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# File 'lib/ndr_pseudonymise/pseudonymisation_specification.rb', line 32

# Stores the format specification and both salts after validating them.
#
# @param format_spec [Hash] must have :core_demographics, :columns, :demographics and
#   :encrypt_clinical entries; every column needs a :title and :encrypt_clinical
#   must be exactly true or false
# @param key_bundle [Hash] provides :salt1 and :salt2, each a lower-case hex string
#   of at least 64 characters
# @raise [ArgumentError] when any of the checks above fails
def initialize(format_spec, key_bundle)
  @format_spec = format_spec

  # Report the first required section that is absent
  required_sections = [:core_demographics, :columns, :demographics, :encrypt_clinical]
  missing_section = required_sections.find { |section| !@format_spec.key?(section) }
  if missing_section
    raise(ArgumentError, "Expected format_spec to have a #{missing_section.inspect} section")
  end

  # Report the first column without a title (index 0 is truthy in Ruby)
  untitled_index = @format_spec[:columns].find_index { |column| !column.key?(:title) }
  if untitled_index
    raise(ArgumentError, "Expected format_spec to have a title for column #{untitled_index}")
  end

  flag_is_boolean = [true, false].include?(@format_spec[:encrypt_clinical])
  raise(ArgumentError, 'Expected encrypt_clinical to be true or false') unless flag_is_boolean

  @salt1 = key_bundle[:salt1]
  @salt2 = key_bundle[:salt2]
  acceptable_salt = ->(salt) { salt =~ /\A[0-9a-f]*\Z/ && salt.size >= 64 }
  raise(ArgumentError, 'Invalid salt1') unless acceptable_salt.call(@salt1)
  raise(ArgumentError, 'Invalid salt2') unless acceptable_salt.call(@salt2)
end

Class Method Details

.factory(format_spec, key_bundle) ⇒ Object

Builds a pseudonymiser with the preferred pseudonymisation class of the given format spec



52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# File 'lib/ndr_pseudonymise/pseudonymisation_specification.rb', line 52

# Builds a pseudonymiser using the class named by format_spec[:pseudonymisation_class],
# or PseudonymisationSpecification itself when no class is named.
#
# @param format_spec [Hash] format specification; may name a :pseudonymisation_class
# @param key_bundle [Hash] key bundle passed through to the constructor
# @return [NdrPseudonymise::PseudonymisationSpecification] instance of the chosen class
# @raise [ArgumentError] if the named class is not PseudonymisationSpecification
#   or one of its subclasses
def self.factory(format_spec, key_bundle)
  klass_name = format_spec[:pseudonymisation_class]
  if klass_name
    # Support existing format specifications.
    # (Pseudonymisation classes have now moved to NdrPseudonymise namespace.)
    # Non-mutating sub: sub! returns nil when nothing is replaced (i.e. for names
    # that are already in the new namespace) and would modify the caller's spec.
    klass_name = klass_name.sub(/\APseudonymisation::/, 'NdrPseudonymise::')
    klass = Object.const_get(klass_name)
    unless klass <= NdrPseudonymise::PseudonymisationSpecification
      raise(ArgumentError, "Invalid pseudonymisation_class #{klass_name}")
    end
  else
    klass = NdrPseudonymise::PseudonymisationSpecification
  end
  klass.new(format_spec, key_bundle)
end

.get_key_bundle(key_fname, admin_password) ⇒ Object



134
135
136
137
138
139
140
141
142
143
144
145
146
147
# File 'lib/ndr_pseudonymise/pseudonymisation_specification.rb', line 134

# Reads and decrypts a key bundle file.
#
# The file holds Base64 text of an AES-256-CBC ciphertext whose key is the SHA-256
# digest of admin_password; the plaintext is YAML and must load as a Hash.
# NOTE(review): YAML.load behaviour differs between Psych versions (e.g. for symbol
# keys) -- confirm against the bundles actually in use.
#
# @param key_fname [String] path of the encrypted key bundle
# @param admin_password [String] password protecting the bundle
# @return [Hash] the decoded bundle
# @raise [RuntimeError] 'Wrong password or invalid bundle' on any decrypt/parse failure
def self.get_key_bundle(key_fname, admin_password)
  encoded = File.read(key_fname)
  cipher = OpenSSL::Cipher.new('AES-256-CBC')
  cipher.decrypt
  cipher.key = Digest::SHA256.digest(admin_password)
  begin
    plaintext = cipher.update(Base64.decode64(encoded)) + cipher.final
    bundle = YAML.load(plaintext)
    # A successful decode must yield a hash; anything else is reported below
    raise('Invalid bundle - not a hash') unless bundle.is_a?(Hash)
    bundle
  rescue
    # Every failure in this section is deliberately reported with one generic message
    raise('Wrong password or invalid bundle')
  end
end

Instance Method Details

#all_demographics(row) ⇒ Object



149
150
151
152
153
154
155
156
157
# File 'lib/ndr_pseudonymise/pseudonymisation_specification.rb', line 149

# Picks out the values of a row that sit in the spec's :demographics columns.
#
# @param row [Array] one row of input data
# @return [Array] values whose column index is listed in @format_spec[:demographics],
#   in row order
def all_demographics(row)
  # TODO: What about rows with missing fields?
  wanted_indexes = @format_spec[:demographics]
  row.each_with_index
     .select { |_value, index| wanted_indexes.include?(index) }
     .map(&:first)
end

#clinical_data(row) ⇒ Object



159
160
161
162
163
164
165
166
167
# File 'lib/ndr_pseudonymise/pseudonymisation_specification.rb', line 159

# Picks out the values of a row that are NOT in the spec's :demographics columns.
#
# @param row [Array] one row of input data
# @return [Array] values whose column index is absent from @format_spec[:demographics],
#   in row order
def clinical_data(row)
  # TODO: What about rows with missing fields?
  excluded_indexes = @format_spec[:demographics]
  row.each_with_index
     .reject { |_value, index| excluded_indexes.include?(index) }
     .map(&:first)
end

#core_demographics(row) ⇒ Object

Returns arrays of core demographics field values, each of the form e.g.

[[['nhsnumber', '1234567881']],
 [['birthdate', '2010-08-21'], ['postcode', 'CB22 3AD']]]

Column titles can be remapped using a :canonical_title entry, to ensure consistent pseudo_ids even when column titles are predefined.



77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# File 'lib/ndr_pseudonymise/pseudonymisation_specification.rb', line 77

# Extracts the core demographics of a row as groups of [title, value] pairs.
#
# Each entry of @format_spec[:core_demographics] is a group of
# [column number, characters to delete, modifier] triples. The column's value is
# stringified, stripped of the given characters (if any) and upcased when the
# modifier is :upcase. The title is the column's :canonical_title when present,
# otherwise its :title.
#
# @param row [Array] one row of input data
# @return [Array<Array<Array>>] one array of [title, value] pairs per group
# @raise [RuntimeError] for any modifier other than :upcase or nil
def core_demographics(row)
  @format_spec[:core_demographics].map do |field_group|
    field_group.map do |(col_num, delchar, modifier)|
      value = row[col_num].to_s
      value = value.delete(delchar) if delchar
      if :upcase == modifier
        value = value.upcase
      elsif !modifier.nil?
        raise "Unknown modifier #{modifier.inspect} for core_demographics"
      end
      column = @format_spec[:columns][col_num]
      title = column[:canonical_title] || column[:title]
      [title, value]
    end
  end
end

#csv_header_row ⇒ Object

Header row for CSV data



268
269
270
# File 'lib/ndr_pseudonymise/pseudonymisation_specification.rb', line 268

# Header row for CSV data: a single cell holding the v1.0-striped preamble.
#
# @return [Array<String>] one-element row containing PREAMBLE_V1_STRIPED
def csv_header_row
  preamble_cell = PREAMBLE_V1_STRIPED
  [preamble_cell]
end

#data_hash(value, salt) ⇒ Object



108
109
110
# File 'lib/ndr_pseudonymise/pseudonymisation_specification.rb', line 108

# Hashes a value together with a salt.
#
# @param value [#to_s] data to hash
# @param salt [#to_s] salt appended to the value before hashing
# @return [String] hex digest (Digest::SHA2) of the value followed by the salt
def data_hash(value, salt)
  value_text = value.to_s
  salt_text = salt.to_s
  Digest::SHA2.hexdigest(value_text + salt_text)
end

#decrypt_data(data, pseudo_id, partial_crypt_key, salt) ⇒ Object



126
127
128
129
130
131
132
# File 'lib/ndr_pseudonymise/pseudonymisation_specification.rb', line 126

# Reverses encrypt_data: decrypts strict-Base64 encoded AES-256-CBC data.
#
# The AES key is the SHA-256 digest of pseudo_id, partial_crypt_key and salt
# joined in that order.
#
# @param data [String] strict Base64 text of the ciphertext
# @param pseudo_id [#to_s] first part of the key material
# @param partial_crypt_key [#to_s] middle part of the key material
# @param salt [#to_s] last part of the key material
# @return [String] the decrypted data
def decrypt_data(data, pseudo_id, partial_crypt_key, salt)
  key_material = "#{pseudo_id}#{partial_crypt_key}#{salt}"
  cipher = OpenSSL::Cipher.new('AES-256-CBC')
  cipher.decrypt
  cipher.key = Digest::SHA256.digest(key_material)
  ciphertext = Base64.strict_decode64(data)
  cipher.update(ciphertext) + cipher.final
end

#decrypt_to_csv(encrypted_data, out_data, public_key_fname, private_key_fname) ⇒ Object

Decrypt public-key-encrypted data to a CSV file. encrypted_data can be an open IO object (a file) or an array of data rows. out_data can be an open IO object or a StringIO; CSV data is output to it.



371
372
373
374
375
376
377
# File 'lib/ndr_pseudonymise/pseudonymisation_specification.rb', line 371

# Decrypts each item of encrypted_data with RSA_AES_CBC and appends it to out_data.
#
# @param encrypted_data [#each] source of encrypted items (e.g. an open file or an array)
# @param out_data [#<<] destination for the decrypted output
# @param public_key_fname [String] path of the public key file
# @param private_key_fname [String] path of the private key file
# @return [Object] whatever encrypted_data.each returns
def decrypt_to_csv(encrypted_data, out_data, public_key_fname, private_key_fname)
  decryptor = RSA_AES_CBC.new(File.read(public_key_fname), File.read(private_key_fname))
  encrypted_data.each { |crypto_data| out_data << decryptor.decrypt(crypto_data) }
end

#emit_csv_rows(out_csv, pseudonymised_row) ⇒ Object

Append the output of pseudonymise_row to a CSV file



273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
# File 'lib/ndr_pseudonymise/pseudonymisation_specification.rb', line 273

# Appends the output of pseudonymise_row to a CSV, interleaving the three row types.
#
# @param out_csv [#<<] destination that rows are appended to
# @param pseudonymised_row [Array] triple of index_rows, demographics_rows, clinical_rows
# @return [Array] the result of the final append for each group of three rows
# @raise [ArgumentError] if the three lists differ in length
def emit_csv_rows(out_csv, pseudonymised_row)
  index_rows, demographics_rows, clinical_rows = pseudonymised_row
  if index_rows.size != demographics_rows.size ||
     index_rows.size != clinical_rows.size
    message = <<-ERROR
Mismatch in number of index_rows (#{index_rows.size})
vs demographics_rows (#{demographics_rows.size})
vs clinical_rows (#{clinical_rows.size})
ERROR
    raise(ArgumentError, message)
  end

  # Alternate each of 3 data types into 1 output file
  index_rows.zip(demographics_rows, clinical_rows).map do |index_row, demographics_row, clinical_row|
    out_csv << index_row
    out_csv << demographics_row
    out_csv << clinical_row
  end
end

#encrypt_data(data, pseudo_id, partial_crypt_key, salt) ⇒ Object



112
113
114
115
116
117
118
119
120
121
122
123
124
# File 'lib/ndr_pseudonymise/pseudonymisation_specification.rb', line 112

# Encrypts data with AES-256-CBC and returns it as strict Base64 text.
#
# The AES key is the SHA-256 digest of pseudo_id, partial_crypt_key and salt
# joined in that order.
#
# @param data [String] plaintext to encrypt
# @param pseudo_id [#to_s] first part of the key material
# @param partial_crypt_key [#to_s] middle part of the key material
# @param salt [#to_s] last part of the key material
# @return [String] strict Base64 encoding of the ciphertext
# @raise [ArgumentError] if any of the three key arguments is blank
def encrypt_data(data, pseudo_id, partial_crypt_key, salt)
  key_parts = [pseudo_id, partial_crypt_key, salt]
  raise(ArgumentError, 'Expected all key arguments to be non-blank') if key_parts.any? { |part| part.to_s.blank? }

  # (A check that the key material is all hex characters is intentionally not enforced.)
  key_material = "#{pseudo_id}#{partial_crypt_key}#{salt}"
  cipher = OpenSSL::Cipher.new('AES-256-CBC')
  cipher.encrypt
  cipher.key = Digest::SHA256.digest(key_material)
  ciphertext = cipher.update(data) + cipher.final
  Base64.strict_encode64(ciphertext)
end

#header_row?(row) ⇒ Boolean

Return true if this row is a valid header row, according to the spec

Returns:

  • (Boolean)


209
210
211
212
213
214
215
216
217
218
219
220
221
# File 'lib/ndr_pseudonymise/pseudonymisation_specification.rb', line 209

# Decides whether a row is a header row for this format spec.
#
# Cells are compared case-insensitively with the spec's column titles. A row
# sharing fewer than 3 titles with the spec is treated as data. Empty cells (nil,
# as produced by CSV parsing) are treated as empty strings, so that a data row
# with blank fields is classified rather than causing a NoMethodError.
#
# @param row [Array<String, nil>] the candidate header row
# @return [Boolean] true for an exact (ordered) title match, false for a data row
# @raise [ArgumentError] if the row shares at least 3 titles with the spec but is
#   not an exact match
def header_row?(row)
  expected_keys = @format_spec[:columns].collect { |col| col[:title] }
  row_keys = row.collect { |cell| cell.to_s.downcase }
  return false if (row_keys & expected_keys).size < 3 # fewer than 3 common keys

  # Only expected keys, in right order
  return true if row_keys == expected_keys

  raise(ArgumentError, "Error: invalid header row; expected keys #{expected_keys.inspect}, actually #{row_keys.inspect}")
end

#pseudo_id(real_id) ⇒ Object

Convert a real id to a pseudonymised id



104
105
106
# File 'lib/ndr_pseudonymise/pseudonymisation_specification.rb', line 104

# Convert a real id to a pseudonymised id by hashing it with salt1.
#
# @param real_id [#to_s] the real identifier
# @return [String] the result of data_hash for the id and @salt1
def pseudo_id(real_id)
  first_salt = @salt1
  data_hash(real_id, first_salt)
end

#pseudonymise_csv(csv_data, out_data, public_key_fname = nil, progress_monitor = nil) ⇒ Object

csv_data can be an open IO object (a CSV file) or an array of data rows. out_data can be an open IO object or a StringIO; CSV data is output to it. public_key_fname supports public key encryption of the output. progress_monitor is an object for reporting progress, that responds to

log_progress(start_time, time_now, csv_row, progress, total)

where progress and total are in the same units, either bytes or rows



299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
# File 'lib/ndr_pseudonymise/pseudonymisation_specification.rb', line 299

# Pseudonymises CSV input row by row, appending the result to out_data.
#
# The output starts with csv_header_row. A first input row accepted by header_row?
# is preserved, prefixed with HEADER_ROW_PREFIX; every other row is validated with
# row_errors, pseudonymised with pseudonymise_row and written via emit_csv_rows.
# When public_key_fname is given, each chunk of output is passed through
# RSA_AES_CBC#encrypt and followed by a newline; otherwise the CSV text is
# written as is.
#
# @param csv_data [IO, StringIO, Array] open CSV source, or an array of data rows
# @param out_data [#<<] destination for the output
# @param public_key_fname [String, nil] optional public key file used to encrypt output
# @param progress_monitor [#log_progress, nil] optional progress reporter, called as
#   log_progress(start_time, time_now, csv_row, progress, total)
# @raise [ArgumentError] for an unsupported csv_data or out_data, a missing public
#   key file, or a row that fails during pseudonymisation
# @raise [RuntimeError] for a row that row_errors reports as invalid
def pseudonymise_csv(csv_data, out_data, public_key_fname = nil, progress_monitor = nil)
  csv_lib = CSV
  # IO-like input is parsed as CSV; an Array is iterated directly as rows
  if csv_data.is_a?(IO) || csv_data.is_a?(StringIO)
    csv = csv_lib.new(csv_data)
  elsif csv_data.is_a?(Array)
    csv = csv_data
  else
    raise(ArgumentError, 'Expected an IO or Array of rows, not a filename for csv_data')
  end

  if public_key_fname
    unless File.exist?(public_key_fname)
      raise(ArgumentError, "Missing public key file: #{public_key_fname}")
    end
    # Only the public key is supplied, as this method only encrypts
    rsa_aes_cbc = RSA_AES_CBC.new(File.read(public_key_fname), nil)
  end

  unless out_data.respond_to?('<<')
    raise(ArgumentError, 'Expected an IO or writeable structure for out_data')
  end
  # Format the preamble row as CSV in a scratch buffer, then write (or encrypt) it
  out_buff = StringIO.new
  out_csv = csv_lib.new(out_buff)
  out_csv << csv_header_row
  out_buff.rewind
  out_data <<
    if public_key_fname
      rsa_aes_cbc.encrypt(out_buff.read) + "\n"
    else
      out_buff.read
    end

  i = 0
  # NOTE(review): Time.current is not core Ruby -- presumably provided by ActiveSupport
  t0 = Time.current
  # Total size is only needed for progress reporting
  # NOTE(review): relies on csv_data responding to size -- confirm for all IO sources
  csv_size = progress_monitor && csv_data.size
  csv.each do |row|
    # Fresh scratch buffer per input row, so each row's output is written separately
    out_buff = StringIO.new
    out_csv = csv_lib.new(out_buff)
    i += 1
    if i == 1 && header_row?(row)
      # Preserve header row in output
      out_csv << [HEADER_ROW_PREFIX] + row
    else
      errs = row_errors(row)
      raise("Invalid row #{i}: #{errs}") if errs
      begin
        emit_csv_rows(out_csv, pseudonymise_row(row))
      rescue ArgumentError, RuntimeError => e
        # Re-raise with the row number, keeping the original backtrace
        raise(ArgumentError, "Invalid row #{i}: #{e}", e.backtrace)
      end
    end
    out_buff.rewind
    out_data <<
      if public_key_fname
        rsa_aes_cbc.encrypt(out_buff.read) + "\n"
      else
        out_buff.read
      end

    # Current runs at about 325 rows per second for prescription data 2016-05-09 ruby 2.3.1
    # so try to log progress about every 15 seconds
    if (i % 5000) == 0 && progress_monitor
      # Progress is the row count for Array input, otherwise the CSV reader's position
      progress_monitor.log_progress(t0, Time.current, i, csv.is_a?(Array) ? i : csv.pos, csv_size)
    end
  end
  # Final report (progress == total), unless the last row already triggered one
  if (i % 5000) != 0 && progress_monitor
    progress_monitor.log_progress(t0, Time.current, i, csv_size, csv_size)
  end
end

#pseudonymise_row(row) ⇒ Object

Pseudonymise a row of data, returning 3 sets of rows:

index_rows, demographics_rows, clinical_rows


171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
# File 'lib/ndr_pseudonymise/pseudonymisation_specification.rb', line 171

# Pseudonymise a row of data, returning 3 sets of rows:
# index_rows, demographics_rows, clinical_rows
#
# One entry is added to each set per real id of the row. Index rows hold the
# pseudo id, a random row key and the random partial crypto key(s); demographics
# and clinical rows are keyed only by the random row key. Demographics are always
# encrypted; clinical data is encrypted only when the spec's :encrypt_clinical is set.
#
# @param row [Array] one row of input data
# @return [Array<Array>] [index_rows, demographics_rows, clinical_rows]
def pseudonymise_row(row)
  index_rows = []
  demographics_rows = []
  clinical_rows = []
  real_ids(row).each do |real_id|
    encrypting_clinical = @format_spec[:encrypt_clinical]
    pseudo = pseudo_id(real_id)
    row_key = random_key
    demographics_key = random_key # middle bit of crypto key
    index_row = [pseudo, row_key, demographics_key]
    if encrypting_clinical
      clinical_key = random_key # middle bit of crypto key
      index_row << clinical_key
    end
    index_rows << index_row

    # demographics and clinical files only have non-information-bearing keys
    demographics_json = safe_json(all_demographics(row))
    encrypted_demographics = encrypt_data(demographics_json, pseudo, demographics_key, @salt2)
    demographics_rows << [row_key, encrypted_demographics]

    clinical_payload = safe_json(clinical_data(row))
    if encrypting_clinical
      clinical_payload = encrypt_data(clinical_payload, pseudo, clinical_key, @salt2)
    end
    clinical_rows << [row_key, clinical_payload]
  end
  [index_rows, demographics_rows, clinical_rows]
end

#random_key ⇒ Object



68
69
70
# File 'lib/ndr_pseudonymise/pseudonymisation_specification.rb', line 68

# Generates a fresh random key.
#
# @return [String] hex string produced by SecureRandom.hex for KEY_BYTES random bytes
def random_key
  byte_count = KEY_BYTES
  SecureRandom.hex(byte_count)
end

#real_ids(row) ⇒ Object

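List of real (not yet pseudonymised) ids, one per core demographics group of this row. Each id joins the group's field titles and then its values with '_' (any '_' within a part is doubled). pseudo_id turns each real id into a pseudonymised id using salt1.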



96
97
98
99
100
101
# File 'lib/ndr_pseudonymise/pseudonymisation_specification.rb', line 96

# Builds one real id string per core demographics group of the row.
#
# For each group, all field titles followed by all field values are joined with
# '_', after doubling any '_' inside an individual part.
#
# @param row [Array] one row of input data
# @return [Array<String>] one id per group returned by core_demographics
def real_ids(row)
  core_demographics(row).map do |field_pairs|
    titles = field_pairs.map(&:first)
    values = field_pairs.map(&:last)
    escaped_parts = (titles + values).map { |part| part.gsub('_', '__') }
    escaped_parts.join('_')
  end
end

#row_errors(row) ⇒ Object

Return false if this row is a valid data row, otherwise a string describing the errors



224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
# File 'lib/ndr_pseudonymise/pseudonymisation_specification.rb', line 224

# Validates a data row against the format spec.
#
# Checks, in order: that the row is long enough to hold every core demographics
# column; that there are no non-blank values beyond the spec's columns; and that
# each column with a :maxlength or :format satisfies it. Missing values are
# checked as empty strings.
#
# @param row [Array] one row of input data
# @return [false, String] false for a valid row, otherwise a description of the errors
def row_errors(row)
  # Unpack column checking meta-data proactively (formats are compiled only once)
  @check_cols ||= begin
                    check_cols = []
                    @format_spec[:columns].each_with_index do |col, i|
                      if col[:maxlength] || col[:format]
                        check_cols << [col, i, col[:maxlength],
                                       col[:format] && Regexp.new(col[:format])]
                      end
                    end
                    check_cols
                  end
  # Highest (zero-based) column index used by any core demographics field
  @dmax ||= @format_spec[:core_demographics].flatten(1).collect(&:first).max
  # A row needs @dmax + 1 values to include column @dmax; rows of exactly that
  # size are complete and must not be rejected.
  if row.size <= @dmax
    "Missing core demographics: at least #{@dmax + 1} columns expected"
  elsif row[@format_spec[:columns].size..-1].to_a.any? { |s| !s.blank? }
    "Too many columns (#{row.size}); expected #{@format_spec[:columns].size}"
  else
    # Check field formats
    errs = []
    @check_cols.each do |col, i, col_maxlength, col_format_re|
      val = row[i].to_s # Missing columns treated as blank
      if col_maxlength && val.size > col_maxlength
        errs << "Field #{col[:title]} (column #{i + 1}) is longer than maxlength #{col[:maxlength]}."
      end
      next unless col_format_re
      next if col_format_re.match(val)

      errs << if col[:format_msg]
                "Field #{col[:title]} (column #{i + 1}) #{col[:format_msg]} -- invalid value: #{val}"
              else
                "Field #{col[:title]} (column #{i + 1}) does not match format #{col[:format].inspect} -- invalid value: #{val}"
              end
    end
    if errs.empty?
      false
    else
      errs.join(', ')
    end
  end
end

#safe_json(data) ⇒ Object

Convert data to json, but raise exception if it won’t safely deserialise



200
201
202
203
204
205
206
# File 'lib/ndr_pseudonymise/pseudonymisation_specification.rb', line 200

# Convert data to json, but raise exception if it won't safely deserialise.
#
# @param data [Object] value to serialise
# @return [String] the JSON text
# @raise [ArgumentError] if loading the JSON back does not give a value equal to data
def safe_json(data)
  json_text = data.to_json
  return json_text if data == JSON.load(json_text)

  raise(ArgumentError, "Expected consistent JSON serialisation of #{data.inspect}")
end