Class: DeathMasterFileLoader

Inherits:
Object
  • Object
show all
Defined in:
app/models/death_master_file_loader.rb

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(path_or_url, file_as_of) ⇒ DeathMasterFileLoader

path_or_url is the full path to the file to load on disk, or the URL of an update file. file_as_of is a string in the format YYYY-MM-DD giving the date as of which the file data is accurate.



9
10
11
12
13
# File 'app/models/death_master_file_loader.rb', line 9

# Builds a loader for one Death Master File and validates it immediately.
#
# path_or_url - path of the file on disk, or URL of an update file.
# file_as_of  - date the file data is accurate for (valid? calls #to_date on it).
#
# Validation runs inside the constructor, so any exception raised by valid?
# propagates out of new. Status strings produced during validation are
# forwarded to the caller's block when one is given.
def initialize(path_or_url, file_as_of)
  @file_as_of = file_as_of
  @file_path_or_url = path_or_url
  valid? do |progress|
    yield progress if block_given?
  end
end

Class Method Details

.load_update_files_from_web ⇒ Object

Loads all the update files from dmf.ntis.gov. It starts with the last file loaded, and loads each missing file in sequence up to the current file.



68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# File 'app/models/death_master_file_loader.rb', line 68

# Loads every monthly update file that is missing from the database.
#
# Starts with the month after the latest as_of date stored in DeathMasterFile
# and loads one file per month, in sequence, up to and including the current
# month. Each file is fetched and validated by DeathMasterFileLoader.new and
# then imported with #load_file.
#
# Yields status strings (the "Loading file ..." notice plus everything the
# loader reports) when a block is given; the same notice is also written to
# stdout.
#
# Raises ArgumentError when no file has been loaded yet, because there is then
# no starting month to continue from. Exceptions raised by the loader itself
# propagate unchanged.
def self.load_update_files_from_web
  max_as_of = DeathMasterFile.maximum(:as_of)
  # With an empty table there is no maximum; fail with a clear message instead
  # of a NoMethodError on nil further down.
  unless max_as_of
    raise(ArgumentError, 'No death master file has been loaded yet.  Load a base file before loading update files.')
  end

  run_file_date = max_as_of.beginning_of_month.next_month
  last_file_date = Date.today.beginning_of_month
  while run_file_date <= last_file_date
    # Update file names embed the first day of the month as YYMMDD.
    url = "https://dmf.ntis.gov/dmldata/monthly/MA#{run_file_date.strftime('%y%m%d')}"
    notice = "Loading file #{url}"
    puts notice
    yield notice if block_given?
    dmf = DeathMasterFileLoader.new(url, run_file_date.strftime('%Y-%m-%d')) { |status| yield status if block_given? }
    dmf.load_file do |status|
      yield status if block_given?
    end
    run_file_date += 1.month
  end
end

Instance Method Details

#convert_file_to_csv {|"File conversion ran for #{(Time.now - start) / 60.0} minutes."| ... } ⇒ Object

Processes 28 million rows in 23 minutes. Input file 2.6GB output: 2.9GB. Used to convert a packed fixed-length file into csv for mysql import.

Yields:

  • ("File conversion ran for #{(Time.now - start) / 60.0} minutes.")


86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# File 'app/models/death_master_file_loader.rb', line 86

# Converts the packed fixed-length input (@download_file) into a delimited
# temp file for the MySQL bulk import.
#
# Every line is classified with record_action and parsed with text_to_hash.
# Lines whose action is 'D' are not written; their social security numbers are
# collected in @delete_ssns so they can be deleted in one pass afterwards.
# All other lines become one output row of backtick-enclosed, comma-separated
# fields in this order: an empty id (left for MySQL to generate), the parsed
# record columns, created_at, updated_at and as_of.
#
# Yields a progress string every 25,000 input lines (only when that line
# produced a row) and a final timing string, when a block is given; the same
# strings are written to stdout.
#
# Returns the Tempfile holding the converted rows.
def convert_file_to_csv
  # Parsed columns that sit between the id and the timestamps. The original
  # notes say the two dates are expected as YYYY-MM-DD and state_of_residence
  # as a code between 01 and 65 or else nil (presumably ensured by text_to_hash).
  record_columns = [
    :social_security_number,
    :last_name,
    :name_suffix,
    :first_name,
    :middle_name,
    :verify_proof_code,
    :date_of_death,
    :date_of_birth,
    :state_of_residence,
    :last_known_zip_residence,
    :last_known_zip_payment
  ]
  csv_file = Tempfile.new('dmf') # temp file for the converted csv format
  start = Time.now
  timenow = start.to_s(:db)
  @delete_ssns = []
  # Evaluated at each call so every report reflects the time elapsed so far.
  elapsed_report = lambda { "File conversion ran for #{(Time.now - start) / 60.0} minutes." }

  @download_file.each_with_index do |line, i|
    action = record_action(line)
    attributes_hash = text_to_hash(line)

    if action == 'D'
      # Remember the record; deletions are applied all at once at the end.
      @delete_ssns << attributes_hash[:social_security_number]
      next
    end

    fields = [nil] # empty id field, generated by mysql
    record_columns.each { |column| fields << attributes_hash[column] }
    fields << timenow << timenow << attributes_hash[:as_of] # created_at, updated_at, as_of
    csv_file.syswrite(fields.map { |field| "`#{field}`" }.join(',') + "\n")

    next unless (i > 0) && (i % 25000 == 0)

    progress = "#{i} records processed."
    puts progress
    yield progress if block_given?
  end

  puts elapsed_report.call
  yield elapsed_report.call if block_given?
  csv_file
end

#get_file_from_web ⇒ Object



40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'app/models/death_master_file_loader.rb', line 40

# Downloads @file_path_or_url into a temp file and returns the temp file's path.
#
# The request uses HTTP basic auth with SsnValidator::Ntis.user_name and
# SsnValidator::Ntis.password. When ENV['http_proxy'] is set, its host and port
# are used as the proxy.
#
# Yields a progress string after every received chunk when a block is given;
# the same string is written to stdout.
#
# Raises ArgumentError when the server answers 404 (invalid URL) or 401
# (bad credentials).
def get_file_from_web
  uri = URI.parse(@file_path_or_url)
  request = Net::HTTP::Get.new(uri.request_uri)
  request.basic_auth(SsnValidator::Ntis.user_name, SsnValidator::Ntis.password)
  proxy_addr, proxy_port = ENV['http_proxy'].gsub('http://', '').split(/:/) if ENV['http_proxy']
  # NOTE(review): proxy credentials are read from the target URL's userinfo,
  # not from the proxy URL - confirm this is intended.
  proxy_user, proxy_pass = uri.userinfo.split(/:/) if uri.userinfo
  http = Net::HTTP.Proxy(proxy_addr, proxy_port, proxy_user, proxy_pass).new(uri.host, uri.port)
  http.use_ssl = (uri.port == 443)
  # FIXME(review): certificate verification is disabled while basic-auth
  # credentials are being sent. Left unchanged so existing deployments keep
  # working; switch to OpenSSL::SSL::VERIFY_PEER once CA certificates are
  # confirmed to be available.
  http.verify_mode = OpenSSL::SSL::VERIFY_NONE
  csv_file = Tempfile.new(@file_path_or_url.split('/').last) # create temp file for the raw file.
  # The body is written exactly as received; binary mode avoids newline or
  # encoding conversion on write.
  csv_file.binmode
  # Only the path is returned, so keep a reference to the Tempfile on the
  # instance: an unreferenced Tempfile may be garbage collected, which deletes
  # the file.
  @web_tempfile = csv_file
  http.request(request) do |res|
    raise(ArgumentError, "Invalid URL: #{@file_path_or_url}") if res.kind_of?(Net::HTTPNotFound)
    raise(ArgumentError, 'Authorization Required: Invalid username or password.  Set the variables SsnValidator::Ntis.user_name and SsnValidator::Ntis.password in your environment.rb file.') if res.kind_of?(Net::HTTPUnauthorized)
    size = 0
    # Content-Length is absent for chunked responses; to_i then yields 0.
    total = res['Content-Length'].to_i
    res.read_body do |chunk|
      size += chunk.size
      csv_file.write chunk
      progress = if total > 0
                   '%d%% done (%d of %d)' % [(size * 100) / total, size, total]
                 else
                   # Total size unknown: report bytes only instead of dividing by zero.
                   '%d bytes downloaded' % size
                 end
      puts progress
      yield(progress) if block_given?
    end
  end
  csv_file.rewind # also flushes buffered writes before the path is handed out
  csv_file.path
end

#load_file ⇒ Object



29
30
31
32
33
34
35
36
37
38
# File 'app/models/death_master_file_loader.rb', line 29

# Imports the opened file into the database.
#
# When the configured ActiveRecord adapter name contains "mysql" or "jdbc",
# the file is first converted to csv (convert_file_to_csv) and then imported
# in bulk (bulk_mysql_update). For any other adapter the file is loaded through
# active_record_file_load.
#
# Yields every status string reported along the way when a block is given.
# Returns whatever the import method that ran returns.
def load_file
  adapter_name = ActiveRecord::Base.connection.instance_values['config'][:adapter].to_s

  unless adapter_name.match(/mysql|jdbc/)
    return active_record_file_load { |status| yield status if block_given? }
  end

  notice = 'Converting file to csv format for Mysql import.  This could take several minutes.'
  puts notice
  yield notice if block_given?

  converted = convert_file_to_csv do |status|
    yield status if block_given?
  end
  bulk_mysql_update(converted) do |status|
    yield status if block_given?
  end
end

#valid? ⇒ Boolean

Returns:

  • (Boolean)

Raises:

  • (ArgumentError)


15
16
17
18
19
20
21
22
23
24
25
26
27
# File 'app/models/death_master_file_loader.rb', line 15

# Checks the constructor arguments and opens the input file.
#
# The source is opened into @download_file: directly when @file_path_or_url is
# an existing path on disk, or after downloading it with get_file_from_web when
# it parses as an HTTP(S) URL. Download progress strings are yielded when a
# block is given.
#
# Returns the opened File (always truthy); invalid input is reported by
# raising, never by returning false.
#
# Raises ArgumentError when either argument is missing, or when the database
# already holds data with an as_of date on or after this file's date.
# Raises Errno::ENOENT when the argument is neither an existing file nor an
# HTTP(S) URL.
def valid?
  raise(ArgumentError, 'path_or_url not specified') unless @file_path_or_url
  raise(ArgumentError, 'as_of not specified') unless @file_as_of
  max_as_of = DeathMasterFile.maximum(:as_of)
  # max_as_of is nil while nothing has been loaded; any file is acceptable then.
  raise(ArgumentError, "A more recent file has already been processed.  DB as_of date #{max_as_of}") if max_as_of && (max_as_of >= @file_as_of.to_date)
  # File.exist? rather than File.exists?: the latter was removed in Ruby 3.2.
  if File.exist?(@file_path_or_url)
    @download_file = File.open(@file_path_or_url)
  elsif URI.parse(@file_path_or_url).kind_of?(URI::HTTP)
    @download_file = File.open(get_file_from_web { |status| yield status if block_given? })
  else
    raise(Errno::ENOENT, @file_path_or_url)
  end
end