Class: DeathMasterFileLoader
- Inherits:
-
Object
- Object
- DeathMasterFileLoader
- Defined in:
- app/models/death_master_file_loader.rb
Class Method Summary collapse
-
.load_update_files_from_web ⇒ Object
Loads all the update files from dmf.ntis.gov.
Instance Method Summary collapse
-
#convert_file_to_csv {|"File conversion ran for #{(Time.now - start) / 60.0} minutes."| ... } ⇒ Object
Processes 28 million rows in 23 minutes.
- #get_file_from_web ⇒ Object
-
#initialize(path_or_url, file_as_of) ⇒ DeathMasterFileLoader
constructor
path_or_url is the full path to the file to load on disk, or the url of an update file.
- #load_file ⇒ Object
- #valid? ⇒ Boolean
Constructor Details
#initialize(path_or_url, file_as_of) ⇒ DeathMasterFileLoader
path_or_url is the full path to the file to load on disk, or the URL of an update file. file_as_of is a string in the format YYYY-MM-DD giving the date as of which the file data is accurate.
9 10 11 12 13 |
# File 'app/models/death_master_file_loader.rb', line 9
#
# Builds a loader for a single death master file.
#
# path_or_url - path of a file on disk, or the URL of an update file.
# file_as_of  - the as-of date of the file data; valid? calls #to_date on it.
#
# The arguments are stored and then checked immediately via valid?, so any
# error valid? raises surfaces from the constructor. Status messages produced
# while validating are passed on to the caller's block, when one is given.
def initialize(path_or_url, file_as_of)
  @file_path_or_url, @file_as_of = path_or_url, file_as_of
  valid? do |status|
    yield status if block_given?
  end
end
Class Method Details
.load_update_files_from_web ⇒ Object
Loads all the update files from dmf.ntis.gov. It starts with the last file loaded, and loads each missing file in sequence up to the current file.
68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 |
# File 'app/models/death_master_file_loader.rb', line 68
#
# Loads every missing monthly update file from dmf.ntis.gov.
#
# Starts with the month after the latest as_of date stored in DeathMasterFile
# and loads one file per month, in sequence, up to and including the current
# month. Each status message is printed and also yielded to the caller's
# block, when one is given.
#
# Raises ArgumentError when DeathMasterFile holds no as_of date, because there
# is then no month to resume from.
def self.load_update_files_from_web
  max_as_of = DeathMasterFile.maximum(:as_of)
  # An empty table yields nil here; fail with a clear message instead of a
  # NoMethodError on nil further down.
  unless max_as_of
    raise(ArgumentError, 'No death master file has been loaded yet. Load a base file before loading update files.')
  end

  run_file_date = max_as_of.beginning_of_month.next_month
  last_file_date = Date.today.beginning_of_month
  while run_file_date <= last_file_date
    url = "https://dmf.ntis.gov/dmldata/monthly/MA#{run_file_date.strftime('%y%m%d')}"
    message = "Loading file #{url}"
    puts message
    yield message if block_given?
    dmf = DeathMasterFileLoader.new(url, run_file_date.strftime('%Y-%m-%d')) { |status| yield status if block_given? }
    dmf.load_file { |status| yield status if block_given? }
    run_file_date += 1.month
  end
end
Instance Method Details
#convert_file_to_csv {|"File conversion ran for #{(Time.now - start) / 60.0} minutes."| ... } ⇒ Object
Processes 28 million rows in 23 minutes (input file: 2.6GB; output file: 2.9GB). Used to convert a packed fixed-length file into CSV for MySQL import.
86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
# File 'app/models/death_master_file_loader.rb', line 86
#
# Converts the opened data file into a temp file of backtick-quoted,
# comma-separated rows for bulk import.
#
# Rows whose record action is 'D' are not written; their social security
# numbers are collected in @delete_ssns instead. Progress and the total run
# time are printed and also yielded to the caller's block, when one is given.
#
# Returns the Tempfile holding the converted rows.
def convert_file_to_csv
  output = Tempfile.new('dmf') # temp file for the converted csv format.
  started_at = Time.now
  stamp = started_at.to_s(:db)
  # Columns copied straight from the parsed record, in output order.
  record_columns = [
    :social_security_number,
    :last_name,
    :name_suffix,
    :first_name,
    :middle_name,
    :verify_proof_code,
    :date_of_death,           # need YYYY-MM-DD.
    :date_of_birth,           # need YYYY-MM-DD.
    :state_of_residence,      # must be code between 01 and 65 or else nil.
    :last_known_zip_residence,
    :last_known_zip_payment
  ]
  elapsed_note = -> { "File conversion ran for #{(Time.now - started_at) / 60.0} minutes." }

  @delete_ssns = []
  @download_file.each_with_index do |raw, index|
    op = record_action(raw)
    record = text_to_hash(raw)

    if op == 'D'
      # Remember every record to delete; they are removed together later.
      @delete_ssns << record[:social_security_number]
      next
    end

    # The leading empty cell is the id, which mysql generates itself.
    # created_at and updated_at both take the conversion start time.
    cells = [''] + record_columns.map { |column| record[column] } + [stamp, stamp, record[:as_of]]
    output.syswrite(cells.map { |cell| "`#{cell}`" }.join(',') + "\n")

    next unless (index % 25000 == 0) && (index > 0)

    progress = "#{index} records processed."
    puts progress
    yield progress if block_given?
  end

  puts elapsed_note.call
  yield elapsed_note.call if block_given?
  output
end
#get_file_from_web ⇒ Object
40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
# File 'app/models/death_master_file_loader.rb', line 40
#
# Downloads the file at @file_path_or_url into a temp file, using HTTP basic
# authentication with the SsnValidator::Ntis credentials and, when the
# http_proxy environment variable is set, that proxy.
#
# Download progress is printed and also yielded to the caller's block, when
# one is given.
#
# Returns the path of the temp file holding the downloaded data.
#
# Raises ArgumentError when the server answers 404 (invalid URL) or 401
# (bad credentials).
def get_file_from_web
  uri = URI.parse(@file_path_or_url)

  request = Net::HTTP::Get.new(uri.request_uri)
  request.basic_auth(SsnValidator::Ntis.user_name, SsnValidator::Ntis.password)

  proxy_addr, proxy_port = ENV['http_proxy'].gsub('http://', '').split(/:/) if ENV['http_proxy']
  proxy_user, proxy_pass = uri.userinfo.split(/:/) if uri.userinfo

  http = Net::HTTP.Proxy(proxy_addr, proxy_port, proxy_user, proxy_pass).new(uri.host, uri.port)
  http.use_ssl = (uri.port == 443)
  # NOTE(review): certificate verification is switched off, so the basic-auth
  # credentials above are sent to an unverified peer. Left unchanged to avoid
  # breaking existing setups; consider OpenSSL::SSL::VERIFY_PEER.
  http.verify_mode = OpenSSL::SSL::VERIFY_NONE

  csv_file = Tempfile.new(@file_path_or_url.split('/').last) # temp file for the raw file.
  # Only the path is returned, so keep the Tempfile referenced from the
  # loader: a Tempfile's backing file is deleted once the object is garbage
  # collected, which could otherwise happen before the caller opens the path.
  @download_tempfile = csv_file

  http.request(request) do |res|
    raise(ArgumentError, "Invalid URL: #{@file_path_or_url}") if res.kind_of?(Net::HTTPNotFound)
    raise(ArgumentError, 'Authorization Required: Invalid username or password. Set the variables SsnValidator::Ntis.user_name and SsnValidator::Ntis.password in your environment.rb file.') if res.kind_of?(Net::HTTPUnauthorized)

    size = 0
    total = res['Content-Length'].to_i # 0 when the header is absent
    res.read_body do |chunk|
      size += chunk.bytesize
      csv_file.write chunk
      # Without a usable Content-Length a percentage cannot be computed
      # (it would divide by zero), so report the byte count alone.
      progress = if total > 0
                   '%d%% done (%d of %d)' % [(size * 100) / total, size, total]
                 else
                   '%d bytes downloaded' % size
                 end
      puts progress
      yield(progress) if block_given?
    end
  end

  # Push buffered bytes out to the file before handing back its path.
  csv_file.flush
  csv_file.rewind
  csv_file.path
end
#load_file ⇒ Object
29 30 31 32 33 34 35 36 37 38 |
# File 'app/models/death_master_file_loader.rb', line 29
#
# Loads the opened data file into the database.
#
# When the configured ActiveRecord adapter name matches mysql or jdbc, the
# file is first converted with convert_file_to_csv and then handed to
# bulk_mysql_update. Any other adapter goes through active_record_file_load.
# Status messages are passed on to the caller's block, when one is given.
def load_file
  relay = proc { |status| yield status if block_given? }
  adapter = ActiveRecord::Base.connection.instance_values['config'][:adapter].to_s

  # Every adapter other than mysql/jdbc takes the plain ActiveRecord path.
  return active_record_file_load(&relay) unless adapter =~ /mysql|jdbc/

  notice = 'Converting file to csv format for Mysql import. This could take several minutes.'
  puts notice
  yield notice if block_given?

  bulk_mysql_update(convert_file_to_csv(&relay), &relay)
end
#valid? ⇒ Boolean
15 16 17 18 19 20 21 22 23 24 25 26 27 |
# File 'app/models/death_master_file_loader.rb', line 15
#
# Checks the constructor arguments and opens the data file.
#
# When @file_path_or_url names an existing file it is opened directly. When
# it parses as an HTTP(S) URL the file is first downloaded with
# get_file_from_web, with download status passed on to the caller's block,
# when one is given. The opened file is stored in @download_file.
#
# Returns the opened File (truthy); every failure is reported by raising.
#
# Raises ArgumentError when the path/URL or the as-of date is missing, or
# when DeathMasterFile already holds an as_of date on or after this file's.
# Raises Errno::ENOENT when the value is neither an existing file nor an
# HTTP(S) URL.
def valid?
  raise(ArgumentError, 'path_or_url not specified') unless @file_path_or_url
  raise(ArgumentError, 'as_of not specified') unless @file_as_of

  max_as_of = DeathMasterFile.maximum(:as_of)
  raise(ArgumentError, "A more recent file has already been processed. DB as_of date #{max_as_of}") if max_as_of && (max_as_of >= @file_as_of.to_date)

  # File.exist? rather than File.exists?: the plural alias was deprecated and
  # then removed in Ruby 3.2, where calling it raises NoMethodError.
  if File.exist?(@file_path_or_url)
    @download_file = File.open(@file_path_or_url)
  elsif URI.parse(@file_path_or_url).kind_of?(URI::HTTP)
    @download_file = File.open(get_file_from_web { |status| yield status if block_given? })
  else
    raise(Errno::ENOENT, @file_path_or_url)
  end
end