Class: WaybackMachineDownloader
- Inherits:
-
Object
- Object
- WaybackMachineDownloader
- Defined in:
- lib/wayback_machine_downloader.rb
Constant Summary
- VERSION =
"0.1.16"
Instance Attribute Summary
-
#base_url ⇒ Object
Returns the value of attribute base_url.
-
#timestamp ⇒ Object
Returns the value of attribute timestamp.
Instance Method Summary
- #backup_name ⇒ Object
- #backup_path ⇒ Object
- #download_files ⇒ Object
- #file_list_by_timestamp ⇒ Object
- #get_file_list_curated ⇒ Object
-
#initialize(params) ⇒ WaybackMachineDownloader
constructor
A new instance of WaybackMachineDownloader.
- #structure_dir_path(dir_path) ⇒ Object
Constructor Details
#initialize(params) ⇒ WaybackMachineDownloader
Returns a new instance of WaybackMachineDownloader.
11 12 13 14 |
# File 'lib/wayback_machine_downloader.rb', line 11
#
# Builds a downloader from an options hash.
#
# @param params [Hash] options; reads :base_url and :timestamp.
#   :timestamp is coerced with #to_i, so a missing (nil) value becomes 0.
def initialize(params)
  @base_url = params[:base_url]
  @timestamp = params[:timestamp].to_i
end
Instance Attribute Details
#base_url ⇒ Object
Returns the value of attribute base_url.
9 10 11 |
# File 'lib/wayback_machine_downloader.rb', line 9
#
# Returns the value of attribute base_url.
def base_url
  @base_url
end
#timestamp ⇒ Object
Returns the value of attribute timestamp.
9 10 11 |
# File 'lib/wayback_machine_downloader.rb', line 9
#
# Returns the value of attribute timestamp.
def timestamp
  @timestamp
end
Instance Method Details
#backup_name ⇒ Object
16 17 18 |
# File 'lib/wayback_machine_downloader.rb', line 16
#
# Derives the backup folder name from @base_url: the host part of the URL.
#
# An optional "scheme://" (or bare "//") prefix is stripped before taking the
# first path segment, so "example.com/foo" yields "example.com" just like
# "http://example.com/foo" does.
#
# @return [String, nil] the host segment, or nil when @base_url is empty
def backup_name
  without_scheme = @base_url.sub(%r{\A(?:[a-z][a-z0-9+.-]*:)?//}i, '')
  without_scheme.split('/').first
end
#backup_path ⇒ Object
20 21 22 |
# File 'lib/wayback_machine_downloader.rb', line 20
#
# Relative directory the site is saved into: "websites/<backup_name>/".
#
# Plain concatenation is kept on purpose: a nil backup_name raises TypeError
# instead of silently producing "websites//".
#
# @return [String] path with a trailing slash
def backup_path
  'websites/' + backup_name + '/'
end
#download_files ⇒ Object
61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 |
# File 'lib/wayback_machine_downloader.rb', line 61
#
# Downloads every entry returned by file_list_by_timestamp into backup_path,
# printing one progress line per file and a summary at the end.
#
# Local layout for each entry:
# * empty file id                         -> <backup_path>index.html
# * URL ends with "/" or last path segment
#   has no "."                            -> <backup_path><id>/index.html
# * anything else                         -> <backup_path><id>
#
# Files that already exist locally are skipped. Download errors are printed and
# do not abort the run; on an HTTP error the error response body is what gets
# written to the file.
def download_files
  puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine..."
  puts
  # Fetch the listing once and reuse it for both iteration and the totals.
  file_list = file_list_by_timestamp
  total = file_list.size
  file_list.each_with_index do |file_remote_info, index|
    count = index + 1
    file_url = file_remote_info[:file_url]
    file_id = file_remote_info[:file_id]
    file_path_elements = file_id.split('/')
    if file_id == ""
      dir_path = backup_path
      file_path = backup_path + 'index.html'
    elsif file_url[-1] == '/' || !file_path_elements.last.to_s.include?('.')
      # Directory-like URL: store the page as index.html inside its own folder.
      dir_path = backup_path + file_path_elements.join('/')
      file_path = dir_path + '/index.html'
    else
      dir_path = backup_path + file_path_elements[0..-2].join('/')
      file_path = backup_path + file_path_elements.join('/')
    end
    if File.exist?(file_path)
      puts "#{file_url} # #{file_path} already exists. (#{count}/#{total})"
      next
    end
    begin
      structure_dir_path dir_path
      File.open(file_path, "wb") do |file|
        begin
          # NOTE(review): the timestamp interpolated here was missing from the
          # listing this was restored from; the entry's own capture timestamp is
          # used so the request targets the capture the index selected - confirm.
          snapshot_url = "http://web.archive.org/web/#{file_remote_info[:timestamp]}id_/#{file_url}"
          URI.open(snapshot_url) do |uri|
            file.write(uri.read)
          end
        rescue OpenURI::HTTPError => e
          puts "#{file_url} # #{e}"
          # Keep whatever body the server sent along with the error status.
          file.write(e.io.read)
        rescue StandardError => e
          puts "#{file_url} # #{e}"
        end
      end
    rescue StandardError => e
      puts "#{file_url} # #{e}"
    end
    puts "#{file_url} -> #{file_path} (#{count}/#{total})"
  end
  puts
  puts "Download complete, saved in #{backup_path} (#{total} files)"
end
#file_list_by_timestamp ⇒ Object
52 53 54 55 56 57 58 59 |
# File 'lib/wayback_machine_downloader.rb', line 52
#
# Returns the curated file list as an array ordered from the most recent
# capture to the oldest.
#
# Each element is a copy of the entry from get_file_list_curated with its hash
# key added under :file_id, so the curated hashes themselves are not modified.
#
# @return [Array<Hash>] entries with :file_url, :timestamp and :file_id
def file_list_by_timestamp
  newest_first = get_file_list_curated.sort_by { |_file_id, info| info[:timestamp] }.reverse
  newest_first.map { |file_id, info| info.merge(file_id: file_id) }
end
#get_file_list_curated ⇒ Object
24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
# File 'lib/wayback_machine_downloader.rb', line 24
#
# Builds the curated index of captures for @base_url from the Wayback Machine
# CDX listing.
#
# Two listings are read: one for @base_url itself and one for "@base_url/*".
# Each line is split on whitespace; field 1 is read as the capture timestamp and
# field 2 as the captured URL. The file id is the unescaped URL path after the
# host. For every file id the newest capture is kept, limited to captures no
# newer than @timestamp unless @timestamp is 0.
#
# Lines whose URL has no host part are reported and skipped.
#
# @return [Hash{String => Hash}] file id => { file_url:, timestamp: }
def get_file_list_curated
  file_list_curated = {}
  [@base_url.to_s, "#{@base_url}/*"].each do |query|
    # Block form so each listing is closed as soon as it has been read.
    URI.open("http://web.archive.org/cdx/search/xd?url=#{query}") do |listing|
      listing.each_line do |line|
        fields = line.split(' ')
        file_timestamp = fields[1].to_i
        file_url = fields[2]
        # nil when the URL has fewer than three "/"-separated parts (or is absent).
        path_elements = file_url.to_s.split('/')[3..-1]
        if path_elements.nil?
          puts "Malformed file url, ignoring: #{file_url}"
          next
        end
        file_id = URI::DEFAULT_PARSER.unescape(path_elements.join('/'))
        file_id = file_id.tidy_bytes unless file_id == ""
        next unless @timestamp == 0 || file_timestamp <= @timestamp
        known = file_list_curated[file_id]
        # Keep the entry already stored when it is strictly newer.
        next if known && known[:timestamp] > file_timestamp
        file_list_curated[file_id] = { file_url: file_url, timestamp: file_timestamp }
      end
    end
  end
  file_list_curated
end
#structure_dir_path(dir_path) ⇒ Object
108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
# File 'lib/wayback_machine_downloader.rb', line 108
#
# Ensures dir_path exists as a directory.
#
# When a regular file sits where a directory is needed (mkdir_p raises
# Errno::EEXIST), that file is moved to "<path>/index.html" inside a newly
# created directory of the same name, and the creation is retried.
#
# The blocking file is located by testing each prefix of dir_path rather than
# by parsing the exception message.
#
# @param dir_path [String] directory to create, with "/" separators
# @raise [Errno::EEXIST] when mkdir_p fails and no prefix of dir_path is a regular file
def structure_dir_path(dir_path)
  # directory? (not exist?) so a regular file at dir_path also reaches the rescue below.
  FileUtils.mkdir_p(dir_path) unless File.directory?(dir_path)
rescue Errno::EEXIST => e
  puts "# #{e}"
  segments = dir_path.split('/')
  prefixes = (1..segments.size).map { |depth| segments.first(depth).join('/') }
  file_already_existing = prefixes.find { |candidate| File.file?(candidate) }
  raise if file_already_existing.nil?
  file_already_existing_temporary = file_already_existing + '.temp'
  file_already_existing_permanent = file_already_existing + '/index.html'
  FileUtils.mv(file_already_existing, file_already_existing_temporary)
  FileUtils.mkdir_p(file_already_existing)
  FileUtils.mv(file_already_existing_temporary, file_already_existing_permanent)
  puts "#{file_already_existing} -> #{file_already_existing_permanent}"
  # Retry: a deeper component may be blocked by another file.
  structure_dir_path(dir_path)
end