Class: WaybackMachineDownloader

Inherits:
Object
  • Object
show all
Defined in:
lib/wayback_machine_downloader.rb

Constant Summary collapse

# Release version identifier of WaybackMachineDownloader.
VERSION = "0.1.16"

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(params) ⇒ WaybackMachineDownloader

Returns a new instance of WaybackMachineDownloader.



11
12
13
14
# File 'lib/wayback_machine_downloader.rb', line 11

# Builds a downloader from an options hash.
#
# @param params [Hash] options; +:base_url+ is stored untouched and
#   +:timestamp+ is coerced with +to_i+ (so a missing/nil timestamp becomes 0)
def initialize(params)
  requested_timestamp = params[:timestamp]
  @base_url = params[:base_url]
  @timestamp = requested_timestamp.to_i
end

Instance Attribute Details

#base_url ⇒ Object

Returns the value of attribute base_url.



9
10
11
# File 'lib/wayback_machine_downloader.rb', line 9

# Reader for the +@base_url+ instance variable, which the constructor
# assigns from +params[:base_url]+.
#
# @return [Object] the stored value (nil if it was never assigned)
def base_url
  @base_url
end

#timestamp ⇒ Object

Returns the value of attribute timestamp.



9
10
11
# File 'lib/wayback_machine_downloader.rb', line 9

# Reader for the +@timestamp+ instance variable, which the constructor
# assigns from +params[:timestamp].to_i+.
#
# @return [Object] the stored value (nil if it was never assigned)
def timestamp
  @timestamp
end

Instance Method Details

#backup_name ⇒ Object



16
17
18
# File 'lib/wayback_machine_downloader.rb', line 16

# Name of the backup, taken from the host part of +@base_url+.
#
# When the URL contains "//" (e.g. "http://example.com/blog") the third
# "/"-separated component is used, exactly as before. A URL given without a
# scheme (e.g. "example.com" or "example.com/blog") has no "//", so its first
# component is the host; previously such input produced nil or a path segment.
#
# @return [String, nil] the host component, or nil when it cannot be found
def backup_name
  url_parts = @base_url.split('/')
  @base_url.include?('//') ? url_parts[2] : url_parts[0]
end

#backup_path ⇒ Object



20
21
22
# File 'lib/wayback_machine_downloader.rb', line 20

# Relative directory the site is saved under: "websites/<backup_name>/".
#
# @return [String] the path, always ending in "/"
def backup_path
  site_directory = backup_name
  'websites/' + site_directory + '/'
end

#download_files ⇒ Object



61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# File 'lib/wayback_machine_downloader.rb', line 61

# Downloads every file returned by +file_list_by_timestamp+ into
# +backup_path+, printing one progress line per file.
#
# For each entry the local target is derived from its +:file_id+:
# * empty id                      -> <backup_path>index.html
# * URL ends in "/" or the last
#   path element has no "."       -> <backup_path><id>/index.html
# * anything else                 -> <backup_path><id>
#
# Files that already exist locally are skipped. HTTP errors are reported and
# the error response body is saved in place of the file; any other error is
# reported and the loop moves on to the next file.
#
# @return [nil]
def download_files
  puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine..."
  puts
  # Build the listing once; it is reused for both iteration and the totals
  # (previously the remote listing was fetched a second time just for its size).
  file_list = file_list_by_timestamp
  total = file_list.size
  file_list.each_with_index do |file_remote_info, index|
    count = index + 1
    file_url = file_remote_info[:file_url]
    file_id = file_remote_info[:file_id]
    file_path_elements = file_id.split('/')
    if file_path_elements.empty?
      # Covers the empty id as well as ids made only of "/" characters, which
      # would otherwise reach the next branch with no last element to inspect.
      dir_path = backup_path
      file_path = backup_path + 'index.html'
    elsif file_url[-1] == '/' || !file_path_elements[-1].include?('.')
      dir_path = backup_path + file_path_elements.join('/')
      file_path = dir_path + '/index.html'
    else
      dir_path = backup_path + file_path_elements[0..-2].join('/')
      file_path = backup_path + file_path_elements.join('/')
    end
    if File.exist?(file_path)
      puts "#{file_url} # #{file_path} already exists. (#{count}/#{total})"
      next
    end
    begin
      structure_dir_path dir_path
      # NOTE(review): the local file is created before the request is made, so
      # a non-HTTP failure leaves an empty file that later runs will treat as
      # "already exists".
      File.open(file_path, 'wb') do |file|
        begin
          # Request the capture the listing actually selected for this file
          # rather than the global timestamp, which is 0 when none was given.
          snapshot_url = "http://web.archive.org/web/#{file_remote_info[:timestamp]}id_/#{file_url}"
          URI.parse(snapshot_url).open do |uri|
            file.write(uri.read)
          end
        rescue OpenURI::HTTPError => e
          puts "#{file_url} # #{e}"
          file.write(e.io.read)
        rescue StandardError => e
          puts "#{file_url} # #{e}"
        end
      end
    rescue StandardError => e
      puts "#{file_url} # #{e}"
    end
    puts "#{file_url} -> #{file_path} (#{count}/#{total})"
  end
  puts
  puts "Download complete, saved in #{backup_path} (#{total} files)"
end

#file_list_by_timestamp ⇒ Object



52
53
54
55
56
57
58
59
# File 'lib/wayback_machine_downloader.rb', line 52

# Flattens the curated file list into an array ordered by timestamp,
# largest first.
#
# Each element is the info hash from +get_file_list_curated+ with its hash
# key added under +:file_id+ (the info hash is modified in place).
#
# @return [Array<Hash>] hashes with +:file_url+, +:timestamp+ and +:file_id+
def file_list_by_timestamp
  oldest_first = get_file_list_curated.sort_by { |_file_id, info| info[:timestamp] }
  oldest_first.reverse.map do |file_id, info|
    info[:file_id] = file_id
    info
  end
end

#get_file_list_curated ⇒ Object



24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# File 'lib/wayback_machine_downloader.rb', line 24

# Fetches the archive listings for +@base_url+ and for everything under
# "+@base_url+/*", and reduces them to one entry per file.
#
# Each listing line is split on whitespace; field 1 is read as the capture
# timestamp and field 2 as the captured URL. The file id is the URL with its
# first three "/"-separated components removed, percent-unescaped and passed
# through +tidy_bytes+. When +@timestamp+ is non-zero, captures newer than it
# are ignored. For every id the capture with the greatest timestamp wins
# (on a tie, the one seen later).
#
# Blank lines are skipped; lines whose URL is missing or has fewer than three
# "/"-separated components are reported as malformed and skipped instead of
# raising NoMethodError.
#
# @return [Hash{String => Hash}] file id => +{file_url:, timestamp:}+
def get_file_list_curated
  listing_urls = [
    "http://web.archive.org/cdx/search/xd?url=#{@base_url}",
    "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
  ]
  file_list_curated = {}
  listing_urls.each do |listing_url|
    # The block form closes the response once it has been read. URI#open comes
    # from open-uri and is what Kernel#open used to dispatch to for URL strings.
    URI.parse(listing_url).open do |listing|
      listing.each_line do |line|
        fields = line.split(' ')
        next if fields.empty?
        file_timestamp = fields[1].to_i
        file_url = fields[2]
        # Array#[] with 3..-1 returns nil when there are fewer than three parts.
        path_elements = file_url && file_url.split('/')[3..-1]
        if path_elements.nil?
          puts "Malformed file url, ignoring: #{file_url}"
          next
        end
        file_id = URI::DEFAULT_PARSER.unescape(path_elements.join('/'))
        file_id = file_id.tidy_bytes unless file_id == ""
        next unless @timestamp == 0 || file_timestamp <= @timestamp
        existing = file_list_curated[file_id]
        if existing.nil? || existing[:timestamp] <= file_timestamp
          file_list_curated[file_id] = { file_url: file_url, timestamp: file_timestamp }
        end
      end
    end
  end
  file_list_curated
end

#structure_dir_path(dir_path) ⇒ Object



108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# File 'lib/wayback_machine_downloader.rb', line 108

# Ensures +dir_path+ exists as a directory, creating parents as needed.
#
# If creation fails with Errno::EEXIST because a regular file sits where a
# directory is required, that file is moved to "<its path>/index.html" (via a
# temporary "<its path>.temp" name) and creation is retried.
#
# The blocking file is found by walking up from +dir_path+ on the filesystem
# rather than by parsing the exception text, whose wording is not something
# this code controls.
#
# @param dir_path [String] directory to create
# @raise [Errno::EEXIST] when no regular file is found on the way up
def structure_dir_path(dir_path)
  FileUtils.mkdir_p(dir_path) unless File.exist?(dir_path)
rescue Errno::EEXIST => e
  puts "# #{e}"
  # Strip trailing slashes so the first candidate can be tested as a file.
  file_already_existing = dir_path.sub(%r{/+\z}, '')
  until File.file?(file_already_existing)
    parent = File.dirname(file_already_existing)
    # Reached the top without finding a blocking file: re-raise the original.
    raise if parent == file_already_existing
    file_already_existing = parent
  end
  file_already_existing_temporary = file_already_existing + '.temp'
  file_already_existing_permanent = file_already_existing + '/index.html'
  FileUtils.mv file_already_existing, file_already_existing_temporary
  FileUtils.mkdir_p file_already_existing
  FileUtils.mv file_already_existing_temporary, file_already_existing_permanent
  puts "#{file_already_existing} -> #{file_already_existing_permanent}"
  structure_dir_path dir_path
end