Class: WaybackMachineDownloader

Inherits:
Object
  • Object
show all
Includes:
ArchiveAPI
Defined in:
lib/wayback_machine_downloader.rb

Constant Summary collapse

VERSION =
"2.2.1"

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from ArchiveAPI

#get_raw_list_from_api, #parameters_for_api

Constructor Details

#initialize(params) ⇒ WaybackMachineDownloader

Returns a new instance of WaybackMachineDownloader.



23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/wayback_machine_downloader.rb', line 23

# Builds a downloader from an options hash.
#
# @param params [Hash] options read by symbol key: :base_url, :exact_url,
#   :directory, :all_timestamps, :from_timestamp, :to_timestamp,
#   :only_filter, :exclude_filter, :all, :maximum_pages, :threads_count
def initialize(params)
  # What to fetch and where to store it.
  @base_url = params[:base_url]
  @exact_url = params[:exact_url]
  @directory = params[:directory]

  # Snapshot selection.
  @all_timestamps = params[:all_timestamps]
  @all = params[:all]
  @only_filter = params[:only_filter]
  @exclude_filter = params[:exclude_filter]

  # Numeric options are coerced with to_i, so a missing value becomes 0.
  @from_timestamp = params[:from_timestamp].to_i
  @to_timestamp = params[:to_timestamp].to_i
  @threads_count = params[:threads_count].to_i

  # The page limit is the only option with a non-zero default.
  page_limit = params[:maximum_pages]
  @maximum_pages = page_limit ? page_limit.to_i : 100
end

Instance Attribute Details

#all ⇒ Object

Returns the value of attribute all.



19
20
21
# File 'lib/wayback_machine_downloader.rb', line 19

# Reader for the :all option stored by #initialize.
# When truthy, #download_file saves the body of HTTP error responses and
# keeps zero-byte files instead of deleting them.
def all
  @all
end

#all_timestamps ⇒ Object

Returns the value of attribute all_timestamps.



19
20
21
# File 'lib/wayback_machine_downloader.rb', line 19

# Reader for the :all_timestamps option stored by #initialize.
# When truthy, #get_file_list_by_timestamp lists every file/timestamp pair
# instead of only the most recent snapshot of each file.
def all_timestamps
  @all_timestamps
end

#base_url ⇒ Object

Returns the value of attribute base_url.



19
20
21
# File 'lib/wayback_machine_downloader.rb', line 19

# Reader for the :base_url option stored by #initialize.
# It is the URL queried in #get_all_snapshots_to_consider and the source of
# the default backup folder name in #backup_name.
def base_url
  @base_url
end

#directory ⇒ Object

Returns the value of attribute directory.



19
20
21
# File 'lib/wayback_machine_downloader.rb', line 19

# Reader for the :directory option stored by #initialize.
# When set, #backup_path uses it (with a trailing '/') as the destination
# instead of the default 'websites/<backup_name>/'.
def directory
  @directory
end

#exact_url ⇒ Object

Returns the value of attribute exact_url.



19
20
21
# File 'lib/wayback_machine_downloader.rb', line 19

# Reader for the :exact_url option stored by #initialize.
# When truthy, #get_all_snapshots_to_consider queries only the base URL and
# skips the paged '<base_url>/*' wildcard requests.
def exact_url
  @exact_url
end

#exclude_filter ⇒ Object

Returns the value of attribute exclude_filter.



19
20
21
# File 'lib/wayback_machine_downloader.rb', line 19

# Reader for the :exclude_filter option stored by #initialize.
# #match_exclude_filter applies it to each file URL; matching URLs are
# dropped from the file lists.
def exclude_filter
  @exclude_filter
end

#from_timestamp ⇒ Object

Returns the value of attribute from_timestamp.



19
20
21
# File 'lib/wayback_machine_downloader.rb', line 19

# Reader for the :from_timestamp option, coerced with to_i by #initialize
# (so 0 when the option was not given). #download_files treats 0 as "not set"
# when printing hints.
# NOTE(review): presumably consumed by ArchiveAPI#parameters_for_api as the
# lower bound of the query — confirm there.
def from_timestamp
  @from_timestamp
end

#maximum_pages ⇒ Object

Returns the value of attribute maximum_pages.



19
20
21
# File 'lib/wayback_machine_downloader.rb', line 19

# Reader for the :maximum_pages option; #initialize coerces it with to_i and
# defaults it to 100. It caps the number of wildcard page requests made by
# #get_all_snapshots_to_consider.
def maximum_pages
  @maximum_pages
end

#only_filter ⇒ Object

Returns the value of attribute only_filter.



19
20
21
# File 'lib/wayback_machine_downloader.rb', line 19

# Reader for the :only_filter option stored by #initialize.
# #match_only_filter applies it to each file URL; URLs that do not match are
# dropped from the file lists.
def only_filter
  @only_filter
end

#threads_count ⇒ Object

Returns the value of attribute threads_count.



19
20
21
# File 'lib/wayback_machine_downloader.rb', line 19

# Reader for the :threads_count option, coerced with to_i by #initialize.
# #download_files starts this many worker threads, substituting 1 when the
# value is 0.
def threads_count
  @threads_count
end

#to_timestamp ⇒ Object

Returns the value of attribute to_timestamp.



19
20
21
# File 'lib/wayback_machine_downloader.rb', line 19

# Reader for the :to_timestamp option, coerced with to_i by #initialize
# (so 0 when the option was not given). #download_files treats 0 as "not set"
# when printing hints.
# NOTE(review): presumably consumed by ArchiveAPI#parameters_for_api as the
# upper bound of the query — confirm there.
def to_timestamp
  @to_timestamp
end

Instance Method Details

#backup_name ⇒ Object



37
38
39
40
41
42
43
# File 'lib/wayback_machine_downloader.rb', line 37

# Name of the default backup folder, derived from the base URL.
#
# A URL containing '//' yields its third '/'-separated element (the host in
# "scheme://host/path"); any other value is returned unchanged.
def backup_name
  return @base_url unless @base_url.include?('//')

  url_segments = @base_url.split('/')
  url_segments[2]
end

#backup_path ⇒ Object



45
46
47
48
49
50
51
52
53
54
55
# File 'lib/wayback_machine_downloader.rb', line 45

# Destination directory for downloaded files, always ending in '/'.
#
# Uses the configured directory when there is one; otherwise falls back to
# 'websites/<backup_name>/'.
def backup_path
  return 'websites/' + backup_name + '/' unless @directory

  @directory.end_with?('/') ? @directory : @directory + '/'
end

#download_file(file_remote_info) ⇒ Object



246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
# File 'lib/wayback_machine_downloader.rb', line 246

# Downloads a single archived file into the backup directory.
#
# The local path is derived from the file id:
# * an empty id is stored as <backup_path>index.html;
# * a URL ending in '/' or an id whose last segment contains no '.' is treated
#   as a directory and stored as <id>/index.html;
# * anything else is stored under its own path.
# On Windows, the characters : * ? & = < > \ | are percent-encoded in the path.
#
# Nothing is fetched when the target file already exists. Fetch and write
# errors are printed, not raised. Unless @all is set, a file left empty by a
# failed fetch is deleted; when @all is set, the body of an HTTP error
# response is saved instead. The processed-file counter is incremented under
# the semaphore in every case.
#
# @param file_remote_info [Hash] must provide :file_url, :file_id and :timestamp
def download_file file_remote_info
  current_encoding = "".encoding
  file_url = file_remote_info[:file_url].encode(current_encoding)
  file_id = file_remote_info[:file_id]
  file_timestamp = file_remote_info[:timestamp]
  file_path_elements = file_id.split('/')
  relative_path = file_path_elements.join('/')
  # NOTE(review): file_id originates from the remote listing and is joined into
  # a local path as-is; '..' segments are not rejected here — confirm whether
  # callers sanitize it.
  if file_id == ""
    dir_path = backup_path
    file_path = backup_path + 'index.html'
  elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
    dir_path = backup_path + relative_path
    file_path = backup_path + relative_path + '/index.html'
  else
    dir_path = backup_path + file_path_elements[0..-2].join('/')
    file_path = backup_path + relative_path
  end
  if Gem.win_platform?
    dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
    file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
  end
  unless File.exist? file_path
    begin
      structure_dir_path dir_path
      # File.open, not Kernel#open: the latter interprets a leading "|" as a
      # command to run, which must never happen for a path built from remote data.
      File.open(file_path, "wb") do |file|
        begin
          # URI#open (from open-uri) fetches the URL on every Ruby version;
          # a bare open("http://...") stopped doing so in Ruby 3.0.
          URI("http://web.archive.org/web/#{file_timestamp}id_/#{file_url}").open("Accept-Encoding" => "plain") do |uri|
            file.write(uri.read)
          end
        rescue OpenURI::HTTPError => e
          puts "#{file_url} # #{e}"
          if @all
            file.write(e.io.read)
            puts "#{file_path} saved anyway."
          end
        rescue StandardError => e
          puts "#{file_url} # #{e}"
        end
      end
    rescue StandardError => e
      puts "#{file_url} # #{e}"
    ensure
      # A failed fetch leaves a zero-byte file behind; drop it unless the user
      # asked to keep everything.
      if not @all and File.exist?(file_path) and File.size(file_path) == 0
        File.delete(file_path)
        puts "#{file_path} was empty and was removed."
      end
    end
    semaphore.synchronize do
      @processed_file_count += 1
      puts "#{file_url} -> #{file_path} (#{@processed_file_count}/#{file_list_by_timestamp.size})"
    end
  else
    semaphore.synchronize do
      @processed_file_count += 1
      puts "#{file_url} # #{file_path} already exists. (#{@processed_file_count}/#{file_list_by_timestamp.size})"
    end
  end
end

#download_files ⇒ Object



187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
# File 'lib/wayback_machine_downloader.rb', line 187

# Downloads every file in #file_list_by_timestamp using a pool of worker
# threads, printing progress and a final summary.
#
# When the list is empty, prints the likely reasons and returns without
# downloading. A thread count below 1 is replaced by 1 so that at least one
# worker always runs.
def download_files
  start_time = Time.now
  puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
  puts

  if file_list_by_timestamp.count == 0
    puts "No files to download."
    puts "Possible reasons:"
    puts "\t* Site is not in Wayback Machine Archive."
    puts "\t* From timestamp too much in the future." if @from_timestamp and @from_timestamp != 0
    puts "\t* To timestamp too much in the past." if @to_timestamp and @to_timestamp != 0
    puts "\t* Only filter too restrictive (#{@only_filter})" if @only_filter
    puts "\t* Exclude filter too wide (#{@exclude_filter})" if @exclude_filter
    return
  end

  puts "#{file_list_by_timestamp.count} files to download:"

  @processed_file_count = 0
  # Zero, negative or nil counts would start no workers at all.
  @threads_count = [@threads_count.to_i, 1].max

  # Both helpers memoize lazily with ||=, which is not atomic. Materialize
  # them here, before any worker exists, so every thread shares one queue and
  # one mutex.
  pending_files = file_queue
  semaphore

  threads = Array.new(@threads_count) do
    Thread.new do
      until pending_files.empty?
        # Another worker may take the last item between empty? and pop.
        file_remote_info = begin
          pending_files.pop(true)
        rescue ThreadError
          nil
        end
        download_file(file_remote_info) if file_remote_info
      end
    end
  end

  threads.each(&:join)
  end_time = Time.now
  puts
  puts "Download completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path} (#{file_list_by_timestamp.size} files)"
end

#file_list_by_timestamp ⇒ Object



308
309
310
# File 'lib/wayback_machine_downloader.rb', line 308

# Memoized result of #get_file_list_by_timestamp: the list is built on the
# first call and the same object is returned afterwards.
def file_list_by_timestamp
  return @file_list_by_timestamp if @file_list_by_timestamp

  @file_list_by_timestamp = get_file_list_by_timestamp
end

#file_queue ⇒ Object



304
305
306
# File 'lib/wayback_machine_downloader.rb', line 304

# Memoized Queue holding every entry of #file_list_by_timestamp, in list
# order. Built on first use; later calls return the same queue.
def file_queue
  return @file_queue if @file_queue

  pending = Queue.new
  file_list_by_timestamp.each { |file_info| pending << file_info }
  @file_queue = pending
end

#get_all_snapshots_to_consider ⇒ Object



83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# File 'lib/wayback_machine_downloader.rb', line 83

# Fetches the raw snapshot listing for the base URL and returns it as one
# string, printing a progress dot per request.
#
# The base URL itself is always queried once. Unless @exact_url is set, the
# wildcard '<base_url>/*' is then queried page by page, up to @maximum_pages
# pages, stopping at the first empty page.
#
# @return [String] the concatenated raw listings
def get_all_snapshots_to_consider
  # Note: Passing a page index parameter allow us to get more snapshots,
  # but from a less fresh index
  print "Getting snapshot pages"
  # Append in place: rebuilding the string with += for every page copies the
  # whole listing each time.
  snapshot_list_to_consider = "".dup
  snapshot_list_to_consider << get_raw_list_from_api(@base_url, nil)
  print "."
  unless @exact_url
    @maximum_pages.times do |page_index|
      snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
      break if snapshot_list.empty?
      snapshot_list_to_consider << snapshot_list
      print "."
    end
  end
  puts " found #{snapshot_list_to_consider.lines.count} snapshots to consider."
  puts
  snapshot_list_to_consider
end

#get_file_list_all_timestamps ⇒ Object



131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# File 'lib/wayback_machine_downloader.rb', line 131

# Builds the list of every archived file at every timestamp.
#
# Each listing line is read as a 14-character timestamp, a separator, and the
# file URL. The file id is the URL path after "scheme://host/"; the hash key
# is "<timestamp>/<file id>", URL-unescaped. URLs rejected by the exclude or
# only filters are skipped with a message, as are URLs that have no
# "scheme://host/" prefix to strip.
#
# @return [Hash{String => Hash}] key => { file_url:, timestamp: }
def get_file_list_all_timestamps
  file_list_curated = Hash.new
  get_all_snapshots_to_consider.each_line do |line|
    next unless line.include?('/')
    file_timestamp = line[0..13].to_i
    # chomp rather than dropping the last character, so a final line without a
    # trailing newline keeps its whole URL.
    file_url = line[15..-1].to_s.chomp
    # nil when the URL has fewer than three '/'-separated parts; checked before
    # join so the malformed case is reported instead of raising.
    path_elements = file_url.split('/')[3..-1]
    if path_elements.nil?
      puts "Malformed file url, ignoring: #{file_url}"
    else
      file_id = path_elements.join('/')
      file_id_and_timestamp = [file_timestamp, file_id].join('/')
      file_id_and_timestamp = CGI::unescape file_id_and_timestamp
      file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
      if match_exclude_filter(file_url)
        puts "File url matches exclude filter, ignoring: #{file_url}"
      elsif not match_only_filter(file_url)
        puts "File url doesn't match only filter, ignoring: #{file_url}"
      elsif file_list_curated[file_id_and_timestamp]
        puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
      else
        file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
      end
    end
  end
  puts "file_list_curated: " + file_list_curated.count.to_s
  file_list_curated
end

#get_file_list_by_timestamp ⇒ Object



160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# File 'lib/wayback_machine_downloader.rb', line 160

# Returns the files to process as an array of info hashes, each extended
# in place with a :file_id entry holding its key from the source list.
#
# With @all_timestamps set, the entries come from
# #get_file_list_all_timestamps in their original order. Otherwise they come
# from #get_file_list_curated, sorted by :timestamp and then reversed.
def get_file_list_by_timestamp
  entries =
    if @all_timestamps
      get_file_list_all_timestamps
    else
      get_file_list_curated.sort_by { |_id, info| info[:timestamp] }.reverse
    end

  entries.map do |file_id, file_info|
    file_info[:file_id] = file_id
    file_info
  end
end

#get_file_list_curated ⇒ Object



103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# File 'lib/wayback_machine_downloader.rb', line 103

# Builds the list of archived files, keeping one snapshot per file id.
#
# Each listing line is read as a 14-character timestamp, a separator, and the
# file URL. The file id is the URL path after "scheme://host/", URL-unescaped.
# When a file id appears more than once, the entry with the greater timestamp
# wins (a later line wins on a tie). URLs rejected by the exclude or only
# filters are skipped with a message, as are URLs that have no
# "scheme://host/" prefix to strip.
#
# @return [Hash{String => Hash}] file id => { file_url:, timestamp: }
def get_file_list_curated
  file_list_curated = Hash.new
  get_all_snapshots_to_consider.each_line do |line|
    next unless line.include?('/')
    file_timestamp = line[0..13].to_i
    # chomp rather than dropping the last character, so a final line without a
    # trailing newline keeps its whole URL.
    file_url = line[15..-1].to_s.chomp
    # nil when the URL has fewer than three '/'-separated parts; checked before
    # join so the malformed case is reported instead of raising.
    path_elements = file_url.split('/')[3..-1]
    file_id = path_elements && CGI::unescape(path_elements.join('/'))
    file_id = file_id.tidy_bytes unless file_id.nil? || file_id == ""
    if file_id.nil?
      puts "Malformed file url, ignoring: #{file_url}"
    else
      if match_exclude_filter(file_url)
        puts "File url matches exclude filter, ignoring: #{file_url}"
      elsif not match_only_filter(file_url)
        puts "File url doesn't match only filter, ignoring: #{file_url}"
      elsif file_list_curated[file_id]
        unless file_list_curated[file_id][:timestamp] > file_timestamp
          file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
        end
      else
        file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
      end
    end
  end
  file_list_curated
end

#list_files ⇒ Object



177
178
179
180
181
182
183
184
185
# File 'lib/wayback_machine_downloader.rb', line 177

# Prints the file list to standard output as a JSON array, one element per
# line.
#
# Elements are separated by commas, with none after the last one, so the
# bracketed output parses as JSON; an empty list prints as "[" and "]".
def list_files
  # retrieval produces its own output
  files = get_file_list_by_timestamp
  puts "["
  last_index = files.size - 1
  files.each_with_index do |file, index|
    separator = index < last_index ? "," : ""
    puts file.to_json + separator
  end
  puts "]"
end

#match_exclude_filter(file_url) ⇒ Object



70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/wayback_machine_downloader.rb', line 70

# Tells whether a file URL is rejected by the exclude filter.
#
# Without a filter the answer is false. When the filter converts to a regex
# (via to_regex), the result is that of =~ on the URL: a match index or nil.
# Otherwise the filter is matched as a case-insensitive substring.
def match_exclude_filter(file_url)
  return false unless @exclude_filter

  pattern = @exclude_filter.to_regex
  return pattern =~ file_url if pattern

  file_url.downcase.include?(@exclude_filter.downcase)
end

#match_only_filter(file_url) ⇒ Object



57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/wayback_machine_downloader.rb', line 57

# Tells whether a file URL is accepted by the only filter.
#
# Without a filter the answer is true. When the filter converts to a regex
# (via to_regex), the result is that of =~ on the URL: a match index or nil.
# Otherwise the filter is matched as a case-insensitive substring.
def match_only_filter(file_url)
  return true unless @only_filter

  pattern = @only_filter.to_regex
  return pattern =~ file_url if pattern

  file_url.downcase.include?(@only_filter.downcase)
end

#semaphore ⇒ Object



312
313
314
# File 'lib/wayback_machine_downloader.rb', line 312

# Memoized Mutex, created on first use. #download_file holds it while
# updating and printing the processed-file counter.
def semaphore
  return @semaphore if @semaphore

  @semaphore = Mutex.new
end

#structure_dir_path(dir_path) ⇒ Object



223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
# File 'lib/wayback_machine_downloader.rb', line 223

# Ensures dir_path exists as a directory, creating missing parents.
#
# When a regular file occupies the path or one of its parents, that file is
# moved to <path>/index.html so the directory can take its place, and the
# creation is retried.
#
# @param dir_path [String] directory to create
# @raise [RuntimeError] when the EEXIST message has an unrecognised format
def structure_dir_path dir_path
  # File.directory?, not File.exist?: a regular file sitting at dir_path must
  # reach mkdir_p so it raises EEXIST and gets restructured below, instead of
  # being mistaken for an existing directory.
  FileUtils::mkdir_p dir_path unless File.directory? dir_path
rescue Errno::EEXIST => e
  error_to_string = e.to_s
  puts "# #{error_to_string}"
  # The offending path is only available from the exception message, whose
  # wording differs between Ruby versions.
  if error_to_string.include? "File exists @ dir_s_mkdir - "
    file_already_existing = error_to_string.split("File exists @ dir_s_mkdir - ")[-1]
  elsif error_to_string.include? "File exists - "
    file_already_existing = error_to_string.split("File exists - ")[-1]
  else
    raise "Unhandled directory restructure error # #{error_to_string}"
  end
  file_already_existing_temporary = file_already_existing + '.temp'
  file_already_existing_permanent = file_already_existing + '/index.html'
  # Move the file aside, create the directory in its place, then move the file
  # back in as that directory's index.html.
  FileUtils::mv file_already_existing, file_already_existing_temporary
  FileUtils::mkdir_p file_already_existing
  FileUtils::mv file_already_existing_temporary, file_already_existing_permanent
  puts "#{file_already_existing} -> #{file_already_existing_permanent}"
  structure_dir_path dir_path
end