Class: WaybackMachineDownloader

Inherits:
Object
  • Object
show all
Includes:
ArchiveAPI
Defined in:
lib/wayback_machine_downloader.rb

Constant Summary collapse

VERSION =
"2.1.1"

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from ArchiveAPI

#get_raw_list_from_api, #parameters_for_api

Constructor Details

#initialize(params) ⇒ WaybackMachineDownloader

Returns a new instance of WaybackMachineDownloader.



23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/wayback_machine_downloader.rb', line 23

# Builds a downloader from an options hash.
#
# @param params [Hash] supported keys: :base_url, :exact_url, :directory,
#   :from_timestamp, :to_timestamp (coerced to Integer, 0 when absent),
#   :only_filter, :exclude_filter, :all, :maximum_pages (default 100),
#   :threads_count (coerced to Integer, 0 when absent)
def initialize params
  @base_url       = params[:base_url]
  @exact_url      = params[:exact_url]
  @directory      = params[:directory]
  @from_timestamp = params[:from_timestamp].to_i
  @to_timestamp   = params[:to_timestamp].to_i
  @only_filter    = params[:only_filter]
  @exclude_filter = params[:exclude_filter]
  @all            = params[:all]
  @maximum_pages  = (params[:maximum_pages] || 100).to_i
  @threads_count  = params[:threads_count].to_i
end

Instance Attribute Details

#allObject

Returns the value of attribute all.



19
20
21
# File 'lib/wayback_machine_downloader.rb', line 19

# Reader for @all (params[:all]); truthy means HTTP error bodies are kept on disk.
def all
  @all
end

#base_urlObject

Returns the value of attribute base_url.



19
20
21
# File 'lib/wayback_machine_downloader.rb', line 19

# Reader for @base_url, the URL whose Wayback Machine archives are downloaded.
def base_url
  @base_url
end

#directoryObject

Returns the value of attribute directory.



19
20
21
# File 'lib/wayback_machine_downloader.rb', line 19

# Reader for @directory, the optional destination-directory override.
def directory
  @directory
end

#exact_urlObject

Returns the value of attribute exact_url.



19
20
21
# File 'lib/wayback_machine_downloader.rb', line 19

# Reader for @exact_url; truthy skips the paginated "/*" snapshot listing,
# so only the exact URL is considered.
def exact_url
  @exact_url
end

#exclude_filterObject

Returns the value of attribute exclude_filter.



19
20
21
# File 'lib/wayback_machine_downloader.rb', line 19

# Reader for @exclude_filter, the pattern used by #match_exclude_filter.
def exclude_filter
  @exclude_filter
end

#from_timestampObject

Returns the value of attribute from_timestamp.



19
20
21
# File 'lib/wayback_machine_downloader.rb', line 19

# Reader for @from_timestamp (Integer; 0 when unset).
def from_timestamp
  @from_timestamp
end

#maximum_pagesObject

Returns the value of attribute maximum_pages.



19
20
21
# File 'lib/wayback_machine_downloader.rb', line 19

# Reader for @maximum_pages, the cap on snapshot index pages fetched (default 100).
def maximum_pages
  @maximum_pages
end

#only_filterObject

Returns the value of attribute only_filter.



19
20
21
# File 'lib/wayback_machine_downloader.rb', line 19

# Reader for @only_filter, the pattern used by #match_only_filter.
def only_filter
  @only_filter
end

#threads_countObject

Returns the value of attribute threads_count.



19
20
21
# File 'lib/wayback_machine_downloader.rb', line 19

# Reader for @threads_count, the number of download worker threads.
def threads_count
  @threads_count
end

#to_timestampObject

Returns the value of attribute to_timestamp.



19
20
21
# File 'lib/wayback_machine_downloader.rb', line 19

# Reader for @to_timestamp (Integer; 0 when unset).
def to_timestamp
  @to_timestamp
end

Instance Method Details

#backup_nameObject



36
37
38
39
40
41
42
# File 'lib/wayback_machine_downloader.rb', line 36

# Derives the local folder name for the site: the host component when
# @base_url looks like a full URL (contains "//"), otherwise @base_url as-is.
def backup_name
  return @base_url unless @base_url.include? '//'
  @base_url.split('/')[2]
end

#backup_pathObject



44
45
46
47
48
49
50
51
52
53
54
# File 'lib/wayback_machine_downloader.rb', line 44

# Destination directory for downloaded files, always ending with "/":
# the user-supplied @directory when present, else "websites/<backup_name>/".
def backup_path
  return 'websites/' + backup_name + '/' unless @directory
  @directory.end_with?('/') ? @directory : @directory + '/'
end

#download_file(file_remote_info) ⇒ Object



208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
# File 'lib/wayback_machine_downloader.rb', line 208

# Downloads one archived file described by +file_remote_info+
# (keys :file_url, :file_id, :timestamp) into backup_path, creating
# intermediate directories as needed. Progress counting is synchronized
# under #semaphore so multiple worker threads can call this concurrently.
def download_file file_remote_info
  current_encoding = "".encoding
  file_url = file_remote_info[:file_url].encode(current_encoding)
  file_id = file_remote_info[:file_id]
  file_timestamp = file_remote_info[:timestamp]
  file_path_elements = file_id.split('/')
  # Map the archived URL onto a local path. The site root and
  # directory-like ids (trailing "/" or no "." in the last segment)
  # are saved as <dir>/index.html.
  if file_id == ""
    dir_path = backup_path
    file_path = backup_path + 'index.html'
  elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
    dir_path = backup_path + file_path_elements[0..-1].join('/')
    file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html'
  else
    dir_path = backup_path + file_path_elements[0..-2].join('/')
    file_path = backup_path + file_path_elements[0..-1].join('/')
  end
  if Gem.win_platform?
    # Percent-encode characters that are not valid in Windows file names.
    file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
  end
  # Existing files are never re-downloaded (resume-friendly behavior).
  unless File.exist? file_path
    begin
      structure_dir_path dir_path
      open(file_path, "wb") do |file|
        begin
          # The "id_" flag asks the Wayback Machine for the original
          # payload without archive toolbars injected.
          # NOTE(review): Kernel#open on a URL relies on open-uri and is
          # deprecated in modern Ruby; prefer URI.open — confirm the
          # minimum supported Ruby version before changing.
          open("http://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri|
            file.write(uri.read)
          end
        rescue OpenURI::HTTPError => e
          puts "#{file_url} # #{e}"
          # With the :all option, keep error-response bodies (e.g. 404 pages).
          if @all
            file.write(e.io.read)
            puts "#{file_path} saved anyway."
          end
        rescue StandardError => e
          puts "#{file_url} # #{e}"
        end
      end
    rescue StandardError => e
      puts "#{file_url} # #{e}"
    ensure
      # Without :all, remove zero-byte files left behind by failed requests.
      if not @all and File.exist?(file_path) and File.size(file_path) == 0
        File.delete(file_path)
        puts "#{file_path} was empty and was removed."
      end
    end
    semaphore.synchronize do
      @processed_file_count += 1
      puts "#{file_url} -> #{file_path} (#{@processed_file_count}/#{file_list_by_timestamp.size})"
    end
  else
    semaphore.synchronize do
      @processed_file_count += 1
      puts "#{file_url} # #{file_path} already exists. (#{@processed_file_count}/#{file_list_by_timestamp.size})"
    end
  end
end

#download_filesObject



149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
# File 'lib/wayback_machine_downloader.rb', line 149

# Downloads every entry of file_list_by_timestamp using a pool of
# @threads_count worker threads that pull work from the shared file_queue.
# Prints diagnostics and returns early when there is nothing to download.
def download_files
  start_time = Time.now
  puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
  puts

  if file_list_by_timestamp.count == 0
    puts "No files to download."
    puts "Possible reasons:"
    puts "\t* Site is not in Wayback Machine Archive."
    puts "\t* From timestamp too much in the future." if @from_timestamp and @from_timestamp != 0
    puts "\t* To timestamp too much in the past." if @to_timestamp and @to_timestamp != 0
    puts "\t* Only filter too restrictive (#{only_filter.to_s})" if @only_filter
    puts "\t* Exclude filter too wide (#{exclude_filter.to_s})" if @exclude_filter
    return
  end

  puts "#{file_list_by_timestamp.count} files to download:"

  threads = []
  @processed_file_count = 0
  # Fall back to a single worker when the option is unset (0) or invalid:
  # previously a negative value spawned no workers at all, so the run
  # "completed" without downloading anything.
  @threads_count = 1 unless @threads_count > 0
  @threads_count.times do
    threads << Thread.new do
      until file_queue.empty?
        # Non-blocking pop raises on an empty queue (race with other
        # workers); treat that as "no more work".
        file_remote_info = file_queue.pop(true) rescue nil
        download_file(file_remote_info) if file_remote_info
      end
    end
  end

  threads.each(&:join)
  end_time = Time.now
  puts
  puts "Download completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path} (#{file_list_by_timestamp.size} files)"
end

#file_list_by_timestampObject



269
270
271
# File 'lib/wayback_machine_downloader.rb', line 269

# Memoized result of #get_file_list_by_timestamp (the Array of files to
# download), so the list is computed only once per run.
def file_list_by_timestamp
  @file_list_by_timestamp ||= get_file_list_by_timestamp
end

#file_queueObject



265
266
267
# File 'lib/wayback_machine_downloader.rb', line 265

# Lazily builds the thread-safe work queue, seeded with every entry of
# file_list_by_timestamp in order. Worker threads pop from this queue.
def file_queue
  @file_queue ||= begin
    queue = Queue.new
    file_list_by_timestamp.each { |file_info| queue << file_info }
    queue
  end
end

#get_all_snapshots_to_considerObject



82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# File 'lib/wayback_machine_downloader.rb', line 82

# Fetches the raw snapshot list from the Wayback Machine API: one request
# for @base_url itself, plus (unless @exact_url) up to @maximum_pages
# paginated requests for everything under "@base_url/*", stopping at the
# first empty page.
#
# @return [String] concatenated raw API responses, one snapshot per line
def get_all_snapshots_to_consider
  # Note: Passing a page index parameter allow us to get more snapshots,
  # but from a less fresh index
  print "Getting snapshot pages"
  snapshot_list_to_consider = ""
  snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
  print "."
  unless @exact_url
    @maximum_pages.times do |page_index|
      snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
      break if snapshot_list.empty?
      snapshot_list_to_consider += snapshot_list
      print "."
    end
  end
  # Fix: the summary message previously misspelled "snapshots" as "snaphots".
  puts " found #{snapshot_list_to_consider.lines.count} snapshots to consider."
  puts
  snapshot_list_to_consider
end

#get_file_list_by_timestampObject



130
131
132
133
134
135
136
137
# File 'lib/wayback_machine_downloader.rb', line 130

# Flattens the curated hash into an Array of file-info hashes ordered by
# descending snapshot timestamp, copying each hash key into :file_id.
def get_file_list_by_timestamp
  sorted_entries = get_file_list_curated.sort_by { |_file_id, info| info[:timestamp] }.reverse
  sorted_entries.map do |file_id, info|
    info[:file_id] = file_id
    info
  end
end

#get_file_list_curatedObject



102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# File 'lib/wayback_machine_downloader.rb', line 102

# Parses the raw snapshot listing into a Hash of
# file_id => {file_url:, timestamp:}, applying the only/exclude filters
# and keeping only the most recent snapshot per file id.
def get_file_list_curated
  file_list_curated = Hash.new
  get_all_snapshots_to_consider.each_line do |line|
    next unless line.include?('/')
    # Each line is "<14-digit timestamp> <url>"; [15..-2] drops the
    # separator space and the trailing newline.
    file_timestamp = line[0..13].to_i
    file_url = line[15..-2]
    # Everything after the scheme and host of the URL becomes the file id.
    file_id = file_url.split('/')[3..-1].join('/')
    file_id = CGI::unescape file_id 
    # NOTE(review): tidy_bytes is presumably a String extension (defined
    # elsewhere in the project) cleaning invalid byte sequences — confirm.
    file_id = file_id.tidy_bytes unless file_id == ""
    if file_id.nil?
      puts "Malformed file url, ignoring: #{file_url}"
    else
      if match_exclude_filter(file_url)
        puts "File url matches exclude filter, ignoring: #{file_url}"
      elsif not match_only_filter(file_url)
        puts "File url doesn't match only filter, ignoring: #{file_url}"
      elsif file_list_curated[file_id]
        # Duplicate id: keep whichever snapshot has the greater timestamp.
        unless file_list_curated[file_id][:timestamp] > file_timestamp
          file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
        end
      else
        file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
      end
    end
  end
  file_list_curated
end

#list_filesObject



139
140
141
142
143
144
145
146
147
# File 'lib/wayback_machine_downloader.rb', line 139

# Prints the file list to stdout as a JSON array, one entry per line.
# Fix: the previous implementation appended a comma after every element,
# including the last one, which produced invalid JSON.
def list_files
  # retrieval produces its own output
  files = get_file_list_by_timestamp
  puts "["
  # All but the last entry get a trailing comma.
  files[0...-1].each do |file|
    puts file.to_json + ","
  end
  puts files[-1].to_json if files[-1]
  puts "]"
end

#match_exclude_filter(file_url) ⇒ Object



69
70
71
72
73
74
75
76
77
78
79
80
# File 'lib/wayback_machine_downloader.rb', line 69

# Whether file_url matches the exclude filter: matched as a regex when
# the filter string converts to one via String#to_regex, otherwise as a
# case-insensitive substring. Always false when no filter is configured.
def match_exclude_filter file_url
  return false unless @exclude_filter
  filter_regex = @exclude_filter.to_regex
  if filter_regex
    filter_regex =~ file_url
  else
    file_url.downcase.include? @exclude_filter.downcase
  end
end

#match_only_filter(file_url) ⇒ Object



56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/wayback_machine_downloader.rb', line 56

# Whether file_url passes the only filter: matched as a regex when the
# filter string converts to one via String#to_regex, otherwise as a
# case-insensitive substring. Always true when no filter is configured.
def match_only_filter file_url
  return true unless @only_filter
  filter_regex = @only_filter.to_regex
  if filter_regex
    filter_regex =~ file_url
  else
    file_url.downcase.include? @only_filter.downcase
  end
end

#semaphoreObject



273
274
275
# File 'lib/wayback_machine_downloader.rb', line 273

# Lazily-created Mutex guarding the shared @processed_file_count counter
# updated by concurrent #download_file calls.
def semaphore
  @semaphore ||= Mutex.new
end

#structure_dir_path(dir_path) ⇒ Object



185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
# File 'lib/wayback_machine_downloader.rb', line 185

# Ensures dir_path exists. When a regular file already occupies a path
# segment where a directory is needed (mkdir_p raises Errno::EEXIST),
# the file is moved inside a freshly-created directory of the same name
# as index.html, then creation is retried recursively.
def structure_dir_path dir_path
  begin
    FileUtils::mkdir_p dir_path unless File.exist? dir_path
  rescue Errno::EEXIST => e
    error_to_string = e.to_s
    puts "# #{error_to_string}"
    # Extract the conflicting path from the platform/Ruby-version-specific
    # exception message; bail out loudly on an unrecognized format.
    if error_to_string.include? "File exists @ dir_s_mkdir - "
      file_already_existing = error_to_string.split("File exists @ dir_s_mkdir - ")[-1]
    elsif error_to_string.include? "File exists - "
      file_already_existing = error_to_string.split("File exists - ")[-1]
    else
      raise "Unhandled directory restructure error # #{error_to_string}"
    end
    # Move the file aside, create the directory in its place, then move
    # the file back in as <dir>/index.html.
    file_already_existing_temporary = file_already_existing + '.temp'
    file_already_existing_permanent = file_already_existing + '/index.html'
    FileUtils::mv file_already_existing, file_already_existing_temporary
    FileUtils::mkdir_p file_already_existing
    FileUtils::mv file_already_existing_temporary, file_already_existing_permanent
    puts "#{file_already_existing} -> #{file_already_existing_permanent}"
    # Retry: recursion resolves one conflicting segment per pass.
    structure_dir_path dir_path
  end
end