Class: WaybackMachineDownloader
- Inherits:
-
Object
- Object
- WaybackMachineDownloader
show all
- Includes:
- ArchiveAPI
- Defined in:
- lib/wayback_machine_downloader.rb
Constant Summary
collapse
- VERSION =
"2.2.1"
Instance Attribute Summary collapse
Instance Method Summary
collapse
Methods included from ArchiveAPI
#get_raw_list_from_api, #parameters_for_api
Constructor Details
Returns a new instance of WaybackMachineDownloader.
23
24
25
26
27
28
29
30
31
32
33
34
35
|
# File 'lib/wayback_machine_downloader.rb', line 23
# Builds a downloader from an options hash (typically parsed CLI flags).
#
# Recognized keys: :base_url, :exact_url, :directory, :all_timestamps,
# :only_filter, :exclude_filter, :all, :from_timestamp, :to_timestamp,
# :maximum_pages (defaults to 100) and :threads_count.
# Missing timestamps become 0 via nil.to_i, which downstream code treats
# as "no bound"; a missing threads_count becomes 0 and is normalized later.
def initialize params
  # Plain pass-through options share the same shape; copy them in bulk.
  %i[base_url exact_url directory all_timestamps only_filter exclude_filter all].each do |key|
    instance_variable_set "@#{key}", params[key]
  end
  @from_timestamp = params[:from_timestamp].to_i
  @to_timestamp = params[:to_timestamp].to_i
  @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
  @threads_count = params[:threads_count].to_i
end
|
Instance Attribute Details
#all ⇒ Object
Returns the value of attribute all.
19
20
21
|
# File 'lib/wayback_machine_downloader.rb', line 19
# Reader for @all: when truthy, error pages are saved and empty files
# are not pruned after download (see #download_file).
def all
  @all
end
|
#all_timestamps ⇒ Object
Returns the value of attribute all_timestamps.
19
20
21
|
# File 'lib/wayback_machine_downloader.rb', line 19
# Reader for @all_timestamps: when truthy, every capture of every file
# is downloaded instead of only the most recent one.
def all_timestamps
  @all_timestamps
end
|
#base_url ⇒ Object
Returns the value of attribute base_url.
19
20
21
|
# File 'lib/wayback_machine_downloader.rb', line 19
# Reader for @base_url, the URL whose Wayback Machine archives are
# listed and downloaded.
def base_url
  @base_url
end
|
#directory ⇒ Object
Returns the value of attribute directory.
19
20
21
|
# File 'lib/wayback_machine_downloader.rb', line 19
# Reader for @directory, the optional user-chosen destination directory
# (nil means the default websites/<backup_name>/ path is used).
def directory
  @directory
end
|
#exact_url ⇒ Object
Returns the value of attribute exact_url.
19
20
21
|
# File 'lib/wayback_machine_downloader.rb', line 19
# Reader for @exact_url: when truthy, only the exact URL is queried and
# the paged "/*" snapshot walk is skipped (see #get_all_snapshots_to_consider).
def exact_url
  @exact_url
end
|
#exclude_filter ⇒ Object
Returns the value of attribute exclude_filter.
19
20
21
|
# File 'lib/wayback_machine_downloader.rb', line 19
# Reader for @exclude_filter, the regex-or-substring filter that rejects
# matching file URLs (see #match_exclude_filter).
def exclude_filter
  @exclude_filter
end
|
#from_timestamp ⇒ Object
Returns the value of attribute from_timestamp.
19
20
21
|
# File 'lib/wayback_machine_downloader.rb', line 19
# Reader for @from_timestamp, the integer lower timestamp bound
# (0 when unset; coerced with to_i in the constructor).
def from_timestamp
  @from_timestamp
end
|
#maximum_pages ⇒ Object
Returns the value of attribute maximum_pages.
19
20
21
|
# File 'lib/wayback_machine_downloader.rb', line 19
# Reader for @maximum_pages, the maximum number of snapshot-list pages
# fetched from the API (defaults to 100 in the constructor).
def maximum_pages
  @maximum_pages
end
|
#only_filter ⇒ Object
Returns the value of attribute only_filter.
19
20
21
|
# File 'lib/wayback_machine_downloader.rb', line 19
# Reader for @only_filter, the regex-or-substring filter that a file URL
# must match to be kept (see #match_only_filter).
def only_filter
  @only_filter
end
|
#threads_count ⇒ Object
Returns the value of attribute threads_count.
19
20
21
|
# File 'lib/wayback_machine_downloader.rb', line 19
# Reader for @threads_count, the number of concurrent download threads
# (0 when unset; normalized before use in #download_files).
def threads_count
  @threads_count
end
|
#to_timestamp ⇒ Object
Returns the value of attribute to_timestamp.
19
20
21
|
# File 'lib/wayback_machine_downloader.rb', line 19
# Reader for @to_timestamp, the integer upper timestamp bound
# (0 when unset; coerced with to_i in the constructor).
def to_timestamp
  @to_timestamp
end
|
Instance Method Details
#backup_name ⇒ Object
37
38
39
40
41
42
43
|
# File 'lib/wayback_machine_downloader.rb', line 37
# Derives a folder-friendly name for the backup from the base URL:
# the host part when the URL carries a scheme ("//"), otherwise the
# base URL verbatim.
def backup_name
  return @base_url unless @base_url.include? '//'
  @base_url.split('/')[2]
end
|
#backup_path ⇒ Object
45
46
47
48
49
50
51
52
53
54
55
|
# File 'lib/wayback_machine_downloader.rb', line 45
# Destination directory for downloaded files, always ending in '/'.
# Uses the user-supplied directory when present, otherwise the default
# websites/<backup_name>/ layout.
def backup_path
  return 'websites/' + backup_name + '/' unless @directory
  @directory.end_with?('/') ? @directory : @directory + '/'
end
|
#download_file(file_remote_info) ⇒ Object
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
|
# File 'lib/wayback_machine_downloader.rb', line 246
# Downloads one archived file described by file_remote_info
# ({ file_id:, file_url:, timestamp: }) into backup_path, then reports
# progress under the shared mutex. Existing local files are never
# re-downloaded.
def download_file file_remote_info
  current_encoding = "".encoding
  file_url = file_remote_info[:file_url].encode(current_encoding)
  file_id = file_remote_info[:file_id]
  file_timestamp = file_remote_info[:timestamp]
  file_path_elements = file_id.split('/')
  # Map the remote path to a local one: the site root becomes index.html,
  # directory-like URLs (trailing '/' or no '.' in the last segment) get
  # an index.html inside, everything else keeps its path as-is.
  if file_id == ""
    dir_path = backup_path
    file_path = backup_path + 'index.html'
  elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
    dir_path = backup_path + file_path_elements[0..-1].join('/')
    file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html'
  else
    dir_path = backup_path + file_path_elements[0..-2].join('/')
    file_path = backup_path + file_path_elements[0..-1].join('/')
  end
  if Gem.win_platform?
    # Percent-encode characters that are illegal in Windows file names.
    dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
    file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
  end
  unless File.exist? file_path
    begin
      structure_dir_path dir_path
      # NOTE(review): the URL form of open() relies on Kernel#open being
      # patched by open-uri; Ruby >= 3.0 requires URI.open instead —
      # confirm which Ruby versions are supported.
      open(file_path, "wb") do |file|
        begin
          # "id_" requests the raw archived body without Wayback rewriting.
          open("http://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri|
            file.write(uri.read)
          end
        rescue OpenURI::HTTPError => e
          puts "#{file_url} # #{e}"
          # With the :all option, keep the error response body (e.g. a
          # 404 page) instead of discarding it.
          if @all
            file.write(e.io.read)
            puts "#{file_path} saved anyway."
          end
        rescue StandardError => e
          puts "#{file_url} # #{e}"
        end
      end
    rescue StandardError => e
      puts "#{file_url} # #{e}"
    ensure
      # Without :all, an empty file means the download failed — remove it.
      if not @all and File.exist?(file_path) and File.size(file_path) == 0
        File.delete(file_path)
        puts "#{file_path} was empty and was removed."
      end
    end
    # Progress counter is shared across download threads; guard it.
    semaphore.synchronize do
      @processed_file_count += 1
      puts "#{file_url} -> #{file_path} (#{@processed_file_count}/#{file_list_by_timestamp.size})"
    end
  else
    semaphore.synchronize do
      @processed_file_count += 1
      puts "#{file_url} # #{file_path} already exists. (#{@processed_file_count}/#{file_list_by_timestamp.size})"
    end
  end
end
|
#download_files ⇒ Object
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
|
# File 'lib/wayback_machine_downloader.rb', line 187
# Downloads every file in the snapshot list using a pool of worker
# threads that drain the shared file_queue.
#
# Prints progress to stdout and a summary (elapsed time, destination,
# file count) when done; returns early with diagnostic hints when the
# snapshot list is empty.
def download_files
  start_time = Time.now
  puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
  puts
  if file_list_by_timestamp.count == 0
    puts "No files to download."
    puts "Possible reasons:"
    puts "\t* Site is not in Wayback Machine Archive."
    puts "\t* From timestamp too much in the future." if @from_timestamp and @from_timestamp != 0
    puts "\t* To timestamp too much in the past." if @to_timestamp and @to_timestamp != 0
    puts "\t* Only filter too restrictive (#{only_filter.to_s})" if @only_filter
    puts "\t* Exclude filter too wide (#{exclude_filter.to_s})" if @exclude_filter
    return
  end
  puts "#{file_list_by_timestamp.count} files to download:"
  threads = []
  @processed_file_count = 0
  # Normalize the worker count. The original `unless @threads_count != 0`
  # only caught 0: a negative value made `(-n).times` spawn no workers at
  # all, so nothing was downloaded. Treat any non-positive value as 1.
  @threads_count = 1 if @threads_count.to_i <= 0
  @threads_count.times do
    threads << Thread.new do
      # pop(true) raises ThreadError when the queue is empty; rescuing to
      # nil ends this worker's loop cleanly.
      until file_queue.empty?
        file_remote_info = file_queue.pop(true) rescue nil
        download_file(file_remote_info) if file_remote_info
      end
    end
  end
  threads.each(&:join)
  end_time = Time.now
  puts
  puts "Download completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path} (#{file_list_by_timestamp.size} files)"
end
|
#file_list_by_timestamp ⇒ Object
308
309
310
|
# File 'lib/wayback_machine_downloader.rb', line 308
# Memoized list of files to download; the API is consulted only once.
def file_list_by_timestamp
  return @file_list_by_timestamp if @file_list_by_timestamp
  @file_list_by_timestamp = get_file_list_by_timestamp
end
|
#file_queue ⇒ Object
304
305
306
|
# File 'lib/wayback_machine_downloader.rb', line 304
# Thread-safe work queue, seeded once with every file to download,
# shared by the download worker threads.
def file_queue
  @file_queue ||= begin
    queue = Queue.new
    file_list_by_timestamp.each { |file_info| queue << file_info }
    queue
  end
end
|
#get_all_snapshots_to_consider ⇒ Object
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
|
# File 'lib/wayback_machine_downloader.rb', line 83
# Fetches the raw snapshot listing from the archive API as one big
# newline-separated string.
#
# The exact base URL is always queried; unless @exact_url is set, up to
# @maximum_pages pages of "<base_url>/*" results are appended, stopping
# early at the first empty page. Prints a dot per request as progress.
def get_all_snapshots_to_consider
  print "Getting snapshot pages"
  snapshot_list_to_consider = ""
  snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
  print "."
  unless @exact_url
    @maximum_pages.times do |page_index|
      snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
      break if snapshot_list.empty?
      snapshot_list_to_consider += snapshot_list
      print "."
    end
  end
  # Typo fix: the summary line previously read "snaphots".
  puts " found #{snapshot_list_to_consider.lines.count} snapshots to consider."
  puts
  snapshot_list_to_consider
end
|
#get_file_list_all_timestamps ⇒ Object
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
|
# File 'lib/wayback_machine_downloader.rb', line 131
def get_file_list_all_timestamps
file_list_curated = Hash.new
get_all_snapshots_to_consider.each_line do |line|
next unless line.include?('/')
file_timestamp = line[0..13].to_i
file_url = line[15..-2]
file_id = file_url.split('/')[3..-1].join('/')
file_id_and_timestamp = [file_timestamp, file_id].join('/')
file_id_and_timestamp = CGI::unescape file_id_and_timestamp
file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
if file_id.nil?
puts "Malformed file url, ignoring: #{file_url}"
else
if match_exclude_filter(file_url)
puts "File url matches exclude filter, ignoring: #{file_url}"
elsif not match_only_filter(file_url)
puts "File url doesn't match only filter, ignoring: #{file_url}"
elsif file_list_curated[file_id_and_timestamp]
puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
else
file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
end
end
end
puts "file_list_curated: " + file_list_curated.count.to_s
file_list_curated
end
|
#get_file_list_by_timestamp ⇒ Object
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
|
# File 'lib/wayback_machine_downloader.rb', line 160
# Returns an array of { file_url:, timestamp:, file_id: } hashes — one per
# file to download. With @all_timestamps every capture is included (keyed
# by "ts/id"); otherwise only the newest capture of each file is kept,
# ordered newest first.
def get_file_list_by_timestamp
  # Both branches produced the curated pairs and then ran an identical
  # map block; the duplication is factored out (Hash#map and Array#map
  # both yield [key, value] pairs here).
  if @all_timestamps
    file_list_curated = get_file_list_all_timestamps
  else
    file_list_curated = get_file_list_curated.sort_by { |k, v| v[:timestamp] }.reverse
  end
  file_list_curated.map do |file_remote_info|
    file_remote_info[1][:file_id] = file_remote_info[0]
    file_remote_info[1]
  end
end
|
#get_file_list_curated ⇒ Object
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
|
# File 'lib/wayback_machine_downloader.rb', line 103
def get_file_list_curated
file_list_curated = Hash.new
get_all_snapshots_to_consider.each_line do |line|
next unless line.include?('/')
file_timestamp = line[0..13].to_i
file_url = line[15..-2]
file_id = file_url.split('/')[3..-1].join('/')
file_id = CGI::unescape file_id
file_id = file_id.tidy_bytes unless file_id == ""
if file_id.nil?
puts "Malformed file url, ignoring: #{file_url}"
else
if match_exclude_filter(file_url)
puts "File url matches exclude filter, ignoring: #{file_url}"
elsif not match_only_filter(file_url)
puts "File url doesn't match only filter, ignoring: #{file_url}"
elsif file_list_curated[file_id]
unless file_list_curated[file_id][:timestamp] > file_timestamp
file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
end
else
file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
end
end
end
file_list_curated
end
|
#list_files ⇒ Object
177
178
179
180
181
182
183
184
185
|
# File 'lib/wayback_machine_downloader.rb', line 177
# Prints the curated file list to stdout: one JSON hash per line, wrapped
# in square brackets. (Each entry line ends with a comma — including the
# last one — so the output is not strictly valid JSON; this quirk is
# preserved for compatibility with existing consumers.)
def list_files
  puts "["
  get_file_list_by_timestamp.each do |file_info|
    puts "#{file_info.to_json},"
  end
  puts "]"
end
|
#match_exclude_filter(file_url) ⇒ Object
70
71
72
73
74
75
76
77
78
79
80
81
|
# File 'lib/wayback_machine_downloader.rb', line 70
# True-ish when file_url is rejected by the exclude filter; false when no
# filter is set. The filter string is first interpreted as a regex via
# String#to_regex; when that yields nothing, a case-insensitive substring
# match is used instead.
def match_exclude_filter file_url
  return false unless @exclude_filter
  filter_as_regex = @exclude_filter.to_regex
  if filter_as_regex
    filter_as_regex =~ file_url
  else
    file_url.downcase.include? @exclude_filter.downcase
  end
end
|
#match_only_filter(file_url) ⇒ Object
57
58
59
60
61
62
63
64
65
66
67
68
|
# File 'lib/wayback_machine_downloader.rb', line 57
# True-ish when file_url passes the only filter; always true when no
# filter is set. The filter string is first interpreted as a regex via
# String#to_regex; when that yields nothing, a case-insensitive substring
# match is used instead.
def match_only_filter file_url
  return true unless @only_filter
  filter_as_regex = @only_filter.to_regex
  if filter_as_regex
    filter_as_regex =~ file_url
  else
    file_url.downcase.include? @only_filter.downcase
  end
end
|
#semaphore ⇒ Object
312
313
314
|
# File 'lib/wayback_machine_downloader.rb', line 312
# Lazily-created mutex guarding the shared progress counter used by the
# download threads.
def semaphore
  @semaphore = Mutex.new if @semaphore.nil?
  @semaphore
end
|
#structure_dir_path(dir_path) ⇒ Object
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
|
# File 'lib/wayback_machine_downloader.rb', line 223
# Creates dir_path (and all parent directories), handling the case where
# a path component already exists as a plain *file*: that file is moved
# aside to <file>/index.html and directory creation is retried.
def structure_dir_path dir_path
  begin
    FileUtils::mkdir_p dir_path unless File.exist? dir_path
  rescue Errno::EEXIST => e
    # A file blocks the directory path. The offending path is recovered
    # from the exception message; the two prefixes below cover the message
    # formats seen in practice — NOTE(review): message wording may differ
    # on other Ruby versions/platforms, in which case this raises.
    error_to_string = e.to_s
    puts "# #{error_to_string}"
    if error_to_string.include? "File exists @ dir_s_mkdir - "
      file_already_existing = error_to_string.split("File exists @ dir_s_mkdir - ")[-1]
    elsif error_to_string.include? "File exists - "
      file_already_existing = error_to_string.split("File exists - ")[-1]
    else
      raise "Unhandled directory restructure error # #{error_to_string}"
    end
    # Turn the blocking file into a directory holding its old content as
    # index.html: move it to a temp name, create the directory, move back.
    file_already_existing_temporary = file_already_existing + '.temp'
    file_already_existing_permanent = file_already_existing + '/index.html'
    FileUtils::mv file_already_existing, file_already_existing_temporary
    FileUtils::mkdir_p file_already_existing
    FileUtils::mv file_already_existing_temporary, file_already_existing_permanent
    puts "#{file_already_existing} -> #{file_already_existing_permanent}"
    # Retry; recurses again if another component is also blocked.
    structure_dir_path dir_path
  end
end
|