Class: WaybackMachineDownloader
- Inherits:
-
Object
- Object
- WaybackMachineDownloader
show all
- Includes:
- ArchiveAPI
- Defined in:
- lib/wayback_machine_downloader.rb
Constant Summary
collapse
- VERSION =
"2.1.1"
Instance Attribute Summary collapse
Instance Method Summary
collapse
Methods included from ArchiveAPI
#get_raw_list_from_api, #parameters_for_api
Constructor Details
Returns a new instance of WaybackMachineDownloader.
23
24
25
26
27
28
29
30
31
32
33
34
|
# File 'lib/wayback_machine_downloader.rb', line 23
# Builds a downloader from an options hash.
#
# Values are read with +params[...]+ using the symbol keys listed below.
# Timestamps and the thread count are coerced with +to_i+; the page limit
# falls back to 100 when +params[:maximum_pages]+ is nil or false.
#
# @param params [Hash] options: :base_url, :exact_url, :directory,
#   :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all,
#   :maximum_pages, :threads_count
def initialize(params)
  page_limit = params[:maximum_pages] || 100

  @base_url       = params[:base_url]
  @exact_url      = params[:exact_url]
  @directory      = params[:directory]
  @from_timestamp = params[:from_timestamp].to_i
  @to_timestamp   = params[:to_timestamp].to_i
  @only_filter    = params[:only_filter]
  @exclude_filter = params[:exclude_filter]
  @all            = params[:all]
  @maximum_pages  = page_limit.to_i
  @threads_count  = params[:threads_count].to_i
end
|
Instance Attribute Details
#all ⇒ Object
Returns the value of attribute all.
19
20
21
|
# File 'lib/wayback_machine_downloader.rb', line 19
# Reader for the +all+ option.
#
# @return [Object] the value held in @all, which the constructor takes
#   from params[:all]
def all
@all
end
|
#base_url ⇒ Object
Returns the value of attribute base_url.
19
20
21
|
# File 'lib/wayback_machine_downloader.rb', line 19
# Reader for the base URL.
#
# @return [Object] the value held in @base_url, which the constructor
#   takes from params[:base_url]
def base_url
@base_url
end
|
#directory ⇒ Object
Returns the value of attribute directory.
19
20
21
|
# File 'lib/wayback_machine_downloader.rb', line 19
# Reader for the output directory option.
#
# @return [Object] the value held in @directory, which the constructor
#   takes from params[:directory]; backup_path falls back to a
#   'websites/<backup_name>/' path when it is not set
def directory
@directory
end
|
#exact_url ⇒ Object
Returns the value of attribute exact_url.
19
20
21
|
# File 'lib/wayback_machine_downloader.rb', line 19
# Reader for the +exact_url+ option.
#
# @return [Object] the value held in @exact_url, which the constructor
#   takes from params[:exact_url]; when truthy,
#   get_all_snapshots_to_consider skips the paged '/*' queries
def exact_url
@exact_url
end
|
#exclude_filter ⇒ Object
Returns the value of attribute exclude_filter.
19
20
21
|
# File 'lib/wayback_machine_downloader.rb', line 19
# Reader for the exclude filter.
#
# @return [Object] the value held in @exclude_filter, which the
#   constructor takes from params[:exclude_filter]
def exclude_filter
@exclude_filter
end
|
#from_timestamp ⇒ Object
Returns the value of attribute from_timestamp.
19
20
21
|
# File 'lib/wayback_machine_downloader.rb', line 19
# Reader for the lower timestamp bound.
#
# @return [Object] the value held in @from_timestamp; the constructor
#   stores params[:from_timestamp].to_i
def from_timestamp
@from_timestamp
end
|
#maximum_pages ⇒ Object
Returns the value of attribute maximum_pages.
19
20
21
|
# File 'lib/wayback_machine_downloader.rb', line 19
# Reader for the snapshot page limit.
#
# @return [Object] the value held in @maximum_pages; the constructor
#   stores params[:maximum_pages].to_i, or 100 when that option is not given
def maximum_pages
@maximum_pages
end
|
#only_filter ⇒ Object
Returns the value of attribute only_filter.
19
20
21
|
# File 'lib/wayback_machine_downloader.rb', line 19
# Reader for the only filter.
#
# @return [Object] the value held in @only_filter, which the constructor
#   takes from params[:only_filter]
def only_filter
@only_filter
end
|
#threads_count ⇒ Object
Returns the value of attribute threads_count.
19
20
21
|
# File 'lib/wayback_machine_downloader.rb', line 19
# Reader for the number of download threads.
#
# @return [Object] the value held in @threads_count; the constructor
#   stores params[:threads_count].to_i, and download_files replaces a
#   value of 0 with 1 before starting its threads
def threads_count
@threads_count
end
|
#to_timestamp ⇒ Object
Returns the value of attribute to_timestamp.
19
20
21
|
# File 'lib/wayback_machine_downloader.rb', line 19
# Reader for the upper timestamp bound.
#
# @return [Object] the value held in @to_timestamp; the constructor
#   stores params[:to_timestamp].to_i
def to_timestamp
@to_timestamp
end
|
Instance Method Details
#backup_name ⇒ Object
36
37
38
39
40
41
42
|
# File 'lib/wayback_machine_downloader.rb', line 36
# Derives the name used for the default backup folder from the base URL.
#
# @return [String, nil] the third '/'-separated segment of @base_url when
#   it contains '//' (nil if there is no such segment); otherwise @base_url
#   unchanged
def backup_name
  return @base_url unless @base_url.include?('//')

  @base_url.split('/')[2]
end
|
#backup_path ⇒ Object
44
45
46
47
48
49
50
51
52
53
54
|
# File 'lib/wayback_machine_downloader.rb', line 44
# Returns the directory prefix (always ending in '/') that downloaded
# files are written under.
#
# @return [String] @directory with a trailing '/' ensured when a directory
#   was configured; otherwise 'websites/<backup_name>/'
def backup_path
  return 'websites/' + backup_name + '/' unless @directory

  @directory[-1] == '/' ? @directory : @directory + '/'
end
|
#download_file(file_remote_info) ⇒ Object
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
|
# File 'lib/wayback_machine_downloader.rb', line 208
# Downloads one archived file into the backup directory and reports progress.
#
# The local path is derived from the file id: an empty id maps to
# 'index.html' at the backup root, a URL ending in '/' or a last path
# segment without a '.' maps to '<id>/index.html', and anything else maps
# to the id itself. Files that already exist locally are skipped.
#
# The remote read uses URI.open and the local write uses File.open:
# Kernel#open no longer opens URLs on Ruby 3.0+, and it would treat a path
# starting with '|' as a command to run.
#
# @param file_remote_info [Hash] entry with :file_url, :file_id and
#   :timestamp keys
# @return [Object] the result of the final progress +puts+ (not meaningful)
def download_file(file_remote_info)
  current_encoding = "".encoding
  file_url = file_remote_info[:file_url].encode(current_encoding)
  file_id = file_remote_info[:file_id]
  file_timestamp = file_remote_info[:timestamp]
  file_path_elements = file_id.split('/')
  if file_id == ""
    dir_path = backup_path
    file_path = backup_path + 'index.html'
  elsif file_url[-1] == '/' || !file_path_elements[-1].include?('.')
    # Directory-like resource: store its content as index.html inside it.
    dir_path = backup_path + file_path_elements.join('/')
    file_path = backup_path + file_path_elements.join('/') + '/index.html'
  else
    dir_path = backup_path + file_path_elements[0..-2].join('/')
    file_path = backup_path + file_path_elements.join('/')
  end
  if Gem.win_platform?
    # Percent-encode characters that are not allowed in Windows file names.
    file_path = file_path.gsub(/[:*?&=<>\\|]/) { |s| '%' + s.ord.to_s(16) }
  end
  if File.exist?(file_path)
    semaphore.synchronize do
      @processed_file_count += 1
      puts "#{file_url} # #{file_path} already exists. (#{@processed_file_count}/#{file_list_by_timestamp.size})"
    end
    return
  end
  begin
    structure_dir_path dir_path
    File.open(file_path, "wb") do |file|
      begin
        URI.open("http://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri|
          file.write(uri.read)
        end
      rescue OpenURI::HTTPError => e
        puts "#{file_url} # #{e}"
        if @all
          # With the "all" option the body of the error response is kept.
          file.write(e.io.read)
          puts "#{file_path} saved anyway."
        end
      rescue StandardError => e
        puts "#{file_url} # #{e}"
      end
    end
  rescue StandardError => e
    puts "#{file_url} # #{e}"
  ensure
    # A failed download leaves an empty file behind; drop it unless "all" is set.
    if !@all && File.exist?(file_path) && File.size(file_path) == 0
      File.delete(file_path)
      puts "#{file_path} was empty and was removed."
    end
  end
  semaphore.synchronize do
    @processed_file_count += 1
    puts "#{file_url} -> #{file_path} (#{@processed_file_count}/#{file_list_by_timestamp.size})"
  end
end
|
#download_files ⇒ Object
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
|
# File 'lib/wayback_machine_downloader.rb', line 149
# Downloads every file in file_list_by_timestamp using worker threads that
# drain file_queue, printing progress and a final summary.
#
# When the list is empty, a list of possible reasons is printed and nothing
# is downloaded. At least one worker is always started: a thread count that
# is zero, negative or nil falls back to 1, so the "completed" summary is
# never printed without the queue having been processed.
#
# @return [nil]
def download_files
  start_time = Time.now
  puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
  puts
  if file_list_by_timestamp.count == 0
    puts "No files to download."
    puts "Possible reasons:"
    puts "\t* Site is not in Wayback Machine Archive."
    puts "\t* From timestamp too much in the future." if @from_timestamp && @from_timestamp != 0
    puts "\t* To timestamp too much in the past." if @to_timestamp && @to_timestamp != 0
    puts "\t* Only filter too restrictive (#{only_filter.to_s})" if @only_filter
    puts "\t* Exclude filter too wide (#{exclude_filter.to_s})" if @exclude_filter
    return
  end
  puts "#{file_list_by_timestamp.count} files to download:"
  @processed_file_count = 0
  @threads_count = 1 unless @threads_count.to_i > 0
  threads = Array.new(@threads_count) do
    Thread.new do
      until file_queue.empty?
        # Non-blocking pop: another worker may have emptied the queue
        # between the empty? check and the pop, which raises ThreadError.
        file_remote_info = begin
          file_queue.pop(true)
        rescue ThreadError
          nil
        end
        download_file(file_remote_info) if file_remote_info
      end
    end
  end
  threads.each(&:join)
  end_time = Time.now
  puts
  puts "Download completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path} (#{file_list_by_timestamp.size} files)"
end
|
#file_list_by_timestamp ⇒ Object
269
270
271
|
# File 'lib/wayback_machine_downloader.rb', line 269
# Memoized file list: the first call stores the result of
# get_file_list_by_timestamp in @file_list_by_timestamp, and later calls
# return that stored value.
#
# @return [Object] whatever get_file_list_by_timestamp returned
def file_list_by_timestamp
  return @file_list_by_timestamp if @file_list_by_timestamp

  @file_list_by_timestamp = get_file_list_by_timestamp
end
|
#file_queue ⇒ Object
265
266
267
|
# File 'lib/wayback_machine_downloader.rb', line 265
# Memoized Queue holding every entry of file_list_by_timestamp, in list
# order. Built on first use and stored in @file_queue.
#
# @return [Queue] the shared queue the download threads pop from
def file_queue
  return @file_queue if @file_queue

  pending = Queue.new
  file_list_by_timestamp.each { |file_info| pending << file_info }
  @file_queue = pending
end
|
#get_all_snapshots_to_consider ⇒ Object
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
|
# File 'lib/wayback_machine_downloader.rb', line 82
# Collects the raw snapshot listing for the base URL, printing progress
# dots as it goes.
#
# One request is made for @base_url itself. Unless @exact_url is set, up to
# @maximum_pages further requests are made for "<base_url>/*", stopping at
# the first empty page.
#
# @return [String] the concatenation of all raw listings returned by
#   get_raw_list_from_api
def get_all_snapshots_to_consider
  print "Getting snapshot pages"
  # Unary plus yields a mutable string so pages can be appended in place
  # instead of rebuilding the whole string on every page.
  snapshot_list_to_consider = +""
  snapshot_list_to_consider << get_raw_list_from_api(@base_url, nil)
  print "."
  unless @exact_url
    @maximum_pages.times do |page_index|
      snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
      break if snapshot_list.empty?
      snapshot_list_to_consider << snapshot_list
      print "."
    end
  end
  puts " found #{snapshot_list_to_consider.lines.count} snapshots to consider."
  puts
  snapshot_list_to_consider
end
|
#get_file_list_by_timestamp ⇒ Object
130
131
132
133
134
135
136
137
|
# File 'lib/wayback_machine_downloader.rb', line 130
# Turns the curated file hash into a list ordered by timestamp, newest
# first. Each entry is the original info hash with its key added under
# :file_id.
#
# @return [Array<Hash>] info hashes sorted by descending :timestamp
def get_file_list_by_timestamp
  newest_first = get_file_list_curated.sort_by { |_file_id, info| info[:timestamp] }.reverse
  newest_first.map do |file_id, info|
    info[:file_id] = file_id
    info
  end
end
|
#get_file_list_curated ⇒ Object
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
|
# File 'lib/wayback_machine_downloader.rb', line 102
# Parses the raw snapshot listing into a hash keyed by file id, keeping for
# each id the entry with the highest timestamp (ties go to the later line).
#
# Each considered line holds a 14-character timestamp, one separator
# character, then the file URL. The file id is the URL's path: everything
# after the third '/'-separated segment, URL-unescaped. Lines without a '/'
# are skipped silently. URLs with no such path position are reported as
# malformed and skipped rather than raising. URLs rejected by the exclude or
# only filters are reported and skipped.
#
# @return [Hash{String => Hash}] file id => { file_url:, timestamp: }
def get_file_list_curated
  file_list_curated = {}
  get_all_snapshots_to_consider.each_line do |line|
    next unless line.include?('/')
    file_timestamp = line[0..13].to_i
    # chomp (rather than dropping the last character) keeps the URL intact
    # when the final line has no trailing newline.
    file_url = line.chomp[15..-1]
    # nil when the URL has fewer than three '/'-separated segments.
    path_elements = file_url ? file_url.split('/')[3..-1] : nil
    file_id = nil
    if path_elements
      file_id = CGI::unescape(path_elements.join('/'))
      file_id = file_id.tidy_bytes unless file_id == ""
    end
    if file_id.nil?
      puts "Malformed file url, ignoring: #{file_url}"
    elsif match_exclude_filter(file_url)
      puts "File url matches exclude filter, ignoring: #{file_url}"
    elsif !match_only_filter(file_url)
      puts "File url doesn't match only filter, ignoring: #{file_url}"
    else
      known = file_list_curated[file_id]
      if known.nil? || known[:timestamp] <= file_timestamp
        file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
      end
    end
  end
  file_list_curated
end
|
#list_files ⇒ Object
139
140
141
142
143
144
145
146
147
|
# File 'lib/wayback_machine_downloader.rb', line 139
# Prints the file list as a JSON array, one element per line.
#
# Elements are separated by commas, with no comma after the last one, so
# the printed text is valid JSON. An empty list prints "[" and "]" only.
#
# @return [nil]
def list_files
  files = get_file_list_by_timestamp
  puts "["
  puts files.map(&:to_json).join(",\n") unless files.empty?
  puts "]"
end
|
#match_exclude_filter(file_url) ⇒ Object
69
70
71
72
73
74
75
76
77
78
79
80
|
# File 'lib/wayback_machine_downloader.rb', line 69
# Tells whether a file URL is rejected by the exclude filter.
#
# Without a filter nothing is excluded. When the filter converts to a
# regex (via its +to_regex+ method), the URL is matched against it with
# +=~+; otherwise a case-insensitive substring test is used.
#
# @param file_url [String] URL to test
# @return [Boolean, Integer, nil] false when no filter is set; the +=~+
#   result (match index or nil) for regex filters; true/false for
#   substring filters
def match_exclude_filter(file_url)
  return false unless @exclude_filter

  pattern = @exclude_filter.to_regex
  return pattern =~ file_url if pattern

  file_url.downcase.include?(@exclude_filter.downcase)
end
|
#match_only_filter(file_url) ⇒ Object
56
57
58
59
60
61
62
63
64
65
66
67
|
# File 'lib/wayback_machine_downloader.rb', line 56
# Tells whether a file URL is accepted by the only filter.
#
# Without a filter everything is accepted. When the filter converts to a
# regex (via its +to_regex+ method), the URL is matched against it with
# +=~+; otherwise a case-insensitive substring test is used.
#
# @param file_url [String] URL to test
# @return [Boolean, Integer, nil] true when no filter is set; the +=~+
#   result (match index or nil) for regex filters; true/false for
#   substring filters
def match_only_filter(file_url)
  return true unless @only_filter

  pattern = @only_filter.to_regex
  return pattern =~ file_url if pattern

  file_url.downcase.include?(@only_filter.downcase)
end
|
#semaphore ⇒ Object
273
274
275
|
# File 'lib/wayback_machine_downloader.rb', line 273
# Memoized Mutex, created on first use and stored in @semaphore.
# download_file wraps its progress counter updates and output in it.
#
# @return [Mutex]
def semaphore
  return @semaphore if @semaphore

  @semaphore = Mutex.new
end
|
#structure_dir_path(dir_path) ⇒ Object
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
|
# File 'lib/wayback_machine_downloader.rb', line 185
# Makes sure dir_path exists as a directory, creating parents as needed.
#
# If creation fails with Errno::EEXIST, the blocking path is taken from
# the error message and that file is converted into a directory: it is
# moved aside to '<path>.temp', a directory is created in its place, and
# the file is moved back in as '<path>/index.html'. Creation is then
# retried recursively.
#
# @param dir_path [String] directory that must exist
# @raise [RuntimeError] when the EEXIST message has neither known format
def structure_dir_path(dir_path)
  FileUtils.mkdir_p(dir_path) unless File.exist?(dir_path)
rescue Errno::EEXIST => e
  error_to_string = e.to_s
  puts "# #{error_to_string}"
  # The blocking path follows one of these markers in the error message.
  marker = ["File exists @ dir_s_mkdir - ", "File exists - "].find do |candidate|
    error_to_string.include?(candidate)
  end
  raise "Unhandled directory restructure error # #{error_to_string}" unless marker

  blocking_file = error_to_string.split(marker)[-1]
  parked_copy = blocking_file + '.temp'
  relocated_copy = blocking_file + '/index.html'
  FileUtils.mv(blocking_file, parked_copy)
  FileUtils.mkdir_p(blocking_file)
  FileUtils.mv(parked_copy, relocated_copy)
  puts "#{blocking_file} -> #{relocated_copy}"
  structure_dir_path dir_path
end
|