Class: WasapiClient
- Inherits: Object
  - Object
    - WasapiClient
- Defined in: lib/wasapi_client.rb, lib/wasapi_client/version.rb
Overview
Client for interacting with the Archive-It WASAPI APIs
Constant Summary
- NUM_RETRIES = 5
- VERSION = '0.2.0'
Instance Attribute Summary
-
#base_url ⇒ Object
Returns the value of attribute base_url.
-
#password ⇒ Object
Returns the value of attribute password.
-
#username ⇒ Object
Returns the value of attribute username.
Instance Method Summary
-
#connection(url) ⇒ Object
Set up an authenticated Faraday connection for the account (HTTP Basic auth, request retries, redirect following).
- #default_storage_url ⇒ Object
- #default_url ⇒ Object
-
#fetch_file(file:, output_dir:, base_url: default_storage_url) ⇒ String?
Fetch a specific file from the WASAPI storage location.
-
#fetch_warcs(collection:, output_dir:, crawl_start_after: nil, crawl_start_before: nil) ⇒ Object
Send a GET request for the URLs for WARCs and download files.
-
#filenames(collection:, crawl_start_after: nil, crawl_start_before: nil) ⇒ Array<String>
Send a GET request for WARC filenames.
-
#get_locations(collection:, crawl_start_after: nil, crawl_start_before: nil) ⇒ Array<Hash>
Send a GET request for the URLs for WARCs.
-
#initialize(username:, password:, base_url: nil) ⇒ WasapiClient
constructor
A new instance of WasapiClient.
Constructor Details
#initialize(username:, password:, base_url: nil) ⇒ WasapiClient
Returns a new instance of WasapiClient.
18 19 20 21 22 |
# File 'lib/wasapi_client.rb', line 18 def initialize(username:, password:, base_url: nil) @username = username @password = password @base_url = base_url end |
Instance Attribute Details
#base_url ⇒ Object
Returns the value of attribute base_url.
26 27 28 |
# File 'lib/wasapi_client.rb', line 26 def base_url @base_url end |
#password ⇒ Object
Returns the value of attribute password.
26 27 28 |
# File 'lib/wasapi_client.rb', line 26 def password @password end |
#username ⇒ Object
Returns the value of attribute username.
26 27 28 |
# File 'lib/wasapi_client.rb', line 26 def username @username end |
Instance Method Details
#connection(url) ⇒ Object
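Set up an authenticated Faraday connection for the given URL: errors are raised for failed responses, HTTP Basic authentication uses the account's username and password, requests are retried up to 3 times with exponential backoff, and redirects are followed.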
37 38 39 40 41 42 43 44 |
# File 'lib/wasapi_client.rb', line 37 def connection(url) Faraday.new(url:) do |conn| conn.use Faraday::Response::RaiseError conn.request :authorization, :basic, username, password conn.request :retry, max: 3, interval: 0.05, backoff_factor: 2 conn.response :follow_redirects end end |
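#default_storage_url ⇒ Object
Returns the default base URL of the WASAPI storage location, 'https://warcs.archive-it.org/webdatafile/'. #fetch_file uses it as its default base_url.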
32 33 34 |
# File 'lib/wasapi_client.rb', line 32 def default_storage_url 'https://warcs.archive-it.org/webdatafile/' end |
#default_url ⇒ Object
Returns the default Archive-It base URL, 'https://partner.archive-it.org'.
28 29 30 |
# File 'lib/wasapi_client.rb', line 28 def default_url 'https://partner.archive-it.org' end |
#fetch_file(file:, output_dir:, base_url: default_storage_url) ⇒ String?
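Fetch a specific file from the WASAPI storage location. The file argument may be a full URL or a bare filename: a value that does not start with 'http' is treated as a filename and joined onto base_url before the download.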
93 94 95 96 97 98 |
# File 'lib/wasapi_client.rb', line 93 def fetch_file(file:, output_dir:, base_url: default_storage_url) # Determine if the input is a URL or a filename file = URI.join(base_url, file).to_s unless file.start_with?('http') download(url: file, output_dir:) end |
#fetch_warcs(collection:, output_dir:, crawl_start_after: nil, crawl_start_before: nil) ⇒ Object
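Send a GET request for the URLs for WARCs and download the files into output_dir. Response will be paginated. Returns nil when no locations are found. A file that already exists in output_dir with the expected MD5 checksum is skipped; otherwise the download is attempted up to NUM_RETRIES times, and an error is raised if the file still fails checksum validation.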
52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 |
# File 'lib/wasapi_client.rb', line 52 def fetch_warcs(collection:, output_dir:, crawl_start_after: nil, crawl_start_before: nil) locations = get_locations(collection:, crawl_start_after:, crawl_start_before:) return nil if locations.empty? FileUtils.mkdir_p(output_dir) unless Dir.exist?(output_dir) locations.each do |location| # See if the file already exists and has the correct checksum filepath = File.join(output_dir, File.basename(location[:url])) next if checksum_valid?(filepath:, expected_md5: location[:md5]) retries = 0 until (valid = checksum_valid?(filepath:, expected_md5: location[:md5])) || retries >= NUM_RETRIES fetch_file(file: location[:url], output_dir:) retries += 1 end raise "Failed to fetch a valid file for #{location[:url]} after #{NUM_RETRIES} retries" unless valid end end |
#filenames(collection:, crawl_start_after: nil, crawl_start_before: nil) ⇒ Array<String>
Send a GET request for WARC filenames (the basename of each WARC location URL).
105 106 107 108 |
# File 'lib/wasapi_client.rb', line 105 def filenames(collection:, crawl_start_after: nil, crawl_start_before: nil) locations = get_locations(collection:, crawl_start_after:, crawl_start_before:) locations.map { |location| File.basename(location[:url]) } end |
#get_locations(collection:, crawl_start_after: nil, crawl_start_before: nil) ⇒ Array<Hash>
Send a GET request for the URLs for WARCs. Response will be paginated.
78 79 80 81 82 83 84 85 86 87 |
# File 'lib/wasapi_client.rb', line 78 def get_locations(collection:, crawl_start_after: nil, crawl_start_before: nil) params = { 'collection': collection, 'crawl-start-after': crawl_start_after, 'crawl-start-before': crawl_start_before } response = query(params:) extract_files(response:, params:) end |