Module: WebArchive

Defined in:
lib/webarchive.rb,
lib/webarchive/version.rb

Overview

Classes and functions of the webarchive package.

Defined Under Namespace

Classes: ArchiveQueue, Completer

Constant Summary

HISTORY_FILE =
'~/.webarchive_history'
VERSION =
'0.1.2'

Class Method Summary

Class Method Details

.launch(wait_secs: 1, debug: false, verbose: false, redirect: false, canonical_uri: true, history: true) ⇒ Object



150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
# File 'lib/webarchive.rb', line 150

# Runs the interactive prompt: reads URIs line by line with Readline and
# pushes each one onto three archive queues (archive.org, megalodon.jp and
# archive.today). Reading stops when Readline returns nil, after which every
# queue is told that no more input is coming.
#
# Side effect: sets Thread.abort_on_exception to true for the whole process.
#
# @param wait_secs [Numeric] passed unchanged to every ArchiveQueue
#   (presumably the pause between requests -- confirm against ArchiveQueue)
# @param debug [Boolean] implies +verbose+; additionally writes the
#   megalodon.jp response body to a Tempfile
# @param verbose [Boolean] print the normalized input URI and the URIs
#   reported by the archive services
# @param redirect [Boolean] forwarded to with_canonical_uri_and_redirect
# @param canonical_uri [Boolean] forwarded to with_canonical_uri_and_redirect
# @param history [Boolean] when true, install a Completer backed by
#   HISTORY_FILE and append every accepted URI to it
# @return [Array<ArchiveQueue>] the queues, after done_sending was called
def self.launch(wait_secs: 1, debug: false, verbose: false, redirect: false, canonical_uri: true, history: true)
  verbose = true if debug
  Thread.abort_on_exception = true

  completer = nil
  if history
    completer = Completer.new(HISTORY_FILE)
    Readline.completion_proc = completer.to_proc
    Readline.completion_append_character = ''
  end

  queues = []

  queues << ArchiveQueue.new('archive.org', wait_secs) do |uri|
    URI.parse("https://web.archive.org/save/#{uri}").open do |f|
      location = f.meta['content-location']
      if verbose
        # Without a content-location header, dump all headers instead.
        puts(location ? "<https://web.archive.org#{location}>" : f.meta.inspect)
      end
    end
  end

  queues << ArchiveQueue.new('megalodon.jp', wait_secs) do |uri|
    agent = Mechanize.new
    begin
      page = agent.get("https://megalodon.jp/pc/?#{Addressable::URI.form_encode(url: uri)}")
      res = agent.submit(page.forms.first)
      if debug
        Tempfile.open("#{self}-#{uri.gsub(/\W/, '_')}") do |f|
          f.puts res.body
        end
      end
      og = res.at('meta[property="og:url"]')
      archived = if og
                   og[:content]
                 else
                   # Fall back to the first link that looks like a snapshot,
                   # or to the response URI when there is none.
                   res.links.map(&:href).find(-> { res.uri.to_s }) do |href|
                     href =~ %r{megalodon\.jp/[\d-]+/}
                   end
                 end
      puts "<#{archived}>" if verbose
    ensure
      # Always release the agent, even when a request above raised.
      agent.shutdown
    end
  end

  queues << ArchiveQueue.new('archive.today', wait_secs) do |uri|
    agent = Mechanize.new
    begin
      agent.follow_meta_refresh = true

      page = agent.get('https://archive.today/')
      form = page.form_with(id: 'submiturl')
      form['anyway'] = '1'
      form.field_with(name: 'url').value = uri
      page = agent.submit(form)
      puts "<#{page.uri}>" if verbose
    ensure
      # Always release the agent, even when a request above raised.
      agent.shutdown
    end
  end

  uri_regexp = URI::DEFAULT_PARSER.make_regexp
  while (line = Readline.readline("Q(#{queues.sum(&:remaining)})> ", add_hist: true))
    uri = begin
      to_ascii_uri(line).to_s
    rescue Addressable::URI::InvalidURIError => e
      warn_archive_fail(line.strip, '<>', e.message)
      ''
    end
    # Blank input and unparsable input both end up as an empty string.
    next if uri.empty?

    puts uri if verbose

    unless uri =~ uri_regexp
      warn "invalid; skipping '#{uri}'"
      next
    end

    queues.each { |q| q << uri }
    begin
      # Also archive the redirect target and/or canonical URI, if any.
      with_canonical_uri_and_redirect(uri, canonical_uri, redirect) do |extra|
        queues.each { |q| q << extra }
      end
    rescue StandardError => e
      warn "skipping canonical/redirect for #{uri}: #{e.message}"
    end

    completer&.append_to_history(uri)
  end

  queues.each(&:done_sending)
  # TODO: trap INT and ask for confirmation
end

.my_normalize(str) ⇒ Object



71
72
73
74
75
76
77
# File 'lib/webarchive.rb', line 71

# Percent-encodes +str+ with Addressable::URI.encode when it contains at
# least one non-ASCII character; anything else is handed back untouched.
# +nil+ passes straight through because it never matches the pattern.
#
# @param str [String, nil] a URI component such as a path, query or fragment
# @return [String, nil] the encoded string, or +str+ itself when no
#   encoding was needed
def self.my_normalize(str)
  return str unless str =~ /[^[:ascii:]]/

  Addressable::URI.encode(str)
end

.to_ascii_uri(str) ⇒ Object



79
80
81
82
83
84
85
86
87
88
89
90
91
# File 'lib/webarchive.rb', line 79

# Turns a line of user input into an ASCII-only Addressable::URI.
#
# Input that looks like a bare host name (a dot followed by 2-4 lowercase
# letters, then "/" or end of line) and carries no scheme is prefixed with
# "http://". The host is then converted with SimpleIDN.to_ascii, and the
# path, query and fragment are run through my_normalize.
#
# @param str [String] raw input; surrounding whitespace is ignored
# @return [Addressable::URI] the parsed and normalized URI
# @raise [Addressable::URI::InvalidURIError] propagated from
#   Addressable::URI.parse for unparsable input
def self.to_ascii_uri(str)
  uri = str.strip

  looks_like_host = /\.[a-z]{2,4}(\/|$)/.match(uri)
  # Test for an actual http(s) scheme rather than the literal prefix "http",
  # so hosts such as "httpbin.org" still receive a scheme.
  has_scheme = uri.include?('://') || uri.match?(/\Ahttps?:/i)
  uri = "http://#{uri}" if looks_like_host && !has_scheme

  u = Addressable::URI.parse(uri)
  # Skip IDN conversion when there is no host, so it is never given nil.
  u.host = SimpleIDN.to_ascii(u.host) if u.host
  u.path = my_normalize(u.path)
  u.query = my_normalize(u.query)
  u.fragment = my_normalize(u.fragment)
  u
end

.with_canonical_uri_and_redirect(uri, canonical, redirect) ⇒ Object



93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# File 'lib/webarchive.rb', line 93

# Yields additional URIs related to +uri+ that are worth archiving as well.
#
# * With +redirect+ set, a GET is issued and, when the response carries a
#   Location header that differs from +uri+, the target is yielded. Relative
#   Location values are resolved against +uri+ first.
# * With +canonical+ set, the page is fetched with Mechanize and its
#   canonical URI is yielded when it differs from both +uri+ and the URI of
#   the fetched page.
#
# Mechanize::ResponseCodeError is swallowed. Net::HTTP.get_response returns
# 4xx/5xx responses instead of raising, so it needs no rescue for them.
# Other errors (DNS, timeouts, ...) propagate to the caller.
#
# @param uri [String] absolute URI to inspect
# @param canonical [Boolean] look for a canonical URI on the page
# @param redirect [Boolean] look for an HTTP redirect target
# @yieldparam extra [String] a redirect target or canonical URI
# @return [void]
def self.with_canonical_uri_and_redirect(uri, canonical, redirect)
  if redirect
    location = Net::HTTP.get_response(URI.parse(uri))['location']
    if location
      target = begin
        # Location may be relative; make it absolute before handing it on.
        URI.join(uri, location).to_s
      rescue URI::Error
        # Unparsable header: fall back to the raw value.
        location
      end
      yield target if target != uri
    end
  end

  if canonical
    agent = Mechanize.new
    begin
      page = agent.get(uri)
      # Non-HTML responses are not Mechanize::Page and carry no canonical URI.
      canon = page.is_a?(Mechanize::Page) ? page.canonical_uri : nil
      yield canon.to_s if canon && canon.to_s != uri && canon != page.uri
    ensure
      # Release the agent even if the request or the block raised.
      agent.shutdown
    end
  end
rescue Mechanize::ResponseCodeError
  # ignore since it will cause a warning later anyway
end