Top Level Namespace

Defined Under Namespace

Modules: DirectLink

Instance Method Summary

#DirectLink(link, timeout = nil, proxy = nil, giveup: false, ignore_meta: false) ⇒ Object

Instance Method Details

#DirectLink(link, timeout = nil, proxy = nil, giveup: false, ignore_meta: false) ⇒ Object



# File 'lib/directlink.rb', line 312

def DirectLink link, timeout = nil, proxy = nil, giveup: false, ignore_meta: false
  timeout ||= DirectLink.timeout
  raise ArgumentError.new("link should be a <String>, not <#{link.class}>") unless link.is_a? String
  begin
    URI link
  rescue URI::InvalidURIError
    require "addressable"
    link = Addressable::URI.escape link
  end
  raise DirectLink::ErrorBadLink.new link, true unless URI(link).host

  struct = Module.const_get(__callee__).class_variable_get :@@directlink
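  # (the Struct class -- fields url, width, height, type -- that every branch below instantiates)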

  # googleusercontent and blogspot hosts are special-cased before everything else
  google_without_schema_crutch = lambda do
    if %w{ lh3 lh4 lh5 lh6 }.any?{ |lh| [lh, "googleusercontent", "com"] == URI(link).host.split(?.).last(3) } ||
       %w{ bp blogspot com } == URI(link).host.split(?.).last(3)
      u = DirectLink.google link
      f = FastImage.new(u, raise_on_failure: true, http_header: {"User-Agent" => "Mozilla"})
      w, h = f.size
      struct.new u, w, h, f.type
    end
  end
  t = google_without_schema_crutch[] and return t

  # to test that we won't hang for too long if a host like aeronautica.difesa.it goes silent for some reason:
  #   $ bundle console
  #   > NetHTTPUtils.logger.level = Logger::DEBUG
  #   > NetHTTPUtils.request_data "http://www.aeronautica.difesa.it/organizzazione/REPARTI/divolo/PublishingImages/6%C2%B0%20Stormo/2013-decollo%20al%20tramonto%20REX%201280.jpg",
  #                               max_read_retry_delay: 5, timeout: 5

  begin
    header = {
      "User-Agent" => "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
      # the over18 cookie keeps reddit from redirecting NSFW posts to the age gate
      **( %w{ reddit com } == URI(link).host.split(?.).last(2) ||
          %w{   redd it  } == URI(link).host.split(?.) ? {Cookie: "over18=1"} : {} ),
    }
    head = NetHTTPUtils.request_data link, :HEAD, header: header, **(proxy ? {proxy: proxy} : {}), **(timeout ? {
      timeout: timeout,
      max_start_http_retry_delay: timeout,
      max_read_retry_delay: timeout,
    } : {})
  rescue Net::ReadTimeout, Errno::ETIMEDOUT
    # the host went silent -- keep the original link and let the adapters below try it
  rescue NetHTTPUtils::Error => e
    raise unless 418 == e.code   # tolerate 418 responses to HEAD; anything else is fatal
  else
    raise DirectLink::ErrorAssert.new "last_response.uri is not set" unless head.instance_variable_get(:@last_response).uri
    link = head.instance_variable_get(:@last_response).uri.to_s
  end

  # why do we resolve redirects before trying the known adapters?
  #   because they can be hidden behind URL shorteners
  #   also it can resolve NetHTTPUtils::Error(404) before trying the adapter

  t = google_without_schema_crutch[] and return t   # TODO: why again?

  begin
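    # DirectLink.imgur needs an imgur API key in the environment; without one,
    # ErrorMissingEnvVar is rescued below and we fall through to the generic branch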
    imgur = DirectLink.imgur(link, timeout).sort_by{ |u, w, h, t| - w * h }.map do |u, w, h, t|
      struct.new u, w, h, t
    end
    # the return value of `DirectLink.imgur` is always an Array, even for a single image
    return imgur.size == 1 ? imgur.first : imgur
  rescue DirectLink::ErrorMissingEnvVar
  end if %w{ imgur com } == URI(link).host.split(?.).last(2)

  if %w{ 500px com } == URI(link).host.split(?.).last(2)
    w, h, u, t = DirectLink._500px(link)
    return struct.new u, w, h, t
  end

  begin
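    # DirectLink.flickr likewise needs an API key; ErrorMissingEnvVar falls through to the generic branch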
    w, h, u = DirectLink.flickr(link)
    f = FastImage.new(u, raise_on_failure: true) # , http_header: {"User-Agent" => "Mozilla"}
    return struct.new u, w, h, f.type
  rescue DirectLink::ErrorMissingEnvVar
  end if %w{ www flickr com } == URI(link).host.split(?.) ||
         %w{     flic kr    } == URI(link).host.split(?.)

  if %w{         wikipedia org } == URI(link).host.split(?.).last(2) ||
     %w{ commons wikimedia org } == URI(link).host.split(?.)
    u = DirectLink.wiki link
    f = FastImage.new(u, raise_on_failure: true) # , http_header: {"User-Agent" => "Mozilla"}
    w, h = f.size
    return struct.new u, w, h, f.type
  end

  # TODO protect in two places from eternal recursion

  begin
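    # `s` is truthy when reddit resolved the post to media directly; otherwise `u` is
    # the post's markdown self-text, mined below for links that are retried one by one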
    s, u = DirectLink.reddit(link)
    unless s
      raise DirectLink::ErrorBadLink.new link if giveup   # TODO: print original url in such cases if there was a recursion
      f = ->_{ _.type == :a ? _.attr["href"] : _.children.flat_map(&f) }
      require "kramdown"
      return f[Kramdown::Document.new(u).root].flat_map do |sublink|
        DirectLink URI.join(link, sublink).to_s, timeout, giveup: giveup   # TODO: maybe subtract from timeout the time we've already wasted
      end
    end
    if u.is_a? Hash
      return struct.new *u.values_at(*%w{ fallback_url width height }), "video"
    elsif u.is_a? Array
      return u.map do |t, x, y, u|
        struct.new u, x, y, t
      end
    end
    raise DirectLink::ErrorNotFound.new link.inspect if link == u
    return DirectLink u, timeout, giveup: giveup
  rescue DirectLink::ErrorMissingEnvVar
  end if %w{ reddit com } == URI(link).host.split(?.).last(2) ||
         %w{   redd it  } == URI(link).host.split(?.)

  begin
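    # presumably needs VK credentials in the environment too (hence the rescue below);
    # width and height come back but no type, so the struct's type field is left nil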
    return DirectLink.vk(link).map do |w, h, u|
      struct.new u, w, h
    end
  rescue DirectLink::ErrorMissingEnvVar
  end if %w{ vk com } == URI(link).host.split(?.)

  begin
    f = FastImage.new link,
      raise_on_failure: true,
      timeout: timeout,
      **(proxy ? {proxy: "http://#{proxy}"} : {}),
      http_header: {"User-Agent" => "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"}
  rescue FastImage::UnknownImageType
    raise if giveup
    require "nokogiri"
    head = NetHTTPUtils.request_data link, :HEAD, header: {"User-Agent" => "Mozilla"},
      max_start_http_retry_delay: timeout,
      timeout: timeout,                 # NetHTTPUtils passes this as read_timeout to Net::HTTP.start
      max_read_retry_delay: timeout     # and then compares accumulated delay to this
    # if we used :GET here we would download megabytes of data just to give up on a content type we can't process
    case head.instance_variable_get(:@last_response).content_type   # webmock should provide this
    when "text/html" ; nil
    else ; raise
    end
    html = Nokogiri::HTML NetHTTPUtils.request_data link, :GET, header: {"User-Agent" => "Mozilla"}
    if t = html.at_css("meta[@property='og:image']")
      begin
        return DirectLink URI.join(link, t[:content]).to_s, nil, *proxy, giveup: true
      rescue URI::InvalidURIError
      end
    end unless ignore_meta
    h = {}  # TODO: maybe move it outside because of possible img[:src] recursion?...
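    # depth-first walk: resolve every <img> src (memoized in `h`), keying each result by
    # its ancestor tag path; the group with the largest total pixel area is returned below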
    l = lambda do |node, s = []|
      node.element_children.flat_map do |child|
        next l[child, s + [child.node_name]] unless "img" == child.node_name
        begin
          [[s, (h[child[:src]] = h[child[:src]] || DirectLink(URI.join(link, child[:src]).to_s, nil, giveup: true))]]  # ... or wait, do we giveup?
        rescue => e
          DirectLink.logger.error "#{e} (from no giveup)"
          []
        end
      end
    end
    l[html].
      tap{ |results| raise if results.empty? }.
      group_by(&:first).map{ |k, v| [k.join(?>), v.map(&:last)] }.
      max_by{ |_, v| v.map{ |i| i.width * i.height }.inject(:+) }.last
  else
    # TODO: maybe move this to right before `rescue` line
    w, h = f.size
    struct.new f.instance_variable_get(:@parsed_uri).to_s, w, h, f.type
  end
end
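
A minimal usage sketch (not part of the generated docs; the example.com URLs are placeholders, and the directlink gem must be installed). It shows the two return shapes: a single struct for one image, and an Array of structs when the source expands to several.

  require "directlink"

  # a plain image URL ends up in the generic FastImage branch, so no API keys are involved
  image = DirectLink "https://example.com/picture.jpg", 30   # explicit 30-second timeout
  image.url     # => final URL after redirect resolution
  image.width   # => e.g. 1280
  image.height  # => e.g. 720
  image.type    # => e.g. :jpeg

  # album/post hosts (imgur, reddit, vk, ...) may yield several images, so a generic
  # caller normalizes struct-or-Array by hand ([*result] would splat the Struct's fields):
  result = DirectLink "https://example.com/gallery-page", 30
  images = result.is_a?(Array) ? result : [result]

Passing giveup: true makes the method raise on unrecognized content instead of scraping the page for its <img> tags.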