Top Level Namespace

Defined Under Namespace

Modules: DirectLink

Instance Method Summary

#DirectLink(link, timeout = nil, proxy = nil, giveup: false, ignore_meta: false) ⇒ Object

Instance Method Details

#DirectLink(link, timeout = nil, proxy = nil, giveup: false, ignore_meta: false) ⇒ Object



# File 'lib/directlink.rb', line 312

def DirectLink link, timeout = nil, proxy = nil, giveup: false, ignore_meta: false
  timeout ||= DirectLink.timeout
  raise ArgumentError.new("link should be a <String>, not <#{link.class}>") unless link.is_a? String
  begin
    URI link
  rescue URI::InvalidURIError
    require "addressable"
    link = Addressable::URI.escape link
  end
  raise DirectLink::ErrorBadLink.new link, true unless URI(link).host

  struct = Module.const_get(__callee__).class_variable_get :@@directlink
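  # (the Struct class -- fields url, width, height, type -- that every branch below instantiates)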

  # googleusercontent and blogspot hosts are special-cased before everything else
  google_without_schema_crutch = lambda do
    if %w{ lh3 lh4 lh5 lh6 }.any?{ |lh| [lh, "googleusercontent", "com"] == URI(link).host.split(?.).last(3) } ||
       %w{ bp blogspot com } == URI(link).host.split(?.).last(3)
      u = DirectLink.google link
      f = FastImage.new(u, raise_on_failure: true, http_header: {"User-Agent" => "Mozilla"})
      w, h = f.size
      struct.new u, w, h, f.type
    end
  end
  t = google_without_schema_crutch[] and return t

  # to test that we won't hang for too long if a host like aeronautica.difesa.it goes silent for some reason:
  #   $ bundle console
  #   > NetHTTPUtils.logger.level = Logger::DEBUG
  #   > NetHTTPUtils.request_data "http://www.aeronautica.difesa.it/organizzazione/REPARTI/divolo/PublishingImages/6%C2%B0%20Stormo/2013-decollo%20al%20tramonto%20REX%201280.jpg",
  #                               max_read_retry_delay: 5, timeout: 5

  begin
    header = {
      "User-Agent" => "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
      # the over18 cookie keeps reddit from redirecting NSFW posts to the age gate
      **( %w{ reddit com } == URI(link).host.split(?.).last(2) ||
          %w{   redd it  } == URI(link).host.split(?.) ? {Cookie: "over18=1"} : {} ),
    }
    head = NetHTTPUtils.request_data link, :HEAD, header: header, **(proxy ? {proxy: proxy} : {}), **(timeout ? {
      timeout: timeout,
      max_start_http_retry_delay: timeout,
      max_read_retry_delay: timeout,
    } : {})
  rescue Net::ReadTimeout, Errno::ETIMEDOUT
    # the host went silent -- keep the original link and let the adapters below try it
  rescue NetHTTPUtils::Error => e
    raise unless 418 == e.code   # tolerate 418 responses to HEAD; anything else is fatal
  else
    raise DirectLink::ErrorAssert.new "last_response.uri is not set" unless head.instance_variable_get(:@last_response).uri
    link = head.instance_variable_get(:@last_response).uri.to_s
  end

  # why do we resolve redirects before trying the known adapters?
  #   because they can be hidden behind URL shorteners
  #   also it can resolve NetHTTPUtils::Error(404) before trying the adapter

  t = google_without_schema_crutch[] and return t   # TODO: why again?

  begin
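    # DirectLink.imgur needs an imgur API key in the environment; without one,
    # ErrorMissingEnvVar is rescued below and we fall through to the generic branch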
    imgur = DirectLink.imgur(link, timeout).sort_by{ |u, w, h, t| - w * h }.map do |u, w, h, t|
      struct.new u, w, h, t
    end
    # the return value of `DirectLink.imgur` is always an Array, even for a single image
    return imgur.size == 1 ? imgur.first : imgur
  rescue DirectLink::ErrorMissingEnvVar
  end if %w{ imgur com } == URI(link).host.split(?.).last(2)

  if %w{ 500px com } == URI(link).host.split(?.).last(2)
    w, h, u, t = DirectLink._500px(link)
    return struct.new u, w, h, t
  end

  begin
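    # DirectLink.flickr likewise needs an API key; ErrorMissingEnvVar falls through to the generic branch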
    w, h, u = DirectLink.flickr(link)
    f = FastImage.new(u, raise_on_failure: true) # , http_header: {"User-Agent" => "Mozilla"}
    return struct.new u, w, h, f.type
  rescue DirectLink::ErrorMissingEnvVar
  end if %w{ www flickr com } == URI(link).host.split(?.) ||
         %w{     flic kr    } == URI(link).host.split(?.)

  if %w{         wikipedia org } == URI(link).host.split(?.).last(2) ||
     %w{ commons wikimedia org } == URI(link).host.split(?.)
    u = DirectLink.wiki link
    f = FastImage.new(u, raise_on_failure: true) # , http_header: {"User-Agent" => "Mozilla"}
    w, h = f.size
    return struct.new u, w, h, f.type
  end

  # TODO protect in two places from eternal recursion

  begin
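    # `s` is truthy when reddit resolved the post to media directly; otherwise `u` is
    # the post's markdown self-text, mined below for links that are retried one by one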
    s, u = DirectLink.reddit(link)
    unless s
      raise DirectLink::ErrorBadLink.new link if giveup   # TODO: print original url in such cases if there was a recursion
      f = ->_{ _.type == :a ? _.attr["href"] : _.children.flat_map(&f) }
      require "kramdown"
      return f[Kramdown::Document.new(u).root].flat_map do |sublink|
        DirectLink URI.join(link, sublink).to_s, timeout, giveup: giveup   # TODO: maybe subtract from timeout the time we've already wasted
      end
    end
    if u.is_a? Hash
      return struct.new *u.values_at(*%w{ fallback_url width height }), "video"
    elsif u.is_a? Array
      return u.map do |t, x, y, u|
        struct.new u, x, y, t
      end
    end
    raise DirectLink::ErrorNotFound.new link.inspect if link == u
    return DirectLink u, timeout, giveup: giveup
  rescue DirectLink::ErrorMissingEnvVar
  end if %w{ reddit com } == URI(link).host.split(?.).last(2) ||
         %w{   redd it  } == URI(link).host.split(?.)

  begin
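    # presumably needs VK credentials in the environment too (hence the rescue below);
    # width and height come back but no type, so the struct's type field is left nil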
    return DirectLink.vk(link).map do |w, h, u|
      struct.new u, w, h
    end
  rescue DirectLink::ErrorMissingEnvVar
  end if %w{ vk com } == URI(link).host.split(?.)

  begin
    f = FastImage.new link,
      raise_on_failure: true,
      timeout: timeout,
      **(proxy ? {proxy: "http://#{proxy}"} : {}),
      http_header: {"User-Agent" => "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"}
  rescue FastImage::UnknownImageType
    raise if giveup
    require "nokogiri"
    head = NetHTTPUtils.request_data link, :HEAD, header: {"User-Agent" => "Mozilla"},
      max_start_http_retry_delay: timeout,
      timeout: timeout,                 # NetHTTPUtils passes this as read_timeout to Net::HTTP.start
      max_read_retry_delay: timeout     # and then compares accumulated delay to this
    # if we used :GET here we would download megabytes of data just to give up on a content type we can't process
    case head.instance_variable_get(:@last_response).content_type   # webmock should provide this
    when "text/html" ; nil
    else ; raise
    end
    html = Nokogiri::HTML NetHTTPUtils.request_data link, :GET, header: {"User-Agent" => "Mozilla"}
    if t = html.at_css("meta[@property='og:image']")
      begin
        return DirectLink URI.join(link, t[:content]).to_s, nil, *proxy, giveup: true
      rescue URI::InvalidURIError
      end
    end unless ignore_meta
    h = {}  # TODO: maybe move it outside because of possible img[:src] recursion?...
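    # depth-first walk: resolve every <img> src (memoized in `h`), keying each result by
    # its ancestor tag path; the group with the largest total pixel area is returned below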
    l = lambda do |node, s = []|
      node.element_children.flat_map do |child|
        next l[child, s + [child.node_name]] unless "img" == child.node_name
        begin
          [[s, (h[child[:src]] = h[child[:src]] || DirectLink(URI.join(link, child[:src]).to_s, nil, giveup: true))]]  # ... or wait, do we giveup?
        rescue => e
          DirectLink.logger.error "#{e} (from no giveup)"
          []
        end
      end
    end
    l[html].
      tap{ |results| raise if results.empty? }.
      group_by(&:first).map{ |k, v| [k.join(?>), v.map(&:last)] }.
      max_by{ |_, v| v.map{ |i| i.width * i.height }.inject(:+) }.last
  else
    # TODO: maybe move this to right before `rescue` line
    w, h = f.size
    struct.new f.instance_variable_get(:@parsed_uri).to_s, w, h, f.type
  end
end
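
A minimal usage sketch (not part of the generated docs; the example.com URLs are placeholders, and the directlink gem must be installed). It shows the two return shapes: a single struct for one image, and an Array of structs when the source expands to several.

  require "directlink"

  # a plain image URL ends up in the generic FastImage branch, so no API keys are involved
  image = DirectLink "https://example.com/picture.jpg", 30   # explicit 30-second timeout
  image.url     # => final URL after redirect resolution
  image.width   # => e.g. 1280
  image.height  # => e.g. 720
  image.type    # => e.g. :jpeg

  # album/post hosts (imgur, reddit, vk, ...) may yield several images, so a generic
  # caller normalizes struct-or-Array by hand ([*result] would splat the Struct's fields):
  result = DirectLink "https://example.com/gallery-page", 30
  images = result.is_a?(Array) ? result : [result]

Passing giveup: true makes the method raise on unrecognized content instead of scraping the page for its <img> tags.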