Top Level Namespace
Defined Under Namespace
Modules: DirectLink
Instance Method Summary collapse
Instance Method Details
#DirectLink(link, timeout = nil, proxy = nil, giveup: false, ignore_meta: false) ⇒ Object
312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 |
# File 'lib/directlink.rb', line 312

# Resolve +link+ (a page URL, URL shortener, gallery or direct file link) into
# direct image/video URL(s).
#
# Known hosts (Google-hosted images, imgur, 500px, Flickr, Wikipedia/Wikimedia,
# Reddit, VK) are handled by dedicated adapters; anything else is probed with
# FastImage and, failing that, scraped from the page HTML.
#
# @param link        [String]        URL to resolve
# @param timeout     [Numeric, nil]  network timeout; defaults to DirectLink.timeout
# @param proxy       [String, nil]   HTTP proxy (presumably "host:port" — the
#                                    FastImage path prefixes it with "http://")
# @param giveup      [Boolean]       when true, raise instead of falling back to
#                                    HTML scraping / recursion
# @param ignore_meta [Boolean]       when true, skip the <meta property="og:image">
#                                    fallback during HTML scraping
# @return [Struct, Array<Struct>] struct(s) of (url, width, height, type)
# @raise [ArgumentError] if +link+ is not a String
# @raise [DirectLink::ErrorBadLink] if the link has no host or cannot be resolved
def DirectLink link, timeout = nil, proxy = nil, giveup: false, ignore_meta: false
  timeout ||= DirectLink.timeout
  # FIX: the original built the ArgumentError but never raised it, so the type
  # check was a no-op; `raise` restores the intended validation.
  raise ArgumentError.new("link should be a <String>, not <#{link.class}>") unless link.is_a? String
  begin
    URI link
  rescue URI::InvalidURIError
    require "addressable"
    link = Addressable::URI.escape link
  end
  raise DirectLink::ErrorBadLink.new link, true unless URI(link).host
  # The result Struct class (url, width, height, type) stored on the DirectLink module.
  struct = Module.const_get(__callee__).class_variable_get :@@directlink

  # Google-hosted images (lh3..lh6.googleusercontent.com, *.bp.blogspot.com)
  # need no site API: resolve via DirectLink.google and probe with FastImage.
  # NOTE: must re-read `link` on every call — it is reassigned below after
  # redirect resolution, and this lambda is invoked both before and after that.
  google_without_schema_crutch = lambda do
    last3 = URI(link).host.split(?.).last(3)
    if (%w{ lh3 lh4 lh5 lh6 }.include?(last3.first) && %w{ googleusercontent com } == last3.last(2)) ||
       %w{ bp blogspot com } == last3
      u = DirectLink.google link
      f = FastImage.new(u, raise_on_failure: true, http_header: {"User-Agent" => "Mozilla"})
      w, h = f.size
      struct.new u, w, h, f.type
    end
  end
  t = google_without_schema_crutch[] and return t

  # to test that we won't hang for too long if someone like aeronautica.difesa.it
  # will be silent for some reason:
  # $ bundle console
  # > NetHTTPUtils.logger.level = Logger::DEBUG
  # > NetHTTPUtils.request_data "http://www.aeronautica.difesa.it/organizzazione/REPARTI/divolo/PublishingImages/6%C2%B0%20Stormo/2013-decollo%20al%20tramonto%20REX%201280.jpg",
  #     max_read_retry_delay: 5, timeout: 5
  begin
    header = {
      "User-Agent" => "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
      # Reddit hides NSFW posts behind an age gate unless this cookie is set.
      **( %w{ reddit com } == URI(link).host.split(?.).last(2) ||
          %w{ redd it } == URI(link).host.split(?.) ? {Cookie: "over18=1"} : {} ),
    }
    head = NetHTTPUtils.request_data link, :HEAD, header: header,
      **(proxy ? {proxy: proxy} : {}),
      **(timeout ? {
        timeout: timeout,
        max_start_http_retry_delay: timeout,
        max_read_retry_delay: timeout,
      } : {})
  rescue Net::ReadTimeout, Errno::ETIMEDOUT
    # tolerate silent hosts — continue with the original link
  rescue NetHTTPUtils::Error => e
    raise unless 418 == e.code  # some hosts answer HEAD with 418 — ignore and continue
  else
    raise DirectLink::ErrorAssert.new "last_response.uri is not set" unless head.instance_variable_get(:@last_response).uri
    link = head.instance_variable_get(:@last_response).uri.to_s
  end
  # why do we resolve redirects before trying the known adapters?
  # because they can be hidden behind URL shorteners
  # also it can resolve NetHTTPUtils::Error(404) before trying the adapter
  t = google_without_schema_crutch[] and return t  # TODO: why again?

  begin
    imgur = DirectLink.imgur(link, timeout).sort_by{ |u, w, h, t| - w * h }.map do |u, w, h, t|
      struct.new u, w, h, t
    end
    # `DirectLink.imgur` return value is always an Array
    return imgur.size == 1 ? imgur.first : imgur
  rescue DirectLink::ErrorMissingEnvVar
    # no API credentials — fall through to the generic FastImage path below
  end if %w{ imgur com } == URI(link).host.split(?.).last(2)

  if %w{ 500px com } == URI(link).host.split(?.).last(2)
    w, h, u, t = DirectLink._500px(link)
    return struct.new u, w, h, t
  end

  begin
    w, h, u = DirectLink.flickr(link)
    f = FastImage.new(u, raise_on_failure: true)  # , http_header: {"User-Agent" => "Mozilla"}
    return struct.new u, w, h, f.type
  rescue DirectLink::ErrorMissingEnvVar
  end if %w{ www flickr com } == URI(link).host.split(?.) ||
         %w{ flic kr } == URI(link).host.split(?.)

  if %w{ wikipedia org } == URI(link).host.split(?.).last(2) ||
     %w{ commons wikimedia org } == URI(link).host.split(?.)
    u = DirectLink.wiki link
    f = FastImage.new(u, raise_on_failure: true)  # , http_header: {"User-Agent" => "Mozilla"}
    w, h = f.size
    return struct.new u, w, h, f.type
  end

  # TODO protect in two places from eternal recursion
  begin
    s, u = DirectLink.reddit(link)
    unless s
      raise DirectLink::ErrorBadLink.new link if giveup  # TODO: print original url in such cases if there was a recursion
      # self-post: collect every <a href> from the rendered markdown and recurse
      f = ->_{ _.type == :a ? _.attr["href"] : _.children.flat_map(&f) }
      require "kramdown"
      return f[Kramdown::Document.new(u).root].flat_map do |sublink|
        DirectLink URI.join(link, sublink).to_s, timeout, giveup: giveup  # TODO: maybe subtract from timeout the time we've already wasted
      end
    end
    if u.is_a? Hash
      return struct.new *u.values_at(*%w{ fallback_url width height }), "video"
    elsif u.is_a? Array
      return u.map do |t, x, y, u|
        struct.new u, x, y, t
      end
    end
    raise DirectLink::ErrorNotFound.new link.inspect if link == u
    return DirectLink u, timeout, giveup: giveup
  rescue DirectLink::ErrorMissingEnvVar
  end if %w{ reddit com } == URI(link).host.split(?.).last(2) ||
         %w{ redd it } == URI(link).host.split(?.)

  begin
    return DirectLink.vk(link).map do |w, h, u|
      struct.new u, w, h
    end
  rescue DirectLink::ErrorMissingEnvVar
  end if %w{ vk com } == URI(link).host.split(?.)

  # Generic path: probe the URL directly; if it is not an image, scrape the page.
  begin
    f = FastImage.new link, raise_on_failure: true, timeout: timeout,
      **(proxy ? {proxy: "http://#{proxy}"} : {}),
      http_header: {"User-Agent" => "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"}
  rescue FastImage::UnknownImageType
    raise if giveup
    require "nokogiri"
    head = NetHTTPUtils.request_data link, :HEAD,
      header: {"User-Agent" => "Mozilla"},
      max_start_http_retry_delay: timeout,
      timeout: timeout,              # NetHTTPUtils passes this as read_timeout to Net::HTTP.start
      max_read_retry_delay: timeout  # and then compares accumulated delay to this
    # if we use :get here we will download megabytes of files just to giveup on content_type we can't process
    case head.instance_variable_get(:@last_response).content_type  # webmock should provide this
    when "text/html" ; nil
    else             ; raise
    end
    html = Nokogiri::HTML NetHTTPUtils.request_data link, :GET, header: {"User-Agent" => "Mozilla"}
    if t = html.at_css("meta[@property='og:image']")
      begin
        return DirectLink URI.join(link, t[:content]).to_s, nil, *proxy, giveup: true
      rescue URI::InvalidURIError
      end
    end unless ignore_meta
    # NOTE(review): the scraped text read `end unless h = {}`, which would have
    # disabled the og:image branch above and left `ignore_meta` unused; restored
    # to `end unless ignore_meta` with `h` initialized as a separate statement.
    h = {}  # TODO: maybe move it outside because of possible img[:src] recursion?...
    # Depth-first scan of every <img>, keyed by its element path; `h` memoizes
    # per-src resolutions so duplicate images are fetched only once.
    l = lambda do |node, s = []|
      node.element_children.flat_map do |child|
        next l[child, s + [child.node_name]] unless "img" == child.node_name
        begin
          [[s, (h[child[:src]] = h[child[:src]] || DirectLink(URI.join(link, child[:src]).to_s, nil, giveup: true))]]  # ... or wait, do we giveup?
        rescue => e
          DirectLink.logger.error "#{e} (from no giveup)"
          []
        end
      end
    end
    # Pick the group of images (same element path) with the largest total area.
    l[html].
      tap{ |results| raise if results.empty? }.
      group_by(&:first).map{ |k, v| [k.join(?>), v.map(&:last)] }.
      max_by{ |_, v| v.map{ |i| i.width * i.height }.inject(:+) }.last
  else  # TODO: maybe move this to right before `rescue` line
    w, h = f.size
    struct.new f.instance_variable_get(:@parsed_uri).to_s, w, h, f.type
  end
end