Module: DirectLink

Defined in: lib/directlink.rb

Defined Under Namespace

Classes: ErrorAssert, ErrorBadLink, ErrorMissingEnvVar, ErrorNotFound

Constant Summary collapse

# Exception classes collected under one name: socket/connection/timeout
# errors, NetHTTPUtils and FastImage failures, and DirectLink's own
# ErrorNotFound / ErrorBadLink.
# NOTE(review): presumably these are the failures callers treat as expected
# (rescue-and-continue) -- confirm against the code that uses this constant.
NORMAL_EXCEPTIONS =

[
  SocketError,
  Net::OpenTimeout,
  Errno::ECONNRESET,
  Errno::ECONNREFUSED,
  Errno::ETIMEDOUT,   # from FastImage
  NetHTTPUtils::Error,
  NetHTTPUtils::EOFError_from_rbuf_fill,
  FastImage::UnknownImageType,
  FastImage::ImageFetchFailure,
  DirectLink::ErrorNotFound,
  DirectLink::ErrorBadLink,
]

Class Attribute Summary collapse

.logger ⇒ Object

Returns the value of attribute logger.
.reddit_bot ⇒ Object

Returns the value of attribute reddit_bot.
.silent ⇒ Object

Returns the value of attribute silent.
.timeout ⇒ Object

Returns the value of attribute timeout.

Class Method Summary collapse

._500px(link) ⇒ Object
.flickr(link) ⇒ Object
.google(src, width = 0) ⇒ Object
.imgur(link, timeout = 1000) ⇒ Object

TODO: make the timeout handling consistent with the way the DirectLink method handles timeouts.
.reddit(link, timeout = 1000) ⇒ Object
.vk(link) ⇒ Object
.wiki(link) ⇒ Object

Class Attribute Details

.logger ⇒ `Object`

Returns the value of attribute logger.



5
6
7

# File 'lib/directlink.rb', line 5

# Attribute reader: returns the object currently stored in @logger.
def logger
  @logger
end

.reddit_bot ⇒ `Object`

Returns the value of attribute reddit_bot.



211
212
213

# File 'lib/directlink.rb', line 211

# Attribute reader: returns the object currently stored in @reddit_bot.
def reddit_bot
  @reddit_bot
end

.silent ⇒ `Object`

Returns the value of attribute silent.



4
5
6

# File 'lib/directlink.rb', line 4

# Attribute reader: returns the value currently stored in @silent.
def silent
  @silent
end

.timeout ⇒ `Object`

Returns the value of attribute timeout.



6
7
8

# File 'lib/directlink.rb', line 6

# Attribute reader: returns the value currently stored in @timeout.
def timeout
  @timeout
end

Class Method Details

._500px(link) ⇒ `Object`

# File 'lib/directlink.rb', line 160

# Resolves a 500px photo page to its image using the 500px photos API.
#
# @param link [String] photo page URL shaped like
#   https://500px.com/photo/<id>/<slug> (a trailing slash is accepted)
# @return [Array] [width, height, url, format] as reported by the API
# @raise [ErrorBadLink] when +link+ does not have the expected shape
def self._500px(link)
  page = %r{\Ahttps://500px\.com/photo/(?<id>[^/]+)/[-[a-zA-Z0-9]%]+\/?\z}.match(link)
  raise ErrorBadLink.new(link) if page.nil?

  # NOTE(review): nothing in this method references Nokogiri; the require is
  # kept only so that load-time behaviour stays exactly as it was.
  require "nokogiri"

  photo_id = page[:id]
  # Posts +form+ to the photos endpoint and returns the first photo record.
  photo_for = lambda do |form|
    body = NetHTTPUtils.request_data("https://api.500px.com/v1/photos", form: form)
    JSON.load(body).fetch("photos").values.first
  end

  # The first request exists only to learn the real resolution; without it
  # the "url" returned by the second request would be wrong.
  width, height = photo_for.call({ "ids" => photo_id }).values_at("width", "height")
  sized = photo_for.call({ "ids" => photo_id, "image_size[]" => width })
  url, image_format = sized.fetch("images").first.values_at("url", "format")

  [width, height, url, image_format]
end

.flickr(link) ⇒ `Object`

# File 'lib/directlink.rb', line 172

# Finds the largest available rendition of a Flickr photo.
#
# @param link [String] a https://www.flickr.com/photos/<user>/<id>... page
#   URL or a https://flic.kr/p/<id> short URL
# @return [Array] [width, height, source_url] of the size entry with the
#   biggest pixel area (width and height converted with #to_i)
# @raise [ErrorBadLink] when +link+ matches neither URL shape
# @raise [ErrorMissingEnvVar] when FLICKR_API_KEY is not set
# @raise [ErrorNotFound] when the API answers "Photo not found"
# @raise [ErrorAssert] when the API answers with any other non-"ok" stat
def self.flickr(link)
  matched = %r{\Ahttps://www\.flickr\.com/photos/[^/]+/(?<id>[^/]+)}.match(link) ||
            %r{\Ahttps://flic\.kr/p/(?<id>[^/]+)\z}.match(link)
  raise ErrorBadLink.new(link) unless matched

  api_key = ENV["FLICKR_API_KEY"]
  raise ErrorMissingEnvVar.new("define FLICKR_API_KEY env var") unless api_key

  # Calls flickr.photos.<api_method> for the given photo and parses the JSON.
  call_api = lambda do |photo_id, api_method|
    body = NetHTTPUtils.request_data("https://api.flickr.com/services/rest/", form: {
      api_key: api_key,
      format: "json",
      nojsoncallback: 1,
      photo_id: photo_id,
      method: "flickr.photos.#{api_method}",
    })
    JSON.load(body)
  end

  json = call_api.call(matched[:id], "getSizes")
  not_found = { "stat" => "fail", "code" => 1, "message" => "Photo not found" }
  raise ErrorNotFound.new(link.inspect) if json == not_found
  raise ErrorAssert.new("unhandled API response stat for #{link}: #{json}") unless json["stat"] == "ok"

  renditions = json["sizes"]["size"].map do |size|
    [size["width"].to_i, size["height"].to_i, size["source"]]
  end
  renditions.max_by { |width, height, _source| width * height }
end

.google(src, width = 0) ⇒ `Object`

# File 'lib/directlink.rb', line 54

# Rewrites a recognised Google-hosted image URL (googleusercontent.com or
# bp.blogspot.com) so that its size suffix becomes "s<width>".
#
# src   - the image URL; several patterns accept a scheme-less "//host/..."
#         form, in which case "https:" is prepended to the result.
# width - value interpolated after "s" in the result (default 0; the inline
#         comment below refers to "s0" as the high-res form).
#
# Returns the rewritten URL String; a URL that is already in the ".../s0/<name>"
# form is returned unchanged.
# Raises ErrorBadLink when src matches none of the known patterns.
#
# The patterns are tried top to bottom and $1 always holds the URL prefix that
# is kept, so both the order and the capture groups matter.
def self.google src, width = 0
  # this can handle links without schema because it's used for parsing community HTML pages
  case src
  # Google Plus post image
  when /\A(https:\/\/lh3\.googleusercontent\.com\/-[a-zA-Z0-9_-]{11}\/[WX][a-zA-Z0-9_-]{9}I\/AAAAAAA[a-zA-Z0-9_-]{4}\/[a-zA-Z0-9_-]{33}(?:[gwAQ]?CJoC|CL0B(?:GAs)?)\/)w[1-7]\d\d(?:-d)?-h[1-9]\d\d\d?-n(?:-k-no|-rw|)\/[^\/]+\z/
    "#{$1}s#{width}/"
  # scheme-less proxy URL whose size spec follows "="
  when /\A(\/\/lh3\.googleusercontent\.com\/proxy\/[a-zA-Z0-9_-]{66,523}=)(?:w(?:[45]\d\d)-h\d\d\d-[np]|s530-p|s110-p-k)\z/
    "https:#{$1}s#{width}/"
  # scheme-less URL with a fixed token suffix and a w530-h398-p size spec
  when /\A(\/\/lh3\.googleusercontent\.com\/[a-zA-Z0-9]{24}_[a-zA-Z]{30}7zGIDTJfkc1YZFX2MhgKnjA=)w530-h398-p\z/
    "https:#{$1}s#{width}/"
  # scheme-less googleusercontent and (optionally schemed) blogspot URLs;
  # the captured prefix starts at "//", so "https:" is always prepended
  when /\A(\/\/lh3\.googleusercontent\.com\/-[a-zA-Z0-9-]{11}\/[VW][a-zA-Z0-9_-]{9}I\/AAAAAAA[AC][a-zA-Z0-9]{3}\/[a-zA-Z0-9_-]{32}[gwAQ]CJoC\/)w530-h[23]\d\d-p\/[^\/]+\z/,
       /\A(?:https?:)?(\/\/[1-4]\.bp\.blogspot\.com\/-[a-zA-Z0-9_-]{11}\/[UVWX][a-zA-Z0-9_-]{9}I\/AAAAAAAA[A-Z][a-zA-Z0-9_-]{2}\/[a-zA-Z0-9_-]{33}C(?:EwYBhgL|(?:Lc|Kg)BGAs(?:YHQ)?)\/)(?:s640|w\d{2,4}-h\d\d\d?-p(?:-k-no-nu)?)\/[^\/]+\z/,
       /\A(?:https?:)?(\/\/[1-4]\.bp\.blogspot\.com\/-[a-zA-Z0-9-]{11}\/[UV][a-zA-Z0-9_-]{9}I\/AAAAAAAA[A-Z][a-zA-Z0-9]{2}\/[a-zA-Z0-9-]{11}\/)w72-h72-p-k-no-nu\/[^\/]+\z/
    "https:#{$1}s#{width}/"
  # further https googleusercontent variants with a w530-...-n size spec
  when /\A(https:\/\/lh3\.googleusercontent\.com\/-[a-zA-Z0-9_]{11}\/AAAAAAAAAAI\/AAAAAAAAAAQ\/[a-zA-Z0-9_]{11}\/)w530-h[13]\d\d-n\/[^\/]+\z/,
       /\A(https:\/\/lh3\.googleusercontent\.com\/-[a-zA-Z0-9_]{11}\/V[a-zA-Z0-9-]{9}I\/AAAAAAAA[ML][c-q4][so0]\/[a-zA-Z0-9_]{11}\/)w530(?:-d)?-h3\d\d-n\/[^\/]+\z/
    "#{$1}s#{width}/"
  # high res (s0) Google Plus post image
  when /\Ahttps:\/\/lh3\.googleusercontent\.com\/-[a-zA-Z0-9_-]{11}\/[a-zA-Z0-9_-]{10}I\/AAAAAAA[a-zA-Z0-9_-]{4}\/[a-zA-Z0-9_-]{33}CJoC\/s0\/[^\/]+\z/
    # already in s0 form: returned as is, width is ignored here
    src
  # Google Plus userpic
  when /\A(https:\/\/lh3\.googleusercontent\.com\/-[a-zA-Z0-9-]{11}\/AAAAAAAAAAI\/AAAAAAAA[a-zA-Z0-9]{3}\/[a-zA-Z0-9_-]{11}\/)s\d\d-p(?:-k)?-rw-no\/photo\.jpg\z/
    "#{$1}s#{width}/"
  # Hangout userpic
  when /\A(https:\/\/lh[356]\.googleusercontent\.com\/-[a-zA-Z0-9]{11}\/AAAAAAAAAAI\/AAAAAAAA[a-zA-Z0-9]{3}\/[a-zA-Z0-9-]{11}\/)s\d\d-c-k-no\/photo\.jpg\z/,
       /\A(https:\/\/lh[356]\.googleusercontent\.com\/-[a-zA-Z0-9]{11}\/AAAAAAAAAAI\/AAAAAAAAAAA\/[a-zA-Z0-9]{11}\/)s64-c-k\/photo\.jpg\z/,
       /\A(https:\/\/lh[356]\.googleusercontent\.com\/-[a-zA-Z0-9]{11}\/AAAAAAAAAAI\/AAAAAAAAAAA\/[a-zA-Z0-9_]{34}\/)s(?:46|64)-c(?:-k(?:-no)?)?-mo\/photo\.jpg\z/
    "#{$1}s#{width}/"
  # Google Keep
  when /\A(https:\/\/lh\d\.googleusercontent\.com\/[a-zA-Z0-9_-]{104,106}=s)\d\d\d\d?\z/
    # the captured prefix already ends with "=s", and no trailing slash is added
    "#{$1}#{width}"
  # opensea
  when /\A(https:\/\/lh3\.googleusercontent\.com\/[a-zA-Z0-9]{78}-nGx_jf_XGqqiVANe_Jr8u2g=)w1400-k\z/
    # no trailing slash is added for this form either
    "#{$1}s#{width}"
  # mp4
  when /\A(https:\/\/lh3\.googleusercontent\.com\/-[a-zA-Z]{11}\/W[a-zA-Z0-9]{9}I\/AAAAAAAAODw\/[a-zA-Z0-9]{32}QCJoC\/)w530-h883-n-k-no\/[^\/]+\.mp4\z/
    "#{$1}s#{width}/"
  # something else
  when /\A(https:\/\/lh3\.googleusercontent\.com\/-[a-zA-Z0-9_]{11}\/X-[a-zA-Z0-9]{8}I\/AAAAAAAAALE\/[a-zA-Z0-9]{23}_[a-zA-Z0-9]{19}\/)w1200-h630-p-k-no-nu\/[\d-]+\.png\z/
    "#{$1}s#{width}/"
  else
    raise ErrorBadLink.new src
  end
end

.imgur(link, timeout = 1000) ⇒ `Object`

TODO: make the timeout handling consistent with the way the DirectLink method handles timeouts.

# File 'lib/directlink.rb', line 103

# Resolves an Imgur link (album, gallery or single image/video) through the
# Imgur v3 API.
#
# @param link [String] an imgur.com URL in one of the shapes matched below
# @param timeout [Numeric] upper bound for the retry delay: a failed request
#   is retried (delay doubling from 1 second) only while the current delay is
#   below this value and the HTTP code is one of 400, 500, 502, 503
# @return [Array<Array>] one [link, width, height, type] entry per image
# @raise [ErrorMissingEnvVar] when IMGUR_CLIENT_ID is not set
# @raise [ErrorBadLink] when +link+ matches none of the known URL shapes
# @raise [ErrorNotFound] on HTTP 404 or when an album has no images
# @raise [ErrorAssert] on any other unexpected HTTP error or response shape
#
# TODO make the timeout handling respect the way the DirectLink method works with timeouts
def self.imgur(link, timeout = 1000)
  raise ErrorMissingEnvVar.new("define IMGUR_CLIENT_ID env var") unless ENV["IMGUR_CLIENT_ID"]

  media_types = %w{ image/jpeg image/png image/gif video/mp4 }
  retriable_codes = [400, 500, 502, 503]

  # GETs +url+ with the Client-ID header, retrying with exponential backoff.
  api_get = lambda do |url|
    delay = 1
    begin
      NetHTTPUtils.request_data url, header: { Authorization: "Client-ID #{ENV["IMGUR_CLIENT_ID"]}" }
    rescue NetHTTPUtils::Error => error
      code = error.code
      raise ErrorNotFound.new(url.inspect) if code == 404
      unless delay < timeout && retriable_codes.include?(code)
        raise ErrorAssert.new("unexpected http error #{code} for #{url}")
      end
      logger.error "retrying in #{delay} seconds because of Imgur HTTP ERROR #{code}"
      sleep delay
      delay *= 2
      retry
    end
  end

  images =
    case link
    when /\Ahttps?:\/\/(?:(?:i|m|www)\.)?imgur\.com\/(a|gallery)\/([a-zA-Z0-9]{5}(?:[a-zA-Z0-9]{2})?)\z/,
         /\Ahttps?:\/\/imgur\.com\/(gallery)\/([a-zA-Z0-9]{5}(?:[a-zA-Z0-9]{2})?)\/new\z/
      kind, collection_id = $1, $2
      endpoint = kind == "gallery" ? "gallery" : "album"
      json = api_get.call("https://api.imgur.com/3/#{endpoint}/#{collection_id}/0.json")
      data = JSON.load(json)["data"]
      raise ErrorAssert.new("unexpected error #{data.inspect} for #{link}") if data["error"]
      if data["images"]
        raise ErrorNotFound.new(link.inspect) if data["images"].empty?
        data["images"]
      elsif data["type"] && media_types.include?(data["type"])
        # TODO check if this branch is possible at all
        [data]
      else
        # A data["comment"] payload was once considered here but is not handled.
        # one day single-video item should hit this but somehow it didn't yet
        raise ErrorAssert.new("unknown data format #{json} for #{link}")
      end
    when /\Ahttps?:\/\/(?:(?:i|m|www)\.)?imgur\.com\/([a-zA-Z0-9]{7,8})(?:\.(?:gifv|jpe?g(?:\?fb)?|png))?\z/,
         /\Ahttps?:\/\/(?:(?:i|m|www)\.)?imgur\.com\/([a-zA-Z0-9]{5})\.mp4\z/,
         /\Ahttps?:\/\/imgur\.com\/([a-zA-Z0-9]{5}(?:[a-zA-Z0-9]{2})?)\z/,
         /\Ahttps?:\/\/imgur\.com\/([a-zA-Z0-9]{7})(?:\?\S+)?\z/,
         /\Ahttps?:\/\/imgur\.com\/r\/[0-9_a-z]+\/([a-zA-Z0-9]{7})\z/,
         /\Ahttps?:\/\/api\.imgur\.com\/3\/image\/([a-zA-Z0-9]{7})\/0\.json\z/
      image_id = $1
      [JSON.load(api_get.call("https://api.imgur.com/3/image/#{image_id}/0.json"))["data"]]
    else
      raise ErrorBadLink.new(link)
    end

  images.map do |image|
    unless media_types.include?(image["type"])
      raise ErrorAssert.new("unknown type of #{link}: #{image}")
    end
    image.values_at("link", "width", "height", "type")
  end
end

.reddit(link, timeout = 1000) ⇒ `Object`

# File 'lib/directlink.rb', line 213

# Resolves a Reddit link to what the post points at.
#
# @param link [String] an i.redd.it image URL, a /gallery/<id> URL, or any
#   Reddit URL whose path carries the post id
# @param timeout [Numeric] once the retry delay (doubling from 1 second)
#   exceeds this value, a JSON parse failure is reported as ErrorBadLink
# @return [Array] [true, url_or_gallery_items] for link/media posts, where a
#   gallery is an Array of [mime, width, height, url]; [false, selftext] for
#   self posts
# @raise [DirectLink::ErrorBadLink] when no post id can be extracted
# @raise [ErrorAssert] when the API response has an unexpected shape
def self.reddit(link, timeout = 1000)
  uri = URI(link)
  host = uri.host
  # Direct i.redd.it image links are returned untouched.
  if host && host.split(".") == %w{ i redd it } && uri.path[/\A\/[a-z0-9]{12,13}\.(gif|jpg)\z/]
    return [true, link]
  end

  post_id = link[/\Ahttps:\/\/www\.reddit\.com\/gallery\/([0-9a-z]{5,6})\z/, 1]
  post_id ||= uri.path[/\A(?:\/r\/[0-9a-zA-Z_]+)?(?:\/comments|\/duplicates)?\/([0-9a-z]{5,6})(?:\/|\z)/, 1]
  raise DirectLink::ErrorBadLink.new(link) unless post_id

  # Runs the given block, retrying with exponential backoff while the
  # response fails to parse as JSON.
  with_json_retry = lambda do |&request|
    delay = 1
    begin
      request.call
    rescue JSON::ParserError => error
      raise ErrorBadLink.new(link) if delay > timeout
      logger.error "#{error.message[0, 500].gsub(/\s+/, " ")}, retrying in #{delay} seconds"
      sleep delay
      delay *= 2
      retry
    end
  end

  post =
    if ENV["REDDIT_SECRETS"]
      require "reddit_bot"
      RedditBot.logger.level = Logger::ERROR
      require "yaml"
      self.reddit_bot ||= RedditBot::Bot.new(YAML.load_file(ENV["REDDIT_SECRETS"]))
      with_json_retry.call { self.reddit_bot.json(:get, "/by_id/t3_#{post_id}") }
    else
      # Raised and immediately swallowed: this line has no runtime effect and
      # only records the recommendation in the source.
      raise ErrorMissingEnvVar.new "defining REDDIT_SECRETS env var is highly recommended" rescue nil
      listings = with_json_retry.call do
        JSON.load(NetHTTPUtils.request_data("https://www.reddit.com/#{post_id}.json", header: { "User-Agent" => "Mozilla" }))
      end
      raise ErrorAssert.new("our knowledge about Reddit API seems to be outdated") unless listings.size == 2
      listings.find { |listing| listing["data"]["children"].first["kind"] == "t3" }
    end

  # TODO: do we handle linking Imgur albums?
  data = post["data"]["children"].first["data"]

  if (media = data["media"])
    video = media["reddit_video"]
    return [true, video["fallback_url"]] if video
    recognised = media.keys.sort == %w{ oembed type } &&
                 %w{ youtube.com gfycat.com imgur.com }.include?(media["type"])
    raise ErrorAssert.new("our knowledge about Reddit API seems to be outdated") unless recognised
    return [true, media["oembed"]["thumbnail_url"]]
  end

  if (gallery = data["media_metadata"])
    items = []
    gallery.each_value do |item|
      # Entries that failed or are still unprocessed are skipped.
      next if item == { "status" => "failed" } || item == { "status" => "unprocessed" }
      unless item["status"] == "valid"
        raise ErrorAssert.new("our knowledge about Reddit API seems to be outdated (media == #{item.inspect})")
      end
      mime = item["m"]
      source = item["s"]
      url_key = mime == "image/gif" ? "gif" : "u"
      items << [mime, *source.values_at("x", "y"), CGI.unescapeHTML(source[url_key])]
    end
    return [true, items]
  end

  if data["crosspost_parent"]
    target = data["url"]
    # Relative comment permalinks get the site prefix; anything else is kept.
    prefix = (/\A\/r\/[0-9a-zA-Z_]+\/comments\/[0-9a-z]{5,6}\// =~ target) ? "https://www.reddit.com" : ""
    return [true, "#{prefix}#{target}"]
  end

  return [true, CGI.unescapeHTML(data["url"])] unless data["is_self"]

  if data["url"] != "https://www.reddit.com" + data["permalink"]
    raise ErrorAssert.new("our knowledge about Reddit API seems to be outdated")
  end
  [false, data["selftext"]]
end

.vk(link) ⇒ `Object`

# File 'lib/directlink.rb', line 264

# Resolves a VK photo or wall-post link to the largest rendition of each
# photo, using the VK API (photos.getById / wall.getById, v5.101).
#
# @param link [String] a vk.com photo or wall URL in one of the shapes below
# @return [Array<Array>] one [width, height, url] per photo
# @raise [ErrorBadLink] when +link+ matches none of the known URL shapes
# @raise [ErrorMissingEnvVar] unless VK_ACCESS_TOKEN and VK_CLIENT_SECRET are set
# @raise [ErrorAssert] when the API response has an unexpected shape
def self.vk(link)
  # Photo links must resolve to exactly one photo record.
  single_photo = lambda do |records|
    raise ErrorAssert.new("our knowledge about VK API seems to be outdated") unless 1 == records.size
    records
  end

  # Wall links: keep the photo attachments of the first post, tolerate audio
  # attachments, and fail on anything else.
  wall_photos = lambda do |records|
    attachments = records.first.fetch("attachments")
    photo_items = attachments.select do |item|
      keys = item.keys
      if keys == %w{ type photo }
        raise ErrorAssert.new("our knowledge about VK API seems to be outdated") unless item["type"] == "photo"
        true
      elsif keys == %w{ type audio }
        raise ErrorAssert.new("our knowledge about VK API seems to be outdated") unless item["type"] == "audio"
        false
      else
        raise ErrorAssert.new("our knowledge about VK API seems to be outdated")
      end
    end
    photo_items.map { |item| item.fetch("photo") }
  end

  # In every photo pattern the id is capture group 2: where the pattern has no
  # earlier named group, the empty (?<_>) group pads the numbering.
  id, mtd, field, f = case link
  when %r{\Ahttps://vk\.com/id(?<user_id>\d+)\?z=photo(?<id>\k<user_id>_\d+)(%2F(album\k<user_id>_0|photos\k<user_id>))?\z},
       %r{\Ahttps://vk\.com/[a-z_]+\?z=photo(?<_>)(?<id>(?<user_id>\d+)_\d+)%2Fphotos\k<user_id>\z},
       %r{\Ahttps://vk\.com/photo(?<_>)(?<id>-?\d+_\d+)(\?(all|rev)=1)?\z},
       %r{\Ahttps://vk\.com/feed\?(?:section=likes&)?z=photo(?<_>)(?<id>(?<user_id>-?\d+)_\d+)%2F(liked\d+|album\k<user_id>_0(0%2Frev)?)\z},
       %r{\Ahttps://vk\.com/[a-z_]+\?z=photo(?<_>)(?<id>(?<user_id>-\d+)_\d+)%2F(wall\k<user_id>_\d+|album\k<user_id>_0)\z},
       %r{\Ahttps://vk\.com/wall(?<user_id>-\d+)_\d+\?z=photo(?<id>\k<user_id>_\d+)%2F(wall\k<user_id>_\d+|album\k<user_id>_00%2Frev|\d+)\z}
    [$2, :photos, :photos, single_photo]
  when %r{\Ahttps://vk\.com/wall(?<id>-?\d+_\d+)\z},
       %r{\Ahttps://vk\.com/[a-z\.]+\?w=wall(?<id>-?\d+_\d+)\z}
    [$1, :wall, :posts, wall_photos]
  else
    raise ErrorBadLink.new(link)
  end

  unless ENV["VK_ACCESS_TOKEN"] && ENV["VK_CLIENT_SECRET"]
    raise ErrorMissingEnvVar.new("define VK_ACCESS_TOKEN and VK_CLIENT_SECRET env vars")
  end
  sleep 0.25 unless ENV["CI"] # "error_msg"=>"Too many requests per second"

  body = NetHTTPUtils.request_data(
    "https://api.vk.com/method/#{mtd}.getById",
    :POST,
    form: { field => id, :access_token => ENV["VK_ACCESS_TOKEN"], :client_secret => ENV["VK_CLIENT_SECRET"], :v => "5.101" }
  )
  photos = f.call(JSON.load(body).fetch("response"))

  photos.map do |photo|
    renditions = photo.fetch("sizes").map do |size|
      dims = size.values_at("width", "height", "url")
      # Some entries report a zero dimension; in that case the real size is
      # probed from the image itself.
      dims[0, 2] = FastImage.new(dims.last, raise_on_failure: true).size if dims.first(2).include?(0)
      dims
    end
    renditions.max_by { |width, height, _url| width * height }
  end
end

.wiki(link) ⇒ `Object`

# File 'lib/directlink.rb', line 196

# Resolves a Wikipedia / Wikimedia Commons "File:" page to the URL of the
# underlying file, using the MediaWiki API (action=query, prop=imageinfo).
#
# @param link [String] a URL on <xx>.wikipedia.org or commons.wikimedia.org
#   whose path is /wiki/.../File:<name>
# @return [String] the "url" of the first imageinfo entry
# @raise [ErrorBadLink] when +link+ is not such a URL
# @raise [ErrorAssert] when the API response carries no imageinfo
def self.wiki(link)
  # The dot in "commons\.wikimedia" must be escaped: the captured hostname is
  # used to build the API request below, so a lookalike host such as
  # "commons-wikimedia.org" must not pass this check.
  unless %r{\Ahttps?://(?<hostname>([a-z]{2}\.wikipedia|commons\.wikimedia)\.org)/wiki(/[^/]+)*/(?<id>File:.+)} =~ link
    raise ErrorBadLink.new(link)
  end

  json = NetHTTPUtils.request_data("https://#{hostname}/w/api.php", form: {
    format: "json",
    action: "query",
    prop: "imageinfo",
    iiprop: "url",
    titles: CGI.unescape(id), # the API is given the decoded page title
  })
  imageinfo = JSON.load(json)["query"]["pages"].values.first["imageinfo"]
  raise ErrorAssert.new("unexpected format of API response about #{link}: #{json}") unless imageinfo
  imageinfo.first["url"]
end

Module: DirectLink

Defined Under Namespace

Constant Summary collapse

Class Attribute Summary collapse

Class Method Summary collapse

Class Attribute Details

.logger ⇒ Object

.reddit_bot ⇒ Object

.silent ⇒ Object

.timeout ⇒ Object

Class Method Details

._500px(link) ⇒ Object

.flickr(link) ⇒ Object

.google(src, width = 0) ⇒ Object

.imgur(link, timeout = 1000) ⇒ Object

.reddit(link, timeout = 1000) ⇒ Object

.vk(link) ⇒ Object

.wiki(link) ⇒ Object