Class: Zorki::Scraper

Inherits:
Object
Includes:
Capybara::DSL
Defined in:
lib/zorki/scrapers/scraper.rb

Overview

Base class for zorki scrapers (see PostScraper and UserScraper below); wraps a Capybara-driven Selenium browser session used to pull data out of Instagram pages.

Direct Known Subclasses

PostScraper, UserScraper

Constant Summary

@@logger = Logger.new(STDOUT)
@@session_id = nil

Instance Method Summary

#get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil) ⇒ Object

Constructor Details

#initialize ⇒ Scraper

Returns a new instance of Scraper.



# File 'lib/zorki/scrapers/scraper.rb', line 44

def initialize
  Capybara.default_driver = :selenium_zorki
end
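The :selenium_zorki driver is registered elsewhere in the gem. As a rough sketch, a custom Capybara driver like this is typically registered via Capybara.register_driver; the Chrome options below are illustrative assumptions, not the gem's actual configuration:

require "capybara"
require "selenium-webdriver"

# Hypothetical registration of :selenium_zorki -- the real gem supplies its
# own browser options (headless mode, user agent, and so on).
Capybara.register_driver :selenium_zorki do |app|
  options = Selenium::WebDriver::Chrome::Options.new
  options.add_argument("--disable-blink-features=AutomationControlled")
  Capybara::Selenium::Driver.new(app, browser: :chrome, options: options)
end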

Instance Method Details

#get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil) ⇒ Object

Instagram uses GraphQL (like most of Facebook, I think) and returns an object that is used to seed the page. We can just parse this for most things.

additional_search_parameters is a comma-separated list of keys, for example: `data,xdt_api_v1_media__shortcode__web_info,items`
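As an illustrative usage sketch (the URL, the request substring, and the key path below are assumptions for demonstration, not values the gem documents):

scraper = Zorki::Scraper.new
data = scraper.get_content_of_subpage_from_url(
  "https://www.instagram.com/p/SHORTCODE/",           # page to load
  "graphql/query",                                    # substring matched against intercepted request URLs
  "data,xdt_api_v1_media__shortcode__web_info,items"  # nested keys the response body must contain
)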



# File 'lib/zorki/scrapers/scraper.rb', line 55

def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil)
  # So this is fun:
  # For pages marked as misinformation we have to use one method (interception of requests) and
  # for pages that are not, we can just pull the data straight from the page.
  #
  # How do we figure out which is which?... for now we'll just run through both and see where we
  # go with it.

  # Our user data no longer lives in the graphql object passed initially with the page.
  # Instead it comes in as part of a subsequent call. This intercepts all calls, checks if it's
  # the one we want, and then moves on.
  response_body = nil

  page.driver.browser.intercept do |request, &continue|
    # This passes the request forward unmodified, since we only care about the response
    # puts "checking request: #{request.url}"
    continue.call(request) && next unless request.url.include?(subpage_search)

    continue.call(request) do |response|
      # Skip empty bodies (e.g. CORS preflight responses); process everything else
      if response.body && !response.body.empty?
        check_passed = true
        unless additional_search_parameters.nil?
          body_to_check = Oj.load(response.body)

          search_parameters = additional_search_parameters.split(",")
          search_parameters.each do |key|
            break if body_to_check.nil?

            check_passed = false unless body_to_check.has_key?(key)
            body_to_check = body_to_check[key]
          end
        end

        response_body = response.body if check_passed
      end
    end
  rescue Selenium::WebDriver::Error::WebDriverError
    # Eat them
  end

  # Now that the intercept is set up, we visit the page we want
  page.driver.browser.navigate.to(url)
  # We wait until the correct intercept is processed or we've waited 60 seconds
  start_time = Time.now
  # puts "Waiting.... #{url}"

  sleep(rand(1...10))
  while response_body.nil? && (Time.now - start_time) < 60
    sleep(0.1)
  end

  page.driver.execute_script("window.stop();")

  # If this is a page that has not been marked as misinfo we can just pull the data
  # TODO: put this before the whole load loop
  if response_body.nil?
    doc = Nokogiri::HTML(page.driver.browser.page_source)
    # elements = doc.search("script").find_all do |e|
    #   e.attributes.has_key?("type") && e.attributes["type"].value == "application/ld+json"
    # end

    elements = doc.search("script").filter_map do |element|
      parsed_element_json = nil
      begin
        element_json = JSON.parse(element.text)

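        # Walk Facebook's deeply nested bootstrap JSON down to the shortcode web-info payload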
        parsed_element_json = element_json["require"].first.last.first["__bbox"]["require"].first.last.last["__bbox"]["result"]["data"]["xdt_api__v1__media__shortcode__web_info"]
      rescue StandardError
        next
      end

      parsed_element_json
    end

    if elements.empty?
      raise ContentUnavailableError.new("Cannot find anything", additional_data: { page_source: page.driver.browser.page_source, elements: elements })
    end

    return elements
  end

  raise ContentUnavailableError.new("Response body nil") if response_body.nil?
  Oj.load(response_body)
ensure
  page.quit
  # TRY THIS TO MAKE SURE CHROME GETS CLOSED?
  # We may also want to not do this and make sure the same browser is reused instead for cookie purposes
end
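The comma-separated key walk inside the intercept is effectively an "every key in the path exists" check on nested hashes. A standalone sketch of the same idea (the helper name and sample JSON are hypothetical):

require "oj"

# Returns true when every key in the comma-separated path is present,
# descending one level per key -- mirroring the check in the intercept above.
def nested_keys_present?(json_string, comma_separated_keys)
  body = Oj.load(json_string)
  comma_separated_keys.split(",").all? do |key|
    next false unless body.is_a?(Hash) && body.key?(key)
    body = body[key]
    true
  end
end

nested_keys_present?('{"data":{"items":[1]}}', "data,items") # => true
nested_keys_present?('{"data":{}}', "data,items")            # => false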