Class: Zorki::Scraper

Inherits:
Object
  • Object
show all
Includes:
Capybara::DSL
Defined in:
lib/zorki/scrapers/scraper.rb

Overview

rubocop:disable Metrics/ClassLength

Direct Known Subclasses

PostScraper, UserScraper

Constant Summary collapse

@@logger =
Logger.new(STDOUT)
@@session_id =
nil

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Scraper

Returns a new instance of Scraper.



44
45
46
# File 'lib/zorki/scrapers/scraper.rb', line 44

# Returns a new instance of Scraper.
#
# All scrapers drive the browser through the custom :selenium_zorki
# Capybara driver (registered elsewhere in the gem), so every subclass
# shares the same browser configuration.
def initialize
  Capybara.default_driver = :selenium_zorki
end

Instance Method Details

#get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil, post_data_include: nil) ⇒ Object

Instagram uses GraphQL (like most of Facebook I think), and returns an object that actually is used to seed the page. We can just parse this for most things.

additional_search_params is a comma-separated list of keys, for example: `data,xdt_api_v1_media__shortcode__web_info,items`

NOTE: `post_data_include`, if not nil, overrules the additional_search_parameters. This is so that I didn't have to refactor the entire code base when I added it. Eventually it might be better to look at the post request and see if we can do the same type of search there as we use for users, which would simplify this whole thing a lot.



60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
# File 'lib/zorki/scrapers/scraper.rb', line 60

# Navigates to +url+ while intercepting network traffic, and returns the parsed
# JSON body of the first response whose request URL includes +subpage_search+
# (and, when given, whose POST data includes +post_data_include+).
#
# Instagram uses GraphQL (like most of Facebook I think), and returns an object
# that is used to seed the page, so when no matching response arrives within
# 60 seconds we fall back to parsing the script tags in the page source.
#
# @param url [String] the page to visit
# @param subpage_search [String] substring identifying the request we want to capture
# @param additional_search_parameters [String, nil] comma-separated key path, e.g.
#   `data,xdt_api_v1_media__shortcode__web_info,items`, that must exist in the
#   captured JSON for it to be accepted. Ignored when +post_data_include+ is set.
# @param post_data_include [String, nil] when non-nil, overrules
#   +additional_search_parameters+: the intercepted request's POST data must
#   include this string.
# @return [Hash, Array] the parsed JSON payload, or (on the fallback path) the
#   `xdt_api__v1__media__shortcode__web_info` objects extracted from the page source
# @raise [ContentUnavailableError] when nothing usable can be found
def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil, post_data_include: nil)
  # So this is fun:
  # For pages marked as misinformation we have to use one method (interception of requests) and
  # for pages that are not, we can just pull the data straight from the page.
  #
  # How do we figure out which is which?... for now we'll just run through both and see where we
  # go with it.

  # Our user data no longer lives in the graphql object passed initially with the page.
  # Instead it comes in as part of a subsequent call. This intercepts all calls, checks if it's
  # the one we want, and then moves on.
  response_body = nil

  page.driver.browser.intercept do |request, &continue|
    # Pass irrelevant requests forward unmodified, since we only care about the response
    continue.call(request) && next unless request.url.include?(subpage_search)
    # BUGFIX: only filter on POST data when post_data_include was actually given.
    # The old `next unless !post_data_include.nil? && ...` skipped EVERY request
    # whenever post_data_include was nil, defeating the additional_search_parameters path.
    continue.call(request) && next if !post_data_include.nil? && !request.post_data&.include?(post_data_include)

    continue.call(request) do |response|
      puts "***********************************************************"
      puts "checking request: #{request.url}"
      puts response.body
      puts "***********************************************************"

      # Check if not a CORS prefetch (empty body) and finish up if not
      if response.body && !response.body.empty?
        check_passed = true

        # Unless post_data_include takes precedence, walk the comma-separated key
        # path and reject the body if any key along the way is missing.
        if !additional_search_parameters.nil? && post_data_include.nil?
          body_to_check = Oj.load(response.body)

          additional_search_parameters.split(",").each do |key|
            break if body_to_check.nil?

            check_passed = false unless body_to_check.has_key?(key)
            body_to_check = body_to_check[key]
          end
        end

        if check_passed == false
          puts "***********************************************************"
          puts "checking FAILED request: #{request.url}"
          puts response.body
          puts "***********************************************************"
        end

        response_body = response.body if check_passed == true
      end
    end
  rescue Selenium::WebDriver::Error::WebDriverError
    # Eat them
  rescue StandardError => e
    puts "***********************************************************"
    puts "Error in intercept: #{e}"
    puts "***********************************************************"
  end

  # Now that the intercept is set up, we visit the page we want
  page.driver.browser.navigate.to(url)
  # We wait until the correct intercept is processed or we've waited 60 seconds
  start_time = Time.now

  sleep(rand(1...10))
  sleep(0.1) while response_body.nil? && (Time.now - start_time) < 60

  page.driver.execute_script("window.stop();")

  # If this is a page that has not been marked as misinfo we can just pull the data
  # straight out of the script tags seeded into the page source.
  # TODO: put this before the whole load loop
  if response_body.nil?
    doc = Nokogiri::HTML(page.driver.browser.page_source)

    elements = doc.search("script").filter_map do |element|
      # BUGFIX: was `OJ.load` — `OJ` is not a defined constant, so the NameError
      # was swallowed by the rescue below and this fallback always came up empty.
      element_json = Oj.load(element.text)
      element_json["require"].last.last.first["__bbox"]["require"].first.last.last["__bbox"]["result"]["data"]["xdt_api__v1__media__shortcode__web_info"]
    rescue StandardError
      # Most script tags aren't JSON or don't contain our payload; skip them.
      next
    end

    if elements.empty?
      raise ContentUnavailableError.new("Cannot find anything", additional_data: { page_source: page.driver.browser.page_source, elements: elements })
    end

    return elements
  end

  Oj.load(response_body)
ensure
  # page.quit
  # TRY THIS TO MAKE SURE CHROME GETS CLOSED?
  # We may also want to not do this and make sure the same browser is reused instead for cookie purposes
  # NOW we're trying this 2024-05-28
end