Class: Zorki::Scraper

Inherits:
Object
Includes:
Capybara::DSL
Defined in:
lib/zorki/scrapers/scraper.rb

Overview

Base class for zorki scrapers (see PostScraper and UserScraper below); wraps a Capybara-driven Selenium browser session used to pull data out of Instagram pages.

Direct Known Subclasses

PostScraper, UserScraper

Constant Summary

@@logger = Logger.new(STDOUT)
@@session_id = nil

Instance Method Summary

#get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil) ⇒ Object

Constructor Details

#initialize ⇒ Scraper

Returns a new instance of Scraper.



# File 'lib/zorki/scrapers/scraper.rb', line 44

def initialize
  Capybara.default_driver = :selenium_zorki
end
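The :selenium_zorki driver is registered elsewhere in the gem. As a rough sketch, a custom Capybara driver like this is typically registered via Capybara.register_driver; the Chrome options below are illustrative assumptions, not the gem's actual configuration:

require "capybara"
require "selenium-webdriver"

# Hypothetical registration of :selenium_zorki -- the real gem supplies its
# own browser options (headless mode, user agent, and so on).
Capybara.register_driver :selenium_zorki do |app|
  options = Selenium::WebDriver::Chrome::Options.new
  options.add_argument("--disable-blink-features=AutomationControlled")
  Capybara::Selenium::Driver.new(app, browser: :chrome, options: options)
end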

Instance Method Details

#get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil) ⇒ Object

Instagram uses GraphQL (like most of Facebook, I think) and returns an object that is used to seed the page. We can just parse this for most things.

additional_search_parameters is a comma-separated list of keys, for example: `data,xdt_api_v1_media__shortcode__web_info,items`
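As an illustrative usage sketch (the URL, the request substring, and the key path below are assumptions for demonstration, not values the gem documents):

scraper = Zorki::Scraper.new
data = scraper.get_content_of_subpage_from_url(
  "https://www.instagram.com/p/SHORTCODE/",           # page to load
  "graphql/query",                                    # substring matched against intercepted request URLs
  "data,xdt_api_v1_media__shortcode__web_info,items"  # nested keys the response body must contain
)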



# File 'lib/zorki/scrapers/scraper.rb', line 55

def get_content_of_subpage_from_url(url, subpage_search, additional_search_parameters = nil)
  # So this is fun:
  # For pages marked as misinformation we have to use one method (interception of requests) and
  # for pages that are not, we can just pull the data straight from the page.
  #
  # How do we figure out which is which?... for now we'll just run through both and see where we
  # go with it.

  # Our user data no longer lives in the graphql object passed initially with the page.
  # Instead it comes in as part of a subsequent call. This intercepts all calls, checks if it's
  # the one we want, and then moves on.
  response_body = nil

  page.driver.browser.intercept do |request, &continue|
    # This passes the request forward unmodified, since we only care about the response
    # puts "checking request: #{request.url}"
    continue.call(request) && next unless request.url.include?(subpage_search)

    continue.call(request) do |response|
      # Skip empty bodies (e.g. CORS preflight responses); process everything else
      if response.body && !response.body.empty?
        check_passed = true
        unless additional_search_parameters.nil?
          body_to_check = Oj.load(response.body)

          search_parameters = additional_search_parameters.split(",")
          search_parameters.each do |key|
            break if body_to_check.nil?

            check_passed = false unless body_to_check.has_key?(key)
            body_to_check = body_to_check[key]
          end
        end

        response_body = response.body if check_passed
      end
    end
  rescue Selenium::WebDriver::Error::WebDriverError
    # Eat them
  end

  # Now that the intercept is set up, we visit the page we want
  page.driver.browser.navigate.to(url)
  # We wait until the correct intercept is processed or we've waited 60 seconds
  start_time = Time.now
  # puts "Waiting.... #{url}"

  sleep(rand(1...10))
  while response_body.nil? && (Time.now - start_time) < 60
    sleep(0.1)
  end

  page.driver.execute_script("window.stop();")

  # If this is a page that has not been marked as misinfo we can just pull the data
  # TODO: put this before the whole load loop
  if response_body.nil?
    doc = Nokogiri::HTML(page.driver.browser.page_source)
    # elements = doc.search("script").find_all do |e|
    #   e.attributes.has_key?("type") && e.attributes["type"].value == "application/ld+json"
    # end

    elements = doc.search("script").filter_map do |element|
      parsed_element_json = nil
      begin
        element_json = JSON.parse(element.text)

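        # Walk Facebook's deeply nested bootstrap JSON down to the shortcode web-info payload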
        parsed_element_json = element_json["require"].first.last.first["__bbox"]["require"].first.last.last["__bbox"]["result"]["data"]["xdt_api__v1__media__shortcode__web_info"]
      rescue StandardError
        next
      end

      parsed_element_json
    end

    if elements.empty?
      raise ContentUnavailableError.new("Cannot find anything", additional_data: { page_source: page.driver.browser.page_source, elements: elements })
    end

    return elements
  end

  raise ContentUnavailableError.new("Response body nil") if response_body.nil?
  Oj.load(response_body)
ensure
  page.quit
  # TRY THIS TO MAKE SURE CHROME GETS CLOSED?
  # We may also want to not do this and make sure the same browser is reused instead for cookie purposes
end
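The comma-separated key walk inside the intercept is effectively an "every key in the path exists" check on nested hashes. A standalone sketch of the same idea (the helper name and sample JSON are hypothetical):

require "oj"

# Returns true when every key in the comma-separated path is present,
# descending one level per key -- mirroring the check in the intercept above.
def nested_keys_present?(json_string, comma_separated_keys)
  body = Oj.load(json_string)
  comma_separated_keys.split(",").all? do |key|
    next false unless body.is_a?(Hash) && body.key?(key)
    body = body[key]
    true
  end
end

nested_keys_present?('{"data":{"items":[1]}}', "data,items") # => true
nested_keys_present?('{"data":{}}', "data,items")            # => false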