Class: Forki::Scraper

Inherits:
Object
  • Object
show all
Includes:
Capybara::DSL
Defined in:
lib/forki/scrapers/scraper.rb

Overview

rubocop:disable Metrics/ClassLength

Direct Known Subclasses

PostScraper, UserScraper

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Scraper

Returns a new instance of Scraper.



38
39
40
41
42
# File 'lib/forki/scrapers/scraper.rb', line 38

# Builds a scraper.
#
# Sets Capybara's default driver to +:selenium_forki+ (assigned on the Capybara
# module itself, not on this instance) and asks Forki to set its logger level.
#
# NOTE(review): the +:selenium_forki+ driver is presumably registered elsewhere
# in the project — confirm it is registered before any scraper is constructed.
def initialize
  Capybara.default_driver = :selenium_forki
  Forki.set_logger_level
  # reset_selenium
end

Instance Method Details

#download_image(img_elem) ⇒ Object

Downloads the image referenced by the element’s src attribute and writes it to temp/emoji.png, using the temp/ directory that’s created during setup.



45
46
47
48
# File 'lib/forki/scrapers/scraper.rb', line 45

# Downloads the image an element points at and stores it as +temp/emoji.png+.
#
# The path is relative to the current working directory and the +temp/+
# directory must already exist (+File.binwrite+ does not create directories).
# Any previous +temp/emoji.png+ is overwritten.
#
# @param img_elem [#[]] element-like object whose +"src"+ entry is the image location
# @return [Integer] number of bytes written
#
# NOTE(review): +URI.open+ falls back to +Kernel#open+ for strings that do not
# look like a URL, so +src+ should only ever come from a trusted/validated page.
def download_image(img_elem)
  # Block form so the underlying IO/Tempfile is closed as soon as it has been read.
  img_data = URI.open(img_elem["src"], &:read)
  File.binwrite("temp/emoji.png", img_data)
end

#find_graphql_data_closure_index(html_str, start_index) ⇒ Object



64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/forki/scrapers/scraper.rb', line 64

# Finds where the GraphQL data object that starts at +start_index+ ends.
#
# +start_index+ must point at the first character of a <tt>"data":{</tt> marker
# inside +html_str+. Scanning starts right after the marker's opening brace and
# tracks brace depth until the matching closing brace is reached. Braces that
# appear inside double-quoted JSON string literals (including strings containing
# escaped quotes) are ignored, so text such as <tt>"a } b"</tt> does not end
# the object early.
#
# @param html_str [String] text containing the data object
# @param start_index [Integer] index of the <tt>"data":{</tt> marker in +html_str+
# @return [Integer] index one past the matching closing brace (exclusive end)
# @raise [RuntimeError] if the string ends before the object is closed
def find_graphql_data_closure_index(html_str, start_index)
  cursor = start_index + 8 # length of data marker. Begin search right after open brace
  depth = 1
  in_string = false

  # Jump straight to the next significant character instead of visiting every char.
  # Inside a string literal only a quote or a backslash escape matters.
  while (cursor = html_str.index(in_string ? /["\\]/ : /["{}]/, cursor))
    case html_str[cursor]
    when "\\" then cursor += 1 # skip the escaped character that follows
    when '"' then in_string = !in_string
    when "{" then depth += 1
    when "}"
      depth -= 1
      return cursor + 1 if depth.zero?
    end

    cursor += 1
  end

  # Ran off the end of the string without balancing the opening brace.
  raise "Malformed graphql data object: no closing bracket found"
end

#find_graphql_data_strings(objs = [], html_str) ⇒ Object

Returns all GraphQL data objects embedded within a string. Finds substrings that look like ‘"data": {…}’ and returns each object as a raw JSON string (the strings are not parsed into hashes here).



52
53
54
55
56
57
58
59
60
61
62
# File 'lib/forki/scrapers/scraper.rb', line 52

# Collects every GraphQL data object embedded in +html_str+.
#
# Looks for each <tt>"data":{</tt> marker, uses
# +find_graphql_data_closure_index+ to locate the end of that object, and keeps
# the object's text with the <tt>"data":</tt> prefix removed. The objects are
# returned as strings; nothing is parsed here. Searching resumes after the end
# of each object, so markers nested inside an already-captured object are not
# reported separately.
#
# The page is scanned iteratively with a moving offset, so the work and stack
# depth do not grow with the number of data blocks.
#
# @param objs [Array<String>] previously collected strings to prepend (not mutated)
# @param html_str [String] page source to scan
# @return [Array<String>] +objs+ followed by the data objects in page order;
#   +objs+ itself when no data block is found
def find_graphql_data_strings(objs = [], html_str)
  data_marker = '"data":{'
  found = []
  search_from = 0

  while (data_start_index = html_str.index(data_marker, search_from))
    data_closure_index = find_graphql_data_closure_index(html_str, data_start_index)
    break if data_closure_index.nil? # defensive: stop if no closure could be located

    found << html_str[data_start_index...data_closure_index].delete_prefix('"data":')
    search_from = data_closure_index # continue after the object just captured
  end

  found.empty? ? objs : objs + found
end