Class: GeneralScraper

Inherits:
Object
  • Object
show all
Includes:
ParsePage
Defined in:
lib/generalscraper.rb

Instance Method Summary collapse

Methods included from ParsePage

#fixEncode, #getContent, #getHTMLText, #getMetadata, #getPDF, #getPageData

Constructor Details

#initialize(operators, searchterm, requests, solver_details) ⇒ GeneralScraper

Returns a new instance of GeneralScraper.



13
14
15
16
17
18
19
20
21
22
23
# File 'lib/generalscraper.rb', line 13

# Sets up a scraper for a Google search with the given operators and term.
#
# @param operators [String] space-separated search operators (e.g. "site:example.com")
# @param searchterm [String] the term to search for
# @param requests [Object] browser/request handler used to fetch pages
# @param solver_details [Object, nil] CAPTCHA solver credentials; falsy disables solving
def initialize(operators, searchterm, requests, solver_details)
  @operators = operators
  @searchterm = searchterm
  # Value part of the first "key:value" operator (e.g. "site:example.com" -> "example.com")
  @op_val = @operators.split(" ")[0].split(":")[1]
  @requests = requests
  @solver_details = solver_details

  @output = []   # scraped page data, serialized by getData
  @urllist = []  # collected search-result URLs
  @startindex = 10  # Google result offset of the next page to fetch
end

Instance Method Details

#check_results(page, *requested_page) ⇒ Object

Check that page with links loaded



32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# File 'lib/generalscraper.rb', line 32

# Checks whether the fetched page is a CAPTCHA interstitial. If so, either
# solves it (when a solver is configured) or restarts the browser and retries
# the original request; otherwise processes the search results normally.
#
# @param page [String] raw HTML of the fetched page
# @param requested_page [Array] the original get_page arguments, kept so the
#   request can be retried after a CAPTCHA
def check_results(page, *requested_page)
  if page.include?("To continue, please type the characters below:")
    # Solve CAPTCHA if enabled
    if @solver_details
      c = Captcha.new(@requests, @solver_details)
      c.solve

      # Give the page a moment to refresh, then re-check it.
      # Pass the retry context along (previously it was dropped here).
      sleep(1)
      check_results(@requests.get_updated_current_page, *requested_page)

    else # Restart and try again if CAPTCHA-solving not enabled
      # Re-splat requested_page so get_page receives the original arguments;
      # the previous code passed the collected Array as a single argument.
      @requests.restart_browser
      check_results(@requests.get_page(*requested_page), *requested_page)
    end
  else # No CAPTCHA found :)
    navigate_save_results(page)
  end
end

Gets the links from the page that match the CSS selector given in the block



53
54
55
56
57
58
59
60
61
62
63
64
65
66
# File 'lib/generalscraper.rb', line 53

# Extracts href attributes from the elements selected by the given block.
#
# @param page [String] raw HTML to parse
# @yield [html] the parsed Nokogiri document; the block returns the node set
#   whose hrefs should be collected
# @return [Array] the href of every selected element (nil hrefs are kept);
#   elements that raise while being read are skipped
def get_links(page, &block)
  html = Nokogiri::HTML(page)

  # Accumulate hrefs, skipping any element that cannot be read.
  yield(html).each_with_object([]) do |element, link_arr|
    begin
      link_arr.push(element["href"])
    rescue StandardError
      # Element had no usable href attribute accessor; skip it.
    end
  end
end

#getData ⇒ Object

Gets all data and returns in JSON



99
100
101
102
103
104
105
106
107
# File 'lib/generalscraper.rb', line 99

# Runs the search, scrapes every collected result URL, shuts down the
# browser sessions, and serializes everything that was gathered.
#
# @return [String] pretty-printed JSON of all scraped page data
def getData
  search
  @urllist.each { |url| getPageData(url) }

  @requests.close_all_browsers
  JSON.pretty_generate(@output)
end

#getURLs ⇒ Object

Returns a list of search result URLs



110
111
112
113
114
# File 'lib/generalscraper.rb', line 110

# Runs the search and returns only the collected result URLs.
#
# @return [String] pretty-printed JSON array of search result URLs
def getURLs
  search
  @requests.close_all_browsers
  JSON.pretty_generate(@urllist)
end

Categorizes the links on a results page into search results and further search pages



69
70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/generalscraper.rb', line 69

# Splits the links on a results page into actual search results (saved)
# and pagination links (followed to the next results page).
#
# @param page [String] raw HTML of a Google results page
def navigate_save_results(page)
  # Result links live under the h3.r headings; save each one.
  get_links(page) { |html| html.css("h3.r").css("a") }
    .each { |link| site_url_save(link) }

  # The #pnnext anchor points at the next results page; follow it.
  get_links(page) { |html| html.css("#pnnext") }
    .each { |link| next_search_page("google.com" + link) }
end

#next_search_page(link) ⇒ Object

Process search links and go to next page



89
90
91
92
93
94
95
96
# File 'lib/generalscraper.rb', line 89

# Follows a pagination link if it points at the page we expect next.
# Extracts the "start" offset from the link; when it matches @startindex,
# advances the index and fetches/processes that page.
#
# @param link [String] a Google pagination URL containing "&start=N&sa=N"
def next_search_page(link)
  start_param = link.split("&start=")[1]
  # Guard: links without a &start= parameter previously raised NoMethodError.
  return unless start_param

  page_index_num = start_param.split("&sa=N")[0]

  if page_index_num.to_i == @startindex
    @startindex += 10
    check_results(@requests.get_page(link), link)
  end
end

#search ⇒ Object

Searches for links on Google



26
27
28
29
# File 'lib/generalscraper.rb', line 26

# Kicks off the Google search for the configured operators and term,
# handing the first results page to check_results for processing.
def search
  query = "#{@operators} #{@searchterm}"
  check_results(@requests.get_page("http://google.com", query),
                "http://google.com", query)
end

#site_url_save(link) ⇒ Object

Parse and save the URLs for search results



84
85
86
# File 'lib/generalscraper.rb', line 84

# Records a single search-result URL.
#
# @param link [String] the URL to store
def site_url_save(link)
  @urllist << link
end