Class: GeneralScraper
- Inherits: Object
- Includes: ParsePage
- Defined in: lib/generalscraper.rb
Instance Method Summary
- #check_results(page, *requested_page) ⇒ Object
  Checks that the page with links loaded.
- #get_links(page, &block) ⇒ Object
  Gets the links from the page that match the CSS selector given in the block.
- #getData ⇒ Object
  Gets all data and returns it as JSON.
- #getURLs ⇒ Object
  Returns a list of search result URLs.
- #initialize(operators, searchterm, requests, solver_details) ⇒ GeneralScraper constructor
  A new instance of GeneralScraper.
- #navigate_save_results(page) ⇒ Object
  Categorizes the links on the results page into result links and links to further search pages.
- #next_search_page(link) ⇒ Object
  Processes search pagination links and goes to the next page.
- #search ⇒ Object
  Searches for links on Google.
- #site_url_save(link) ⇒ Object
  Parses and saves the URLs for search results.
Methods included from ParsePage
#fixEncode, #getContent, #getHTMLText, #getMetadata, #getPDF, #getPageData
Constructor Details
#initialize(operators, searchterm, requests, solver_details) ⇒ GeneralScraper
Returns a new instance of GeneralScraper.
# File 'lib/generalscraper.rb', line 13

def initialize(operators, searchterm, requests, solver_details)
  @operators = operators
  @searchterm = searchterm
  @op_val = @operators.split(" ")[0].split(":")[1]
  @requests = requests
  @solver_details = solver_details

  @output = Array.new
  @urllist = Array.new
  @startindex = 10
end
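A minimal usage sketch, not taken from the gem's own documentation: the requests argument is assumed to be a browser-driving object that responds to get_page, get_updated_current_page, restart_browser, and close_all_browsers (for example an instance from the requestmanager gem); the RequestManager arguments shown here are hypothetical placeholders.

require 'generalscraper'

# Hypothetical setup; substitute whatever request-manager object you actually use.
requests = RequestManager.new(nil, [0, 0], 1)  # assumed signature: proxies, wait range, browser count
scraper  = GeneralScraper.new("site:example.com", "transparency", requests, nil)
# operators   = "site:example.com" (so @op_val becomes "example.com")
# searchterm  = "transparency"
# solver_details = nil disables CAPTCHA solving in #check_results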
Instance Method Details
#check_results(page, *requested_page) ⇒ Object
Checks that the page with links loaded, handling a CAPTCHA if one appears.
# File 'lib/generalscraper.rb', line 32

def check_results(page, *requested_page)
  if page.include?("To continue, please type the characters below:")
    # Solve CAPTCHA if enabled
    if @solver_details
      c = Captcha.new(@requests, @solver_details)
      c.solve

      # Proceed as normal
      sleep(1)
      check_results(@requests.get_updated_current_page)
    else
      # Restart and try again if CAPTCHA-solving not enabled
      @requests.restart_browser
      check_results(@requests.get_page(requested_page), requested_page)
    end
  else
    # No CAPTCHA found :)
    navigate_save_results(page)
  end
end
#get_links(page, &block) ⇒ Object
Gets the links from the page that match the CSS selector given in the block.
# File 'lib/generalscraper.rb', line 53

def get_links(page, &block)
  html = Nokogiri::HTML(page)

  # Get array of links
  return yield(html).inject(Array.new) do |link_arr, al|
    begin
      link_arr.push(al["href"])
    rescue
    end

    link_arr
  end
end
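The block supplies the CSS selection; get_links then collects the href attribute of each node the block returns. A small illustration using the scraper instance from the constructor sketch above and an inline HTML fragment (illustrative markup only):

page = '<h3 class="r"><a href="http://example.com/a">A</a></h3>' \
       '<h3 class="r"><a href="http://example.com/b">B</a></h3>'

scraper.get_links(page) { |html| html.css("h3.r").css("a") }
# => ["http://example.com/a", "http://example.com/b"]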
#getData ⇒ Object
Gets all data and returns it as JSON.
# File 'lib/generalscraper.rb', line 99

def getData
  search

  @urllist.each do |url|
    getPageData(url)
  end

  @requests.close_all_browsers
  return JSON.pretty_generate(@output)
end
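Typical end-to-end use, again assuming the scraper instance from the constructor sketch: getData runs the search, fetches each collected URL through ParsePage#getPageData, closes the browsers, and returns a pretty-printed JSON string.

json    = scraper.getData
results = JSON.parse(json)  # array built from @output; element structure depends on ParsePage
puts results.length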
#getURLs ⇒ Object
Returns a list of search result URLs
# File 'lib/generalscraper.rb', line 110

def getURLs
  search
  @requests.close_all_browsers
  return JSON.pretty_generate(@urllist)
end
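If only the result URLs are needed, getURLs skips the per-page fetching and returns the collected links as a JSON array. Note that getData and getURLs each call search, so calling both on the same instance runs the search twice.

urls = JSON.parse(scraper.getURLs)  # array of href strings from @urllist
urls.each { |url| puts url }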
#navigate_save_results(page) ⇒ Object
Categorizes the links on the results page into result links and links to further search pages.
# File 'lib/generalscraper.rb', line 69

def navigate_save_results(page)
  # Save result links for page
  result_links = get_links(page) {|html| html.css("h3.r").css("a")}
  result_links.each do |link|
    site_url_save(link)
  end

  # Go to next page
  next_pages = get_links(page) {|html| html.css("#pnnext")}
  next_pages.each do |link|
    next_search_page("google.com"+link)
  end
end
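The selectors target Google's older results markup: h3.r a matches organic result titles and #pnnext is the "Next" pagination link, so if Google changes that markup these selectors would need updating. A rough sketch of the markup shape they assume (illustrative, not Google's exact HTML):

require 'nokogiri'

html = Nokogiri::HTML(<<~FRAGMENT)
  <h3 class="r"><a href="http://example.com/result">Result title</a></h3>
  <a id="pnnext" href="/search?q=test&amp;start=10&amp;sa=N">Next</a>
FRAGMENT

html.css("h3.r").css("a").map { |a| a["href"] }  # => ["http://example.com/result"]
html.css("#pnnext").map { |a| a["href"] }        # => ["/search?q=test&start=10&sa=N"]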
#next_search_page(link) ⇒ Object
Processes search pagination links and goes to the next page.
# File 'lib/generalscraper.rb', line 89

def next_search_page(link)
  page_index_num = link.split("&start=")[1].split("&sa=N")[0]

  if page_index_num.to_i == @startindex
    @startindex += 10
    check_results(@requests.get_page(link), link)
  end
end
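The page offset is pulled out of the pagination URL by plain string splitting, and the comparison against @startindex keeps the scraper from following the same "next" link more than once. For example:

link = "google.com/search?q=site%3Aexample.com+test&start=10&sa=N"
link.split("&start=")[1].split("&sa=N")[0]  # => "10"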
#search ⇒ Object
Searches for links on Google
# File 'lib/generalscraper.rb', line 26

def search
  check_results(@requests.get_page("http://google.com", @operators + " " + @searchterm),
                "http://google.com", (@operators + " " + @searchterm))
end
#site_url_save(link) ⇒ Object
Parses and saves the URLs for search results.
# File 'lib/generalscraper.rb', line 84

def site_url_save(link)
  @urllist.push(link)
end