Class: IndeedCrawler
- Inherits:
-
Object
- Object
- IndeedCrawler
- Defined in:
- lib/indeedcrawler.rb
Instance Method Summary collapse
-
#add_location(url) ⇒ Object
Append location.
-
#add_query(url) ⇒ Object
Append query.
-
#collect_it_all ⇒ Object
Get all the profile links.
-
#get_page_links(html) ⇒ Object
Get the links on the page.
-
#initialize(search_query, location, proxy_list, wait_time, browser_num) ⇒ IndeedCrawler
constructor
A new instance of IndeedCrawler.
-
#load_next_page(html) ⇒ Object
Load the next page.
-
#load_restart_page(url, count) ⇒ Object
Load the page and return or restart and retry if needed.
-
#parse_resumes ⇒ Object
Download and parse all resumes.
Constructor Details
#initialize(search_query, location, proxy_list, wait_time, browser_num) ⇒ IndeedCrawler
Returns a new instance of IndeedCrawler.
8 9 10 11 12 13 14 15 16 17 18 19 |
# Public: Build a crawler for indeed.com resume search.
#
# search_query - String keywords to search for (nil to skip the q= filter).
# location     - String location filter (nil to skip the l= filter).
# proxy_list   - Proxy list handed to the RequestManager.
# wait_time    - Wait time passed to the RequestManager between requests.
# browser_num  - Number of browsers the RequestManager may drive.
def initialize(search_query, location, proxy_list, wait_time, browser_num)
  # Info for query
  @search_query = search_query
  @location = location

  # Settings for request manager
  @requests = RequestManager.new(proxy_list, wait_time, browser_num)

  # Result tracking (array literals are idiomatic over Array.new)
  @all_resume_links = []
  @output = []
end
Instance Method Details
#add_location(url) ⇒ Object
Append location
27 28 29 30 |
# Public: Append the location filter to a search URL.
#
# url - String URL that already contains at least one query parameter
#       (the base URL built in #collect_it_all always carries "?co=US").
#
# Returns the String url with "&l=<encoded location>" appended.
def add_location(url)
  # BUG FIX: the separator was previously added only when a search query
  # was present (`url += "&" if @search_query`), so a location-only search
  # produced the malformed "...?co=USl=...". The base URL always has a
  # query string, so the "&" is always required.
  url += "&"
  # encode_www_form_component is the correct API for encoding a single
  # value; the old encode_www_form([@location]) abused the key/value-pairs
  # form (same output, wrong tool).
  url += "l=" + URI.encode_www_form_component(@location)
end
#add_query(url) ⇒ Object
Append query
22 23 24 |
# Public: Append the keyword query to a search URL.
#
# url - String URL the "&q=<encoded query>" parameter is appended to.
#
# Returns the extended String url.
def add_query(url)
  encoded_query = URI.encode_www_form([@search_query])
  "#{url}&q=#{encoded_query}"
end
#collect_it_all ⇒ Object
Get all the profile links
83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 |
# Public: Crawl every matching resume and return the results.
#
# Builds the search URL from the configured filters, walks all result
# pages (pagination recurses from #get_page_links), downloads and parses
# each resume, then shuts the browsers down.
#
# Returns a pretty-printed JSON String of all parsed results.
def collect_it_all
  # Assemble the search URL from whichever filters were configured.
  search_url = "http://indeed.com/resumes?co=US"
  search_url = add_query(search_url) if @search_query
  search_url = add_location(search_url) if @location

  # Harvest the first results page; following pages are reached
  # recursively through get_page_links -> load_next_page.
  first_page = Nokogiri::HTML(load_restart_page(search_url, 0))
  get_page_links(first_page)

  # Download and parse every collected resume link.
  parse_resumes

  # Tidy up the browsers and hand back the accumulated output.
  @requests.close_all_browsers
  JSON.pretty_generate(@output)
end
#get_page_links(html) ⇒ Object
Get the links on the page
33 34 35 36 37 38 39 40 41 42 43 44 |
# Public: Collect every resume profile link on one results page, then
# follow pagination when a "next" link is present.
#
# html - Nokogiri::HTML document of a results page.
#
# Returns nothing useful; appends to @all_resume_links as a side effect.
def get_page_links(html)
  # Each resume result is an <li> typed as a schema.org Person.
  person_nodes = html.xpath("//li[@itemtype='http://schema.org/Person']")

  # Pull the profile anchor out of every result and absolutize it.
  page_links = person_nodes.map do |node|
    "http://indeed.com" + node.xpath(".//a[@class='app_link']")[0]['href']
  end
  @all_resume_links.concat(page_links)

  # Recurse into the next results page when one is advertised.
  load_next_page(html) unless html.css("a.next").empty?
end
#load_next_page(html) ⇒ Object
Load the next page
47 48 49 50 |
# Public: Follow the "next" pagination link of a results page and
# harvest the links on it.
#
# html - Nokogiri::HTML document whose "a.next" anchor will be followed.
#
# Returns whatever get_page_links returns for the next page.
def load_next_page(html)
  next_href = html.css("a.next").first['href']
  page_body = load_restart_page("http://indeed.com/resumes" + next_href, 0)
  get_page_links(Nokogiri::HTML(page_body))
end
#load_restart_page(url, count) ⇒ Object
Load the page and return or restart and retry if needed
53 54 55 56 57 58 59 60 61 62 |
# Public: Fetch a URL, restarting the browser and retrying when the
# request raises. Matches the original recursive behavior: up to three
# attempts total with two browser restarts in between.
#
# url   - String URL to fetch.
# count - Integer number of retries already consumed (callers pass 0).
#
# Returns the page body, or nil once the retry budget is exhausted.
def load_restart_page(url, count)
  attempts = count
  loop do
    begin
      return @requests.get_page(url)
    rescue
      # Give up quietly (nil) after the second restart.
      return nil if attempts >= 2
      @requests.restart_browser
      attempts += 1
    end
  end
end
#parse_resumes ⇒ Object
Download and parse all resumes
65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
# Public: Download each collected resume link, parse it, and accumulate
# the per-job records into @output.
#
# Returns nothing useful; mutates @output as a side effect.
def parse_resumes
  @all_resume_links.each do |resume_link|
    page_body = load_restart_page(resume_link, 0)
    begin
      # Parse the resume page and merge its job records into the output.
      parser = IndeedParser.new(page_body, resume_link, {time_scraped: Time.now})
      parsed_jobs = JSON.parse(parser.get_results_by_job)
      @output.concat(parsed_jobs)
    rescue
      # Best-effort by design: a resume that fails to parse is skipped
      # rather than aborting the whole crawl.
    end
  end
end