Class: IndeedCrawler

Inherits:
Object
  • Object
show all
Defined in:
lib/indeedcrawler.rb

Instance Method Summary collapse

Constructor Details

#initialize(search_query, location, proxy_list, wait_time, browser_num) ⇒ IndeedCrawler

Returns a new instance of IndeedCrawler.



8
9
10
11
12
13
14
15
16
17
18
19
# File 'lib/indeedcrawler.rb', line 8

# Build a crawler for Indeed resume search results.
#
# search_query - String search terms, or nil to search without a query.
# location     - String location filter, or nil for no location filter.
# proxy_list, wait_time, browser_num - forwarded verbatim to
#   RequestManager (declared elsewhere), which performs the HTTP fetching.
def initialize(search_query, location, proxy_list, wait_time, browser_num)
  # Info for query
  @search_query = search_query
  @location = location

  # Settings for request manager
  @requests = RequestManager.new(proxy_list, wait_time, browser_num)

  # Result tracking — array literals are idiomatic Ruby (not Array.new)
  @all_resume_links = []
  @output = []
end

Instance Method Details

#add_location(url) ⇒ Object

Append location



27
28
29
30
# File 'lib/indeedcrawler.rb', line 27

# Append the location filter ("l=...") to the search URL and return it.
#
# The base URL built by collect_it_all always carries "?co=US" as its
# first parameter, so a "&" separator is required here unconditionally —
# not only when a search query was appended first. (Previously a
# location-only search produced the malformed "co=USl=...".)
# URI.encode_www_form_component escapes a single value correctly
# (spaces become "+", reserved characters are percent-encoded).
def add_location(url)
  url + "&l=" + URI.encode_www_form_component(@location)
end

#add_query(url) ⇒ Object

Append query



22
23
24
# File 'lib/indeedcrawler.rb', line 22

# Append the search-query filter ("q=...") to the search URL and return it.
#
# URI.encode_www_form expects an enumerable of key/value *pairs*; the
# single-value wrapper only worked by accidental destructuring.
# URI.encode_www_form_component is the correct API for escaping one
# value (spaces become "+", reserved characters are percent-encoded).
def add_query(url)
  url + "&q=" + URI.encode_www_form_component(@search_query)
end

#collect_it_all ⇒ Object

Get all the profile links



83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# File 'lib/indeedcrawler.rb', line 83

# Run the whole crawl: build the search URL, walk every result page
# collecting profile links, download and parse each resume, then shut
# the browsers down. Returns the aggregate results as pretty-printed JSON.
def collect_it_all
  # Assemble the search URL from whichever filters were supplied.
  url = "http://indeed.com/resumes?co=US"
  url = add_query(url) if @search_query
  url = add_location(url) if @location

  # Fetch the first result page; get_page_links recurses through the rest.
  first_page = load_restart_page(url, 0)
  get_page_links(Nokogiri::HTML(first_page))

  # Download and parse every collected resume link.
  parse_resumes

  # Tidy up and hand back the aggregated output.
  @requests.close_all_browsers
  JSON.pretty_generate(@output)
end

#get_page_links(html) ⇒ Object

Get the links on the page



33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/indeedcrawler.rb', line 33

# Harvest every resume-profile link on one search-results page, then
# recurse onward while a pagination "next" anchor exists.
#
# html - a Nokogiri::HTML document for one results page.
def get_page_links(html)
  # Each search hit is an <li> tagged with the schema.org Person type.
  people = html.xpath("//li[@itemtype='http://schema.org/Person']")

  # Record the absolute URL of each person's resume page.
  people.each do |person|
    anchor = person.xpath(".//a[@class='app_link']").first
    @all_resume_links << "http://indeed.com" + anchor['href']
  end

  # Keep walking the pagination while a "next" link is present.
  load_next_page(html) unless html.css("a.next").empty?
end

#load_next_page(html) ⇒ Object

Load the next page



47
48
49
50
# File 'lib/indeedcrawler.rb', line 47

# Follow the pagination "next" anchor on the given page and continue
# harvesting links from the page it points at.
def load_next_page(html)
  next_href = html.css("a.next").first['href']
  body = load_restart_page("http://indeed.com/resumes" + next_href, 0)
  get_page_links(Nokogiri::HTML(body))
end

#load_restart_page(url, count) ⇒ Object

Load the page and return or restart and retry if needed



53
54
55
56
57
58
59
60
61
62
# File 'lib/indeedcrawler.rb', line 53

# Fetch a page body, restarting the browser and retrying (up to two
# restarts, three attempts total) when the request raises.
#
# NOTE(review): once retries are exhausted this returns nil; callers pass
# that straight into Nokogiri::HTML, which yields an empty document —
# confirm that silent degradation is the intended behavior.
def load_restart_page(url, count)
  @requests.get_page(url)
rescue
  if count < 2
    @requests.restart_browser
    load_restart_page(url, count + 1)
  end
end

#parse_resumes ⇒ Object

Download and parse all resumes



65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# File 'lib/indeedcrawler.rb', line 65

# Download every collected resume link and parse it into @output.
# A resume that fails to download or parse is skipped best-effort.
def parse_resumes
  @all_resume_links.each do |link|
    page = load_restart_page(link, 0)

    begin
      # IndeedParser (declared elsewhere) returns a JSON array of
      # per-job records; fold them all into the shared output list.
      parser = IndeedParser.new(page, link, {time_scraped: Time.now})
      parsed = JSON.parse(parser.get_results_by_job)
      @output.concat(parsed)
    rescue
      # Deliberate best-effort: ignore this resume and move on.
    end
  end
end