Class: IndeedCrawler

Inherits:
Object
  • Object
show all
Defined in:
lib/indeedcrawler.rb

Instance Method Summary collapse

Constructor Details

#initialize(search_query, location, proxy_list, wait_time, browser_num) ⇒ IndeedCrawler

Returns a new instance of IndeedCrawler.



8
9
10
11
12
13
14
15
16
17
18
19
# File 'lib/indeedcrawler.rb', line 8

# Build a crawler for Indeed resume search results.
#
# search_query - String search terms, or nil to search without a query.
# location     - String location filter, or nil for no location filter.
# proxy_list, wait_time, browser_num - forwarded verbatim to
#   RequestManager (declared elsewhere), which performs the HTTP fetching.
def initialize(search_query, location, proxy_list, wait_time, browser_num)
  # Info for query
  @search_query = search_query
  @location = location

  # Settings for request manager
  @requests = RequestManager.new(proxy_list, wait_time, browser_num)

  # Result tracking — array literals are idiomatic Ruby (not Array.new)
  @all_resume_links = []
  @output = []
end

Instance Method Details

#add_location(url) ⇒ Object

Append location



27
28
29
30
# File 'lib/indeedcrawler.rb', line 27

# Append the location filter ("l=...") to the search URL and return it.
#
# The base URL built by collect_it_all always carries "?co=US" as its
# first parameter, so a "&" separator is required here unconditionally —
# not only when a search query was appended first. (Previously a
# location-only search produced the malformed "co=USl=...".)
# URI.encode_www_form_component escapes a single value correctly
# (spaces become "+", reserved characters are percent-encoded).
def add_location(url)
  url + "&l=" + URI.encode_www_form_component(@location)
end

#add_query(url) ⇒ Object

Append query



22
23
24
# File 'lib/indeedcrawler.rb', line 22

# Append the search-query filter ("q=...") to the search URL and return it.
#
# URI.encode_www_form expects an enumerable of key/value *pairs*; the
# single-value wrapper only worked by accidental destructuring.
# URI.encode_www_form_component is the correct API for escaping one
# value (spaces become "+", reserved characters are percent-encoded).
def add_query(url)
  url + "&q=" + URI.encode_www_form_component(@search_query)
end

#collect_it_all ⇒ Object

Get all the profile links



83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# File 'lib/indeedcrawler.rb', line 83

# Run the whole crawl: build the search URL, walk every result page
# collecting profile links, download and parse each resume, then shut
# the browsers down. Returns the aggregate results as pretty-printed JSON.
def collect_it_all
  # Assemble the search URL from whichever filters were supplied.
  url = "http://indeed.com/resumes?co=US"
  url = add_query(url) if @search_query
  url = add_location(url) if @location

  # Fetch the first result page; get_page_links recurses through the rest.
  first_page = load_restart_page(url, 0)
  get_page_links(Nokogiri::HTML(first_page))

  # Download and parse every collected resume link.
  parse_resumes

  # Tidy up and hand back the aggregated output.
  @requests.close_all_browsers
  JSON.pretty_generate(@output)
end

#get_page_links(html) ⇒ Object

Get the links on the page



33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/indeedcrawler.rb', line 33

# Harvest every resume-profile link on one search-results page, then
# recurse onward while a pagination "next" anchor exists.
#
# html - a Nokogiri::HTML document for one results page.
def get_page_links(html)
  # Each search hit is an <li> tagged with the schema.org Person type.
  people = html.xpath("//li[@itemtype='http://schema.org/Person']")

  # Record the absolute URL of each person's resume page.
  people.each do |person|
    anchor = person.xpath(".//a[@class='app_link']").first
    @all_resume_links << "http://indeed.com" + anchor['href']
  end

  # Keep walking the pagination while a "next" link is present.
  load_next_page(html) unless html.css("a.next").empty?
end

#load_next_page(html) ⇒ Object

Load the next page



47
48
49
50
# File 'lib/indeedcrawler.rb', line 47

# Follow the pagination "next" anchor on the given page and continue
# harvesting links from the page it points at.
def load_next_page(html)
  next_href = html.css("a.next").first['href']
  body = load_restart_page("http://indeed.com/resumes" + next_href, 0)
  get_page_links(Nokogiri::HTML(body))
end

#load_restart_page(url, count) ⇒ Object

Load the page and return or restart and retry if needed



53
54
55
56
57
58
59
60
61
62
# File 'lib/indeedcrawler.rb', line 53

# Fetch a page body, restarting the browser and retrying (up to two
# restarts, three attempts total) when the request raises.
#
# NOTE(review): once retries are exhausted this returns nil; callers pass
# that straight into Nokogiri::HTML, which yields an empty document —
# confirm that silent degradation is the intended behavior.
def load_restart_page(url, count)
  @requests.get_page(url)
rescue
  if count < 2
    @requests.restart_browser
    load_restart_page(url, count + 1)
  end
end

#parse_resumes ⇒ Object

Download and parse all resumes



65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# File 'lib/indeedcrawler.rb', line 65

# Download every collected resume link and parse it into @output.
# A resume that fails to download or parse is skipped best-effort.
def parse_resumes
  @all_resume_links.each do |link|
    page = load_restart_page(link, 0)

    begin
      # IndeedParser (declared elsewhere) returns a JSON array of
      # per-job records; fold them all into the shared output list.
      parser = IndeedParser.new(page, link, {time_scraped: Time.now})
      parsed = JSON.parse(parser.get_results_by_job)
      @output.concat(parsed)
    rescue
      # Deliberate best-effort: ignore this resume and move on.
    end
  end
end