Class: ClearanceJobsComCrawler

Inherits:
Object
  • Object
show all
Includes:
FailureHandler
Defined in:
lib/clearancejobscom/clearance_jobs_com_crawler.rb

Instance Method Summary collapse

Methods included from FailureHandler

#get_retry

Constructor Details

#initialize(search_term, requests = nil, cm_hash = nil) ⇒ ClearanceJobsComCrawler

Returns a new instance of ClearanceJobsComCrawler.



15
16
17
18
19
20
21
22
# File 'lib/clearancejobscom/clearance_jobs_com_crawler.rb', line 15

# Build a crawler for the given search term.
#
# @param search_term [String, nil] keywords to search (nil crawls all jobs)
# @param requests [Object, nil] request helper forwarded to get_retry
#   — NOTE(review): exact type defined by FailureHandler; confirm
# @param cm_hash [Hash, nil] crawler-manager info handed to HarvesterReporter
def initialize(search_term, requests=nil, cm_hash=nil)
  @requests = requests
  @search_term = search_term

  # Reporting hookup for the crawler manager
  @reporter = HarvesterReporter.new(cm_hash)

  # Must run after @search_term is assigned — set_base_url reads it
  @base_url = set_base_url
end

Instance Method Details

#collect_links_on_page(page) ⇒ Object

Collect the links on the page



70
71
72
73
# File 'lib/clearancejobscom/clearance_jobs_com_crawler.rb', line 70

# Collect the job-listing links on a search-results page.
#
# @param page [String] raw HTML of one results page
# @return [Array<String>] href of every anchor under a result-item title
def collect_links_on_page(page)
  doc = Nokogiri::HTML.parse(page)
  title_nodes = doc.css(".cj-search-result-item-title")
  title_nodes.css("a").map { |anchor| anchor['href'] }
end

#crawl ⇒ Object

Run the crawler



25
26
27
28
29
30
31
32
# File 'lib/clearancejobscom/clearance_jobs_com_crawler.rb', line 25

# Run the crawler: visit every results page and parse the listings found.
def crawl
  total_pages = get_page_count

  1.upto(total_pages) do |page_num|
    page_html = get_next_page(page_num)
    parse_listings(collect_links_on_page(page_html))
  end
end

#gen_json ⇒ Object



87
88
89
# File 'lib/clearancejobscom/clearance_jobs_com_crawler.rb', line 87

# JSON for everything harvested so far — delegates to the reporter.
#
# @return [Object] whatever HarvesterReporter#gen_json yields
#   — presumably a JSON String; confirm against HarvesterReporter
def gen_json
  @reporter.gen_json
end

#get_next_page(page_num) ⇒ Object

Get the next page



65
66
67
# File 'lib/clearancejobscom/clearance_jobs_com_crawler.rb', line 65

# Fetch the HTML for the given results page number.
#
# @param page_num [Integer] 1-based page index
# @return [Object] whatever get_page returns for that page's URL
def get_next_page(page_num)
  url = get_next_page_url(page_num)
  get_page(url)
end

#get_next_page_url(page_num) ⇒ Object

Get the URL for the next page



44
45
46
47
48
49
50
# File 'lib/clearancejobscom/clearance_jobs_com_crawler.rb', line 44

# Build the URL for the given results page.
#
# A keyword search base already ends in "...?keywords=X", so the paging
# params need a "&" joiner; the bare base already ends in "?" so they don't.
#
# @param page_num [Integer] 1-based page index
# @return [String] base URL with PAGE and limit query params appended
def get_next_page_url(page_num)
  joiner = @base_url.include?("keywords") ? "&" : ""
  "#{@base_url}#{joiner}PAGE=#{page_num}&limit=25"
end

#get_page(url) ⇒ Object

Get the page



53
54
55
# File 'lib/clearancejobscom/clearance_jobs_com_crawler.rb', line 53

# Fetch a page through the FailureHandler mixin's retry helper.
#
# @param url [String] URL to fetch
# @return [Object] whatever get_retry returns (the page body, presumably)
def get_page(url)
  initial_attempt = 0
  get_retry(url, @requests, initial_attempt)
end

#get_page_count ⇒ Object

Get the correct total # of pages



58
59
60
61
62
# File 'lib/clearancejobscom/clearance_jobs_com_crawler.rb', line 58

# Work out the total number of result pages (25 results per page).
#
# Reads the "#viewing" element, whose text looks like "1 - 25 of 1,234"
# — assumed from the parsing below; confirm against the live site.
#
# @return [Integer] number of pages, rounded up
def get_page_count
  first_page = Nokogiri::HTML.parse(get_next_page(1))
  total_text = first_page.css("#viewing").text.split(" of ")[1]
  total_results = total_text.gsub(",", "").to_i
  (total_results / 25.0).ceil
end

#parse_listings(listings) ⇒ Object

Parse the listings on the page



76
77
78
79
80
81
82
83
84
85
# File 'lib/clearancejobscom/clearance_jobs_com_crawler.rb', line 76

# Parse each listing URL on the page and report the successful results.
#
# @param listings [Array<String>] listing URLs collected from one page
# @return [Object] whatever the reporter returns from report_results
def parse_listings(listings)
  parsed = listings.each_with_object([]) do |listing_url, acc|
    parser = ClearanceJobsComParser.new(listing_url, get_page(listing_url), @requests)
    result = parser.parse
    # Drop listings the parser rejected (falsy result)
    acc << result if result
  end

  # listings.first is nil for an empty page — preserved from original behavior
  @reporter.report_results(parsed, listings.first)
end

#set_base_url ⇒ Object

Get base url



35
36
37
38
39
40
41
# File 'lib/clearancejobscom/clearance_jobs_com_crawler.rb', line 35

# Build and memoize the base search URL.
#
# With no search term the bare jobs listing is crawled; otherwise the
# term is URL-escaped into a keywords query parameter.
#
# @return [String] the base URL (also stored in @base_url)
def set_base_url
  # Idiomatic nil check (was `@search_term == nil`)
  if @search_term.nil?
    @base_url = "https://www.clearancejobs.com/jobs?"
  else
    @base_url = "https://www.clearancejobs.com/jobs?keywords=" + CGI.escape(@search_term)
  end
end