Class: SecurityClearedJobsComCrawler

Inherits:
Object
  • Object
show all
Includes:
FailureHandler
Defined in:
lib/securityclearedjobscom/security_cleared_jobs_com_crawler.rb

Instance Method Summary collapse

Methods included from FailureHandler

#get_retry

Constructor Details

#initialize(search_term, requests = nil, cm_hash = nil) ⇒ SecurityClearedJobsComCrawler

Returns a new instance of SecurityClearedJobsComCrawler.



15
16
17
18
19
20
21
22
# File 'lib/securityclearedjobscom/security_cleared_jobs_com_crawler.rb', line 15

# Builds a crawler for the given search term.
#
# @param search_term [String, nil] keywords to search for; nil means no
#   keyword filter is added to the query URL (see set_base_url)
# @param requests [Object, nil] stored as-is and handed to get_retry and to
#   the listing parser
# @param cm_hash [Object, nil] passed straight through to HarvesterReporter.new
def initialize(search_term, requests = nil, cm_hash = nil)
  @site_url = "https://www.securityclearedjobs.com"
  @search_term = search_term
  @requests = requests

  # set_base_url reads @site_url and @search_term, so both must be set first
  @query_base_url = set_base_url

  @reporter = HarvesterReporter.new(cm_hash)
end

Instance Method Details

#crawl ⇒ Object

Crawls all of the listings



78
79
80
81
82
83
84
85
86
# File 'lib/securityclearedjobscom/security_cleared_jobs_com_crawler.rb', line 78

# Crawls all of the listings.
#
# Asks get_total_pagecount how many result pages exist, then loads each page
# in order (starting at 1) and hands it to parse_listings. A nil page count
# is treated as zero pages because of the to_i conversion.
#
# @return [Range] the range of page numbers that was iterated
def crawl
  last_page = get_total_pagecount.to_i

  (1..last_page).each { |number| parse_listings(load_next_page(number)) }
end

#gen_json ⇒ Object

Output JSON



89
90
91
# File 'lib/securityclearedjobscom/security_cleared_jobs_com_crawler.rb', line 89

# Output JSON.
#
# @return [Object] whatever the reporter's gen_json produces
def gen_json
  @reporter.gen_json
end

#get_page(url) ⇒ Object

Get the page



34
35
36
# File 'lib/securityclearedjobscom/security_cleared_jobs_com_crawler.rb', line 34

# Get the page.
#
# Thin wrapper around get_retry (provided by the included FailureHandler
# module), called with the URL, the stored @requests object and a literal 0.
#
# @param url [String] address of the page to fetch
# @return [Object] whatever get_retry returns; callers in this class feed it
#   to Nokogiri::HTML.parse, so presumably the page HTML - confirm against
#   FailureHandler#get_retry
def get_page(url)
  # NOTE(review): the trailing 0 looks like an initial retry/attempt counter -
  # confirm against FailureHandler#get_retry
  get_retry(url, @requests, 0)
end

#get_total_pagecount ⇒ Object

Get the total pagecount



39
40
41
42
43
44
45
46
47
48
49
# File 'lib/securityclearedjobscom/security_cleared_jobs_com_crawler.rb', line 39

# Get the total pagecount.
#
# Loads the first results page and reads the page number out of the link
# inside the last ".paginator__item" element (the text after "&Page=" in its
# href).
#
# Always returns at least 1: when the page has no paginator, when the last
# paginator item has no link, or when the link carries no usable page number,
# the results are treated as fitting on a single page. Previously those cases
# returned nil (so a lone results page was never crawled) or raised on a
# missing anchor.
#
# @return [Integer] number of result pages, never less than 1
def get_total_pagecount
  initial_page = Nokogiri::HTML.parse(load_next_page(1))
  last_item = initial_page.css(".paginator__item").last
  last_anchor = last_item && last_item.css("a").first
  last_page_link = last_anchor && last_anchor['href']

  # No paginator link at all means there is just the one page
  return 1 unless last_page_link

  # to_i turns a missing or non-numeric page parameter into 0
  page_count = last_page_link.split("&Page=")[1].to_i
  page_count.zero? ? 1 : page_count
end

#load_next_page(page_num) ⇒ Object

Load the next page



52
53
54
55
# File 'lib/securityclearedjobscom/security_cleared_jobs_com_crawler.rb', line 52

# Load the next page.
#
# Appends "&Page=<page_num>" to the query base URL and fetches the result
# through get_page.
#
# @param page_num [Integer, #to_s] page number to request
# @return [Object] whatever get_page returns for that URL
def load_next_page(page_num)
  get_page(@query_base_url + "&Page=#{page_num}")
end

#parse_listings(page) ⇒ Object

Parse all the listings on a single page



64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/securityclearedjobscom/security_cleared_jobs_com_crawler.rb', line 64

# Parse all the listings on a single page.
#
# Extracts the listing URLs with save_result_links, fetches each one with
# get_page and runs it through SecurityClearedJobsComParser. Listings whose
# parse result is nil or false are dropped. The surviving results are passed
# to the reporter together with the first listing URL on the page (nil when
# the page has no listings).
#
# @param page [Object] results page content, as accepted by save_result_links
# @return [Object] whatever the reporter's report_results returns
def parse_listings(page)
  links = save_result_links(page)

  parse_results = links.map do |link|
    SecurityClearedJobsComParser.new(link, get_page(link), @requests).parse
  end
  # Keep only truthy results (drops both nil and false)
  kept = parse_results.select { |result| result }

  @reporter.report_results(kept, links.first)
end

#save_result_links(page) ⇒ Object

Save the result links on a page



58
59
60
61
# File 'lib/securityclearedjobscom/security_cleared_jobs_com_crawler.rb', line 58

# Save the result links on a page.
#
# Parses the page with Nokogiri, collects every <a> found under elements
# matching ".lister__header", and prefixes each href with the site URL.
#
# @param page [Object] results page content accepted by Nokogiri::HTML.parse
# @return [Array<String>] listing URLs, one per matching anchor
def save_result_links(page)
  document = Nokogiri::HTML.parse(page)
  anchors = document.css(".lister__header").css("a")

  anchors.map { |anchor| @site_url + anchor['href'] }
end

#set_base_url ⇒ Object

Set the base url for the query



25
26
27
28
29
30
31
# File 'lib/securityclearedjobscom/security_cleared_jobs_com_crawler.rb', line 25

# Set the base url for the query.
#
# Despite the name this only builds and returns the URL; the caller stores
# it. The search path always carries countrycode=GB, and a CGI-escaped
# Keywords parameter is appended when a search term is present.
#
# @return [String] search URL without any page parameter
def set_base_url
  search_url = @site_url + "/searchjobs/?countrycode=GB"
  return search_url if @search_term.nil?

  search_url + "&Keywords=" + CGI.escape(@search_term)
end