Class: RubyScraper
- Inherits:
-
Object
- Object
- RubyScraper
- Includes:
- Capybara::DSL
- Defined in:
- lib/rubyscraper.rb,
lib/rubyscraper/binary.rb,
lib/rubyscraper/version.rb
Defined Under Namespace
Classes: Binary
Constant Summary collapse
- VERSION =
"0.3.0"
Instance Attribute Summary collapse
-
#endpoint ⇒ Object
readonly
Returns the value of attribute endpoint.
-
#jobs ⇒ Object
readonly
Returns the value of attribute jobs.
-
#pages ⇒ Object
readonly
Returns the value of attribute pages.
-
#posted_jobs ⇒ Object
readonly
Returns the value of attribute posted_jobs.
-
#scrape_config ⇒ Object
readonly
Returns the value of attribute scrape_config.
-
#scraped_jobs ⇒ Object
readonly
Returns the value of attribute scraped_jobs.
Instance Method Summary collapse
- #get_bodies(site) ⇒ Object
- #get_data(site) ⇒ Object
- #get_summaries(site) ⇒ Object
-
#initialize(endpoint, pages = 1) ⇒ RubyScraper
constructor
A new instance of RubyScraper.
- #modify_data(site, job) ⇒ Object
- #pull_job_data(site, job) ⇒ Object
- #pull_summary_data(site, listing) ⇒ Object
- #scrape(single_site = nil) ⇒ Object
- #send_to_server ⇒ Object
Constructor Details
#initialize(endpoint, pages = 1) ⇒ RubyScraper
Returns a new instance of RubyScraper.
10 11 12 13 14 15 16 17 18 19 20 21 22 23 |
# File 'lib/rubyscraper.rb', line 10 def initialize(endpoint, pages=1) Capybara.register_driver :poltergeist do |app| Capybara::Poltergeist::Driver.new(app, js_errors: false) end Capybara.default_driver = :poltergeist @jobs = [] @scraped_jobs = 0 @posted_jobs = 0 @pages = pages @endpoint = endpoint @scrape_file = File.expand_path('../assets/scrapes.json', __FILE__) @scrape_config = JSON.parse(File.read(@scrape_file)) end |
Instance Attribute Details
#endpoint ⇒ Object (readonly)
Returns the value of attribute endpoint.
8 9 10 |
# File 'lib/rubyscraper.rb', line 8 def endpoint @endpoint end |
#jobs ⇒ Object (readonly)
Returns the value of attribute jobs.
8 9 10 |
# File 'lib/rubyscraper.rb', line 8 def jobs @jobs end |
#pages ⇒ Object (readonly)
Returns the value of attribute pages.
8 9 10 |
# File 'lib/rubyscraper.rb', line 8 def pages @pages end |
#posted_jobs ⇒ Object (readonly)
Returns the value of attribute posted_jobs.
8 9 10 |
# File 'lib/rubyscraper.rb', line 8 def posted_jobs @posted_jobs end |
#scrape_config ⇒ Object (readonly)
Returns the value of attribute scrape_config.
8 9 10 |
# File 'lib/rubyscraper.rb', line 8 def scrape_config @scrape_config end |
#scraped_jobs ⇒ Object (readonly)
Returns the value of attribute scraped_jobs.
8 9 10 |
# File 'lib/rubyscraper.rb', line 8 def scraped_jobs @scraped_jobs end |
Instance Method Details
#get_bodies(site) ⇒ Object
99 100 101 102 103 104 105 |
# File 'lib/rubyscraper.rb', line 99 def get_bodies(site) jobs.each_with_index do |job, i| sleep 1 pull_job_data(site, job) puts "Job #{i+1} pulled." end end |
#get_data(site) ⇒ Object
43 44 45 46 47 |
# File 'lib/rubyscraper.rb', line 43 def get_data(site) get_summaries(site) get_bodies(site) send_to_server end |
#get_summaries(site) ⇒ Object
49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
# File 'lib/rubyscraper.rb', line 49 def get_summaries(site) if site["summary"]["params"].length > 0 && !site["summary"]["no_pagination?"] site["summary"]["params"][0]["SEARCHTERM"].each do |term| summary_url = "#{site["base_url"]}#{site["summary"]["url"].sub("SEARCHTERM", term)}" pagination_start = site["summary"]["pagination_start"].to_i pagination_end = pagination_start + pages - 1 (pagination_start..pagination_end).to_a.each do |page| visit "#{summary_url}#{site["summary"]["pagination_fmt"]}#{page * site["summary"]["pagination_scale"].to_i}" all(site["summary"]["loop"]).each do |listing| job = pull_summary_data(site, listing) job = modify_data(site, job) jobs << job end puts "Pulled #{site["name"]}: #{term} (page: #{page}) job summaries." end end else summary_url = "#{site["base_url"]}#{site["summary"]["url"]}" visit summary_url all(site["summary"]["loop"]).each do |listing| job = pull_summary_data(site, listing) job = modify_data(site, job) jobs << job end puts "Pulled #{site["name"]} job summaries." end end |
#modify_data(site, job) ⇒ Object
94 95 96 97 |
# File 'lib/rubyscraper.rb', line 94 def modify_data(site, job) job["url"] = "#{site["base_url"]}#{job["url"]}" unless job["url"].match(/^http/) job end |
#pull_job_data(site, job) ⇒ Object
107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
# File 'lib/rubyscraper.rb', line 107 def pull_job_data(site, job) visit job["url"] site["sub_page"]["fields"].each do |field| if field["method"] == "all" if has_css?(field["path"]) values = all(field["path"]).map do |elem| elem.send(field["loop_collect"]) end job[field["field"]] = values.join(field["join"]) end else if has_css?(field["path"]) job[field["field"]] = send(field["method"].to_sym,field["path"]).text end end end end |
#pull_summary_data(site, listing) ⇒ Object
77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
# File 'lib/rubyscraper.rb', line 77 def pull_summary_data(site, listing) job = Hash.new site["summary"]["fields"].each do |field| if field["attr"] if listing.has_css?(field["path"]) job[field["field"]] = listing.send(field["method"].to_sym, field["path"])[field["attr"]] end else if listing.has_css?(field["path"]) job[field["field"]] = listing.send(field["method"].to_sym, field["path"]).text end end end; job end |
#scrape(single_site = nil) ⇒ Object
25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
# File 'lib/rubyscraper.rb', line 25 def scrape(single_site=nil) if single_site search_site = scrape_config.select { |site| site["name"] == single_site } if search_site get_data(search_site.first) else raise "Invalid single site name #{single_site}. Not in scrape file." end else scrape_config.each do |site| unless site["skip"] == "true" get_data(site) end end end return scraped_jobs, posted_jobs end |
#send_to_server ⇒ Object
126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 |
# File 'lib/rubyscraper.rb', line 126 def send_to_server @scraped_jobs += jobs.length jobs.each do |job| tags = job["tags"] || "" new_job = { position: job["position"], location: job["location"], description: job["description"], source: job["url"], company: job["company"], tags: tags.split(", ") } RestClient.post(endpoint, job: new_job){ |response, request, result, &block| case response.code when 201 @posted_jobs += 1 puts "Job saved." when 302 puts "Job already exists." else puts "Bad request." end } end @jobs = [] end |