Class: RubyScraper

Inherits:
Object
  • Object
show all
Includes:
Capybara::DSL
Defined in:
lib/rubyscraper.rb,
lib/rubyscraper/binary.rb,
lib/rubyscraper/version.rb

Defined Under Namespace

Classes: Binary

Constant Summary collapse

VERSION =
"0.2.0"

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(endpoint, pages = 1) ⇒ RubyScraper

Returns a new instance of RubyScraper.



10
11
12
13
14
15
16
17
18
19
20
21
22
23
# File 'lib/rubyscraper.rb', line 10

# Sets up a Poltergeist-backed Capybara session and loads the scrape
# configuration from the bundled assets/scrapes.json file.
#
# @param endpoint [String] URL that scraped jobs are POSTed to
# @param pages [Integer] number of paginated result pages to walk per term
def initialize(endpoint, pages = 1)
  Capybara.register_driver :poltergeist do |app|
    Capybara::Poltergeist::Driver.new(app, js_errors: false)
  end
  Capybara.default_driver = :poltergeist

  @endpoint      = endpoint
  @pages         = pages
  @jobs          = []
  @scraped_jobs  = 0
  @posted_jobs   = 0
  @scrape_file   = File.expand_path('../assets/scrapes.json', __FILE__)
  @scrape_config = JSON.parse(File.read(@scrape_file))
end

Instance Attribute Details

#endpointObject (readonly)

Returns the value of attribute endpoint.



8
9
10
# File 'lib/rubyscraper.rb', line 8

# Server URL that collected jobs are POSTed to (set in #initialize).
def endpoint
  @endpoint
end

#jobsObject (readonly)

Returns the value of attribute jobs.



8
9
10
# File 'lib/rubyscraper.rb', line 8

# Accumulator of scraped job hashes; emptied by #send_to_server.
def jobs
  @jobs
end

#pagesObject (readonly)

Returns the value of attribute pages.



8
9
10
# File 'lib/rubyscraper.rb', line 8

# Number of paginated result pages to visit per search term.
def pages
  @pages
end

#posted_jobsObject (readonly)

Returns the value of attribute posted_jobs.



8
9
10
# File 'lib/rubyscraper.rb', line 8

# Running count of jobs the server accepted (201 responses).
def posted_jobs
  @posted_jobs
end

#scrape_configObject (readonly)

Returns the value of attribute scrape_config.



8
9
10
# File 'lib/rubyscraper.rb', line 8

# Parsed contents of assets/scrapes.json: an array of per-site configs.
def scrape_config
  @scrape_config
end

#scraped_jobsObject (readonly)

Returns the value of attribute scraped_jobs.



8
9
10
# File 'lib/rubyscraper.rb', line 8

# Running count of jobs scraped across all sites.
def scraped_jobs
  @scraped_jobs
end

Instance Method Details

#get_bodies(site) ⇒ Object



99
100
101
102
103
104
105
# File 'lib/rubyscraper.rb', line 99

# Visits each collected job's detail page and fills in its full body
# fields. Sleeps one second between requests to throttle the scrape.
def get_bodies(site)
  jobs.each.with_index(1) do |job, number|
    sleep 1
    pull_job_data(site, job)
    puts "Job #{number} pulled."
  end
end

#get_data(site) ⇒ Object



43
44
45
46
47
# File 'lib/rubyscraper.rb', line 43

# Full pipeline for one site config: collect summary listings, visit
# each listing for its full body, then POST everything to the server.
def get_data(site)
  get_summaries(site)
  get_bodies(site)
  send_to_server
end

#get_summaries(site) ⇒ Object



49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/rubyscraper.rb', line 49

# Collects job summary hashes from a site's listing page(s) into +jobs+.
#
# When the site config declares search params and pagination, each
# SEARCHTERM is substituted into the summary URL and every results page
# in the configured window is visited; otherwise the single summary URL
# is scraped once. The previously duplicated listing-collection loop is
# extracted into #collect_listings.
def get_summaries(site)
  if site["summary"]["params"].length > 0 && !site["summary"]["no_pagination?"]
    site["summary"]["params"][0]["SEARCHTERM"].each do |term|
      summary_url = "#{site["base_url"]}#{site["summary"]["url"].sub("SEARCHTERM", term)}"
      pagination_start = site["summary"]["pagination_start"].to_i
      pagination_end   = pagination_start + pages - 1
      (pagination_start..pagination_end).each do |page|
        # Page parameter is scaled (e.g. page * 10 for offset-based sites).
        visit "#{summary_url}#{site["summary"]["pagination_fmt"]}#{page * site["summary"]["pagination_scale"].to_i}"
        collect_listings(site)
        puts "Pulled #{site["name"]}: #{term} (page: #{page}) job summaries."
      end
    end
  else
    visit "#{site["base_url"]}#{site["summary"]["url"]}"
    collect_listings(site)
    puts "Pulled #{site["name"]} job summaries."
  end
end

# Scrapes every listing node on the currently visited page into a job
# hash, normalizes its URL, and appends it to +jobs+.
def collect_listings(site)
  all(site["summary"]["loop"]).each do |listing|
    jobs << modify_data(site, pull_summary_data(site, listing))
  end
end

#modify_data(site, job) ⇒ Object



94
95
96
97
# File 'lib/rubyscraper.rb', line 94

# Prefixes the site's base URL onto a relative job URL so every job
# carries an absolute link; absolute URLs are left untouched.
#
# @param site [Hash] site config containing "base_url"
# @param job  [Hash] scraped job data, possibly with a relative "url"
# @return [Hash] the same job hash, with "url" absolutized when needed
def modify_data(site, job)
  url = job["url"]
  # Guard: the URL selector may have matched nothing during summary
  # scraping, leaving "url" nil — the old `job["url"].match` raised
  # NoMethodError in that case.
  return job if url.nil?
  # \A anchors the whole string; ^ would also match after an embedded newline.
  job["url"] = "#{site["base_url"]}#{url}" unless url.match(/\Ahttp/)
  job
end

#pull_job_data(site, job) ⇒ Object



107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# File 'lib/rubyscraper.rb', line 107

# Visits a job's detail page and fills additional fields into the job
# hash, driven by the site's "sub_page" config. Each field entry names a
# CSS path and a Capybara finder method; "all"-method fields collect a
# value from every matching element and join them into one string.
def pull_job_data(site, job)
  visit job["url"]
  site["sub_page"]["fields"].each do |field|
    if field["method"] == "all"
      # Multi-element field: call the configured method (e.g. "text")
      # on each match and join with the configured separator.
      if has_css?(field["path"])
        values = all(field["path"]).map do |elem|
          elem.send(field["loop_collect"])
        end
        job[field["field"]] = values.join(field["join"])
      end
    else
      # Single-element field: dispatch the configured Capybara finder
      # (e.g. :find) and store the element's text.
      if has_css?(field["path"])
        job[field["field"]] = 
          send(field["method"].to_sym,field["path"]).text
      end
    end
  end
end

#pull_summary_data(site, listing) ⇒ Object



77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# File 'lib/rubyscraper.rb', line 77

# Builds a job hash from one listing node on a summary page. Each field
# in the site's summary config names a CSS path, a finder method, and
# optionally an attribute to read instead of the element's text. Fields
# whose path is absent from the listing are skipped.
def pull_summary_data(site, listing)
  job = {}
  site["summary"]["fields"].each do |field|
    next unless listing.has_css?(field["path"])
    node = listing.send(field["method"].to_sym, field["path"])
    job[field["field"]] = field["attr"] ? node[field["attr"]] : node.text
  end
  job
end

#scrape(single_site = nil) ⇒ Object



25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/rubyscraper.rb', line 25

# Runs the scraper.
#
# @param single_site [String, nil] name of one site from the scrape
#   config; when nil, every site not marked "skip" is scraped.
# @return [Array(Integer, Integer)] total scraped and posted job counts
# @raise [RuntimeError] when +single_site+ matches no config entry
def scrape(single_site = nil)
  if single_site
    # BUG FIX: the old code used `select`, whose empty-Array result is
    # still truthy, making the raise branch unreachable — an unknown
    # name crashed inside get_data(nil) instead of raising this error.
    site = scrape_config.find { |s| s["name"] == single_site }
    raise "Invalid single site name #{single_site}. Not in scrape file." unless site
    get_data(site)
  else
    scrape_config.each do |site|
      get_data(site) unless site["skip"] == "true"
    end
  end
  return scraped_jobs, posted_jobs
end

#send_to_serverObject



126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# File 'lib/rubyscraper.rb', line 126

# POSTs every collected job to the configured endpoint, then empties the
# queue. Only a fixed subset of fields is sent (position, location,
# description, and the job URL as "source").
# NOTE(review): passing a block to RestClient.post makes RestClient yield
# the response instead of raising on non-2xx codes — confirm that 302
# really signals "duplicate" on the receiving server.
def send_to_server
  @scraped_jobs += jobs.length
  jobs.each do |job|
    new_job = {
      position: job["position"],
      location: job["location"],
      description: job["description"],
      source: job["url"]
    }

    RestClient.post(endpoint, job: new_job){ |response, request, result, &block|
      case response.code
      when 201
        @posted_jobs += 1
        puts "Job saved."
      when 302
        puts "Job already exists."
      else
        puts "Bad request."
      end
    }
  end
  # Reset the queue so the next site's scrape starts clean.
  @jobs = []
end