Class: LinkedinCrawler
- Inherits:
-
Object
- Object
- LinkedinCrawler
- Defined in:
- lib/linkedincrawler.rb
Instance Method Summary
-
#check_right_page(profile_url) ⇒ Object
Check that it is actually a LinkedIn profile page.
-
#gen_json ⇒ Object
Print output in JSON.
-
#initialize(search_terms, retry_limit, requests, requests_google, solver_details) ⇒ LinkedinCrawler
constructor
A new instance of LinkedinCrawler.
-
#scrape(profile_url) ⇒ Object
Scrape each page.
-
#search ⇒ Object
Run search terms and get results.
Constructor Details
#initialize(search_terms, retry_limit, requests, requests_google, solver_details) ⇒ LinkedinCrawler
Returns a new instance of LinkedinCrawler.
9 10 11 12 13 14 15 16 17 18 19 |
# File 'lib/linkedincrawler.rb', line 9
# Build a crawler for one set of search terms. Starts with an empty result
# list and a retry counter of zero.
#
# @param search_terms terms handed to the Google search and recorded with
#   every parsed profile
# @param retry_limit upper bound the retry counter is compared against in
#   check_right_page
# @param requests object used to fetch profile pages (get_page), restart the
#   browser (restart_browser) and close browsers (close_all_browsers)
# @param requests_google passed through to GeneralScraper for the Google search
# @param solver_details passed through to GeneralScraper — presumably
#   captcha-solver settings; confirm against GeneralScraper
def initialize(search_terms, retry_limit, requests, requests_google, solver_details)
  @search_terms = search_terms
  @output = []
  @retry_limit = retry_limit
  @retry_count = 0
  @requests = requests
  @requests_google = requests_google
  @solver_details = solver_details
end
Instance Method Details
#check_right_page(profile_url) ⇒ Object
Check that it is actually a LinkedIn profile page
39 40 41 42 43 44 45 |
# File 'lib/linkedincrawler.rb', line 39
# Check that it is actually a LinkedIn profile page.
#
# The URL must contain ".linkedin." and must not contain any of the excluded
# fragments (Google links, LinkedIn directory listings, search pages). The
# retry counter must also still be below the retry limit, so the same
# predicate doubles as the "may we try again?" gate used by scrape.
#
# @param profile_url [String, nil] candidate URL
# @return [Boolean] true when the URL looks like a profile page and retries
#   are not exhausted; false otherwise (including for a nil URL)
def check_right_page(profile_url)
  # A nil entry (e.g. a JSON null in the URL list) cannot be a profile page;
  # answer false rather than raising NoMethodError.
  return false if profile_url.nil?

  excluded = ["www.google", "linkedin.com/pub/dir", "/search"]
  linkedin_profile = profile_url.include?(".linkedin.") &&
                     excluded.none? { |fragment| profile_url.include?(fragment) }

  # The counter comparison is only evaluated for URLs that passed the checks above.
  linkedin_profile && @retry_count < @retry_limit
end
#gen_json ⇒ Object
Print output in JSON
70 71 72 |
# File 'lib/linkedincrawler.rb', line 70
# Serialize the accumulated output as pretty-printed JSON. The string is
# returned; nothing is written to stdout.
#
# @return [String] pretty-printed JSON for the contents of @output
def gen_json
  JSON.pretty_generate(@output)
end
#scrape(profile_url) ⇒ Object
Scrape each page
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
# File 'lib/linkedincrawler.rb', line 48
# Scrape one profile page and append its parsed results to @output.
#
# The page is fetched and parsed. When parsing raises, the browser is
# restarted and the page is fetched again, for as long as check_right_page
# still accepts the URL (which includes the retry-limit check). Errors raised
# by the fetch itself are not rescued and propagate to the caller.
#
# @param profile_url [String] URL of the profile to scrape
# @return [Integer] 0, the reset retry counter
def scrape(profile_url)
  begin
    # Iterate instead of recursing so the retry depth never grows the stack.
    loop do
      # Get profile page
      profile_html = @requests.get_page(profile_url)

      # Parse profile and add to output
      begin
        parser = LinkedinParser.new(profile_html, profile_url,
                                    { timestamp: Time.now, search_terms: @search_terms })
        @output += JSON.parse(parser.results_by_job)
        break
      rescue StandardError
        # If proxy doesn't work, try another a few times
        break unless check_right_page(profile_url)

        @requests.restart_browser
        @retry_count += 1
      end
    end
  ensure
    # Always reset, even when get_page or restart_browser raises mid-retry;
    # a stale counter would make check_right_page reject later URLs.
    @retry_count = 0
  end

  @retry_count
end
#search ⇒ Object
Run search terms and get results
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
# File 'lib/linkedincrawler.rb', line 22
# Run search terms and get results.
#
# Runs a Google search restricted to LinkedIn public profiles through
# GeneralScraper, then scrapes every returned URL that check_right_page
# accepts. The browsers held by @requests are closed when scraping finishes,
# and also when scraping raises, so a failure does not leave them open.
#
# @return the result of @requests.close_all_browsers
def search
  # Run Google search
  scraper = GeneralScraper.new("site:linkedin.com/pub -site:linkedin.com/pub/dir/",
                               @search_terms, @requests_google, @solver_details)
  profile_urls = JSON.parse(scraper.getURLs)

  # Scrape each resulting LinkedIn page
  finished = false
  begin
    profile_urls.each do |profile|
      scrape(profile) if check_right_page(profile)
    end
    finished = true
  ensure
    # Error path only: release the browsers before the exception propagates.
    @requests.close_all_browsers unless finished
  end

  # Close all the browsers when done; this call's result stays the return value.
  @requests.close_all_browsers
end