Class: LinkedinCrawler

Inherits:
Object
show all
Defined in:
lib/linkedincrawler.rb

Instance Method Summary collapse

Constructor Details

#initialize(search_terms, retry_limit, requests, requests_google, solver_details) ⇒ LinkedinCrawler

Returns a new instance of LinkedinCrawler.



9
10
11
12
13
14
15
16
17
18
19
# File 'lib/linkedincrawler.rb', line 9

# Set up a crawler run.
#
# @param search_terms    [Object] terms passed through to the Google scrape
# @param retry_limit     [Integer] max consecutive retries per profile page
# @param requests        [Object] browser/request helper for LinkedIn pages
# @param requests_google [Object] request helper used for the Google search
# @param solver_details  [Object] captcha-solver credentials/config
def initialize(search_terms, retry_limit, requests, requests_google, solver_details)
  @search_terms = search_terms
  @output = []            # parsed results accumulate here; literal [] over Array.new

  @retry_limit = retry_limit
  @retry_count = 0        # consecutive failures on the current profile

  @requests = requests
  @requests_google = requests_google
  @solver_details = solver_details
end

Instance Method Details

#check_right_page(profile_url) ⇒ Object

Check that it is actually a LinkedIn profile page



39
40
41
42
43
44
45
# File 'lib/linkedincrawler.rb', line 39

# Decide whether a URL is worth scraping: it must look like a real LinkedIn
# profile (not a Google page, a pub directory, or a search listing) and we
# must still have retry budget left.
#
# @param profile_url [String] candidate URL from the search results
# @return [Boolean] true when the URL should be scraped
def check_right_page(profile_url)
  blocked_fragments = ["www.google", "linkedin.com/pub/dir", "/search"]

  @retry_count < @retry_limit &&
    profile_url.include?(".linkedin.") &&
    blocked_fragments.none? { |fragment| profile_url.include?(fragment) }
end

#gen_json ⇒ Object

Print output in JSON



70
71
72
# File 'lib/linkedincrawler.rb', line 70

# Render everything collected so far as pretty-printed JSON.
#
# @return [String] JSON document built from @output
def gen_json
  pretty = JSON.pretty_generate(@output)
  pretty
end

#scrape(profile_url) ⇒ Object

Scrape each page



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/linkedincrawler.rb', line 48

# Fetch one profile page, parse it, and append the parsed jobs to @output.
#
# On failure the bare rescue retries recursively with a restarted browser
# (proxy rotation), bounded by @retry_limit via check_right_page; the counter
# is reset to 0 on success or when retries are exhausted.
#
# NOTE(review): bare rescue catches every StandardError, so parser bugs are
# indistinguishable from proxy failures — consider rescuing specific classes.
# NOTE(review): assumes LinkedinParser#results_by_job returns a JSON array
# string — confirm against the parser.
#
# @param profile_url [String] LinkedIn profile URL to scrape
def scrape(profile_url)
  # Get profile page
  profile_html = @requests.get_page(profile_url)

  # Parse profile and add to output
  begin
    l = LinkedinParser.new(profile_html, profile_url, {timestamp: Time.now, search_terms: @search_terms})
    @output += JSON.parse(l.results_by_job)
    @retry_count = 0
  rescue
    # If proxy doesn't work, try another a few times
    if check_right_page(profile_url)
      @requests.restart_browser
      @retry_count += 1
      scrape(profile_url)
    else
      @retry_count = 0
    end
  end
end

#search ⇒ Object

Run search terms and get results



22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# File 'lib/linkedincrawler.rb', line 22

# Run the Google search for LinkedIn profile pages matching @search_terms,
# then scrape every result that passes check_right_page.
#
# The site: query targets linkedin.com/pub profiles while excluding the
# /pub/dir directory listings.
#
# NOTE(review): assumes GeneralScraper#getURLs returns a JSON array string
# of URLs — confirm against the scraper.
def search
  # Run Google search
  g = GeneralScraper.new("site:linkedin.com/pub -site:linkedin.com/pub/dir/", @search_terms, @requests_google, @solver_details)
  urls = g.getURLs

  # Scrape each resulting LinkedIn page
  JSON.parse(urls).each do |profile|
    if check_right_page(profile)
      scrape(profile)
    end
  end

  # Close all the browsers when done
  @requests.close_all_browsers
end