Class: LinkedinCrawler

Inherits:
Object
show all
Defined in:
lib/linkedincrawler.rb

Instance Method Summary collapse

Constructor Details

#initialize(search_terms, retry_limit, requests, requests_google, solver_details) ⇒ LinkedinCrawler

Returns a new instance of LinkedinCrawler.



9
10
11
12
13
14
15
16
17
18
19
# File 'lib/linkedincrawler.rb', line 9

# Set up a crawler run.
#
# @param search_terms    [Object] terms passed through to the Google scrape
# @param retry_limit     [Integer] max consecutive retries per profile page
# @param requests        [Object] browser/request helper for LinkedIn pages
# @param requests_google [Object] request helper used for the Google search
# @param solver_details  [Object] captcha-solver credentials/config
def initialize(search_terms, retry_limit, requests, requests_google, solver_details)
  @search_terms = search_terms
  @output = []            # parsed results accumulate here; literal [] over Array.new

  @retry_limit = retry_limit
  @retry_count = 0        # consecutive failures on the current profile

  @requests = requests
  @requests_google = requests_google
  @solver_details = solver_details
end

Instance Method Details

#check_right_page(profile_url) ⇒ Object

Check that it is actually a LinkedIn profile page



39
40
41
42
43
44
45
# File 'lib/linkedincrawler.rb', line 39

# Decide whether a URL is worth scraping: it must look like a real LinkedIn
# profile (not a Google page, a pub directory, or a search listing) and we
# must still have retry budget left.
#
# @param profile_url [String] candidate URL from the search results
# @return [Boolean] true when the URL should be scraped
def check_right_page(profile_url)
  blocked_fragments = ["www.google", "linkedin.com/pub/dir", "/search"]

  @retry_count < @retry_limit &&
    profile_url.include?(".linkedin.") &&
    blocked_fragments.none? { |fragment| profile_url.include?(fragment) }
end

#gen_json ⇒ Object

Print output in JSON



70
71
72
# File 'lib/linkedincrawler.rb', line 70

# Render everything collected so far as pretty-printed JSON.
#
# @return [String] JSON document built from @output
def gen_json
  pretty = JSON.pretty_generate(@output)
  pretty
end

#scrape(profile_url) ⇒ Object

Scrape each page



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/linkedincrawler.rb', line 48

# Fetch one profile page, parse it, and append the parsed jobs to @output.
#
# On failure the bare rescue retries recursively with a restarted browser
# (proxy rotation), bounded by @retry_limit via check_right_page; the counter
# is reset to 0 on success or when retries are exhausted.
#
# NOTE(review): bare rescue catches every StandardError, so parser bugs are
# indistinguishable from proxy failures — consider rescuing specific classes.
# NOTE(review): assumes LinkedinParser#results_by_job returns a JSON array
# string — confirm against the parser.
#
# @param profile_url [String] LinkedIn profile URL to scrape
def scrape(profile_url)
  # Get profile page
  profile_html = @requests.get_page(profile_url)

  # Parse profile and add to output
  begin
    l = LinkedinParser.new(profile_html, profile_url, {timestamp: Time.now, search_terms: @search_terms})
    @output += JSON.parse(l.results_by_job)
    @retry_count = 0
  rescue
    # If proxy doesn't work, try another a few times
    if check_right_page(profile_url)
      @requests.restart_browser
      @retry_count += 1
      scrape(profile_url)
    else
      @retry_count = 0
    end
  end
end

#search ⇒ Object

Run search terms and get results



22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# File 'lib/linkedincrawler.rb', line 22

# Run the Google search for LinkedIn profile pages matching @search_terms,
# then scrape every result that passes check_right_page.
#
# The site: query targets linkedin.com/pub profiles while excluding the
# /pub/dir directory listings.
#
# NOTE(review): assumes GeneralScraper#getURLs returns a JSON array string
# of URLs — confirm against the scraper.
def search
  # Run Google search
  g = GeneralScraper.new("site:linkedin.com/pub -site:linkedin.com/pub/dir/", @search_terms, @requests_google, @solver_details)
  urls = g.getURLs

  # Scrape each resulting LinkedIn page
  JSON.parse(urls).each do |profile|
    if check_right_page(profile)
      scrape(profile)
    end
  end

  # Close all the browsers when done
  @requests.close_all_browsers
end