Class: GoogleClient

Inherits:
BaseApiClient show all
Defined in:
lib/whos_using_what/api_clients/google_client.rb

Instance Attribute Summary collapse

Attributes inherited from Base

#set_paths

Instance Method Summary collapse

Methods inherited from BaseApiClient

arraySearch, arry_to_str_delim, cleanup_url, #determineIfUsesTechnology, prepare_params_from_map_helper, starts_with?

Methods inherited from Base

set_paths

Constructor Details

#initialize ⇒ GoogleClient

Returns a new instance of GoogleClient.



10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# File 'lib/whos_using_what/api_clients/google_client.rb', line 10

# Sets up the client's matching tables and the browser objects it drives.
#
# State assigned here:
# * @results                  - empty Hash, exposed through #results
# * @technologiesToSearchFor  - keywords iterated by #perform_search
# * @positiveMatchUrlPatterns - prefixes #extractUrls accepts at the start of a URL
# * @negativeMatchUrlPatterns - substrings that make #extractUrls reject a URL
# * @jobPageTokens            - job-page tokens; not read by the methods shown in this file section
# * @mechanize                - Mechanize agent; not read by the methods shown in this file section
# * @browser                  - Watir Firefox browser used by #perform_search
#
# A Headless instance is started before the browser is launched, presumably so that
# Firefox can run without a physical display (confirm against the Headless gem docs).
def initialize
  @results = {}

  @technologiesToSearchFor = %w[ruby java javascript python]
  @jobPageTokens = %w[job hiring career]

  @positiveMatchUrlPatterns = %w[http www]
  @negativeMatchUrlPatterns = %w[google youtube duckduckgo bing yahoo]

  @mechanize = Mechanize.new

  # The Headless object is not kept; only its side effect of being started is needed here.
  Headless.new.start
  @browser = Watir::Browser.new(:firefox)
end

Instance Attribute Details

#results ⇒ Object (readonly)

Returns the value of attribute results.



8
9
10
# File 'lib/whos_using_what/api_clients/google_client.rb', line 8

# Reader for the @results instance variable.
#
# @return [Hash] whatever is currently stored in @results (the constructor assigns an empty Hash)
def results
  @results
end

Instance Method Details

#extractUrls(rawInput, mustContainUrl) ⇒ Object



32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# File 'lib/whos_using_what/api_clients/google_client.rb', line 32

# Pulls URLs out of a blob of text and keeps only those that look like pages of the
# wanted site.
#
# A URL (after BaseApiClient.cleanup_url) is kept when all of the following hold:
# * it starts with one of @positiveMatchUrlPatterns or with the cleaned mustContainUrl
# * it contains the cleaned mustContainUrl
# * it contains none of @negativeMatchUrlPatterns
# * it has not been collected already
#
# NOTE(review): URI.extract is marked obsolete in Ruby's standard library documentation.
#
# @param rawInput [String, nil] text (e.g. page HTML) to scan; nil yields an empty list
# @param mustContainUrl [String] site URL every accepted URL has to contain
# @return [Array<String>] accepted URLs, cleaned, without duplicates, in order of appearance
def extractUrls(rawInput, mustContainUrl)
  accepted = []
  return accepted if rawInput.nil?

  candidates = URI.extract(rawInput)
  return accepted if candidates.empty?

  required = BaseApiClient.cleanup_url(mustContainUrl)

  candidates.each do |candidate|
    cleaned = BaseApiClient.cleanup_url(candidate)

    has_accepted_prefix = @positiveMatchUrlPatterns.any? do |prefix|
      BaseApiClient.starts_with?(cleaned, prefix) ||
        BaseApiClient.starts_with?(cleaned, required)
    end
    next unless has_accepted_prefix

    on_site = cleaned.include?(required)
    keep = on_site && @negativeMatchUrlPatterns.none? { |blocked| cleaned.include?(blocked) }

    # The original flow cleans the URL a second time before storing it; kept as is.
    cleaned = BaseApiClient.cleanup_url(cleaned)

    accepted << cleaned if keep && !cleaned.nil? && !accepted.include?(cleaned)
  end

  accepted
end

#perform_search(queries, site_url, search_query_url_generator_closure) ⇒ Object

Performs a search-engine search restricted to a company’s website, then attempts to determine whether the company has job listings for each technology of interest. When a matching listing is found, its URL is returned in the result map, keyed by the technology.



95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# File 'lib/whos_using_what/api_clients/google_client.rb', line 95

# Searches a company's site for pages that mention each technology in
# @technologiesToSearchFor.
#
# For every technology keyword a search URL is built with the supplied callable and
# loaded in @browser. Candidate links are taken from the returned HTML via extractUrls,
# each candidate is visited, and determineIfUsesTechnology decides whether the page counts
# as a hit. Failures while loading a page are printed and skipped (best effort).
#
# After all keywords are processed the method sleeps for 1 to 5 seconds.
# NOTE(review): the throttle runs once per call, not once per search query — confirm that
# this is the intended granularity.
#
# @param queries [Object] not used by this method; kept so existing callers keep working
# @param site_url [String] site the search is restricted to; handed to the URL generator and to extractUrls
# @param search_query_url_generator_closure [#call] invoked as call(site_url, keyword); should return the search URL
# @return [Hash] technology keyword => URL of a page judged to use it; if several pages match, the last one visited wins
def perform_search(queries, site_url, search_query_url_generator_closure)
  ret_map = {}

  @technologiesToSearchFor.each do |search_keyword|
    url = ''
    raw_html = ''

    begin
      url = search_query_url_generator_closure.call(site_url, search_keyword)

      # perform initial search engine search
      @browser.goto url
      raw_html = @browser.html

      puts "successfully queried url:#{url}"
    rescue StandardError => e
      # StandardError (not Exception) so Interrupt/SystemExit still stop the crawl.
      # Interpolation instead of String#<< so a nil url cannot raise inside this handler.
      puts "exception:#{e.message} when querying url: #{url}"
    end

    # raw_html is still '' when the search failed, which yields no candidate URLs.
    extractUrls(raw_html, site_url).each do |cur_url|
      begin
        @browser.goto cur_url
        html = @browser.html

        # strip all html tags, for human readability and to cut down on some errors that could arise
        # TODO this was causing an exception (possibly because gsub! returns nil when nothing is
        # replaced, and because the /s regexp flag selects an encoding in Ruby — to be confirmed)
        #  html = html.gsub!(/(<[^>]*>)|\n|\t/s) { " " }

        ret_map[search_keyword] = cur_url if determineIfUsesTechnology(search_keyword, html)
      rescue StandardError => e
        puts e.message
      end
    end
  end

  # throttle queries to avoid being black-listed by search engine
  # (rand(1-5) evaluated to rand(-4), i.e. 0..3 seconds; a Range gives the intended 1..5)
  sleep rand(1..5)

  ret_map
end