Class: GoogleScraper

Inherits:
Object
  • Object
show all
Defined in:
lib/google_scraper_gem.rb

Constant Summary collapse

BASE_URL =
"http://www.google.com"
SEARCH =
"/search?q="
CSS_SELECTOR =
"li[@class='g'] > div.rc > h3.r > a"
RANK_LIMIT =
100

Stop looking for the target URL after 100 results.

Instance Method Summary collapse

Constructor Details

#initialize ⇒ GoogleScraper

Returns a new instance of GoogleScraper.



11
12
13
14
15
16
17
18
19
20
# File 'lib/google_scraper_gem.rb', line 11

# Builds the scraper around a Mechanize agent that identifies itself as
# Mac Safari — the CSS selectors used by the search methods depend on the
# markup Google serves to that user agent.
def initialize
  @mech = Mechanize.new do |agent|
    # Available aliases: http://github.com/tenderlove/mechanize/blob/master/lib/mechanize.rb
    agent.user_agent_alias = 'Mac Safari'
  end
  # NOTE(review): proxy support was stubbed out here; re-enable with
  # @mech.set_proxy(host, port, username, password) if scraping is blocked.
  # (Avoid committing real credentials into source as the old comment did.)
end

Instance Method Details

#checkRank(keyword, url, locale = 'us', language = 'en') ⇒ Object

Returns Rank of URL for keyword in google.com for the specified locale and language.

Returns:

  • Rank of URL for keyword in google.com for the specified locale and language



23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# File 'lib/google_scraper_gem.rb', line 23

# Finds the ranking position of +url+ among Google results for +keyword+.
#
# @param keyword [String] search query
# @param url [String] scheme-less domain to look for, e.g. "example.com"
# @param locale [String] Google "gl" country code
# @param language [String] Google "lr" language code
# @return [Integer] 1-based rank of the URL, or -1 if not found within
#   RANK_LIMIT results or when Google offers no further results page
def checkRank(keyword, url, locale = 'us', language = 'en')
  rank_count = 0
  page_num = 1

  # URI.encode was deprecated in 2.7 and removed in Ruby 3.0;
  # encode_www_form_component is the supported query-string escaper.
  query = URI.encode_www_form_component(keyword)
  uri = BASE_URL + SEARCH + query + "&gl=#{locale}&lr=lang_#{language}"

  page = @mech.get(uri)
  while rank_count < RANK_LIMIT
    # This parse definition requires the Mac Safari user agent.
    page.parser.css(CSS_SELECTOR).each do |cite|
      rank_count += 1

      result = cite.attr('href')
      # Strip the scheme so both http and https hits match +url+.
      result = result.sub(%r{\Ahttps?://}, '')

      return rank_count if result.start_with?(url) || result.start_with?("www." + url)
    end

    # Get next search result page.
    page_num += 1

    # Add random sleep to prevent blocking of IP.
    # sleep(rand(3..9))

    # Google may not render a link for the next page (end of results or a
    # captcha interstitial); previously this crashed with NoMethodError on nil.
    next_link = page.link_with(:text => page_num.to_s)
    return -1 unless next_link

    page = next_link.click
  end

  -1
end

#getTopResults(keyword, extension = '.com', top = 10) ⇒ Object

Returns Array of domains in the order specified by Google.

Returns:

  • Array of domains in the order specified by Google.



60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# File 'lib/google_scraper_gem.rb', line 60

# Collects the top organic result URLs for +keyword+ on the Google TLD
# selected by +extension+.
#
# @param keyword [String] search query
# @param extension [String] Google domain extension, e.g. '.com', '.co.uk'
# @param top [Integer] maximum number of results to collect
# @return [Array<String>] absolute result URLs in Google's order; may be
#   shorter than +top+ if no further results page exists
def getTopResults(keyword, extension = '.com', top = 10)
  results = []

  rank_count = 0
  page_num = 1

  # URI.encode was deprecated in 2.7 and removed in Ruby 3.0;
  # encode_www_form_component is the supported query-string escaper.
  uri = 'http://www.google' + extension + SEARCH + URI.encode_www_form_component(keyword)

  page = @mech.get(uri)
  while rank_count < top
    # This parse definition requires the Mac Safari user agent.
    page.parser.css(CSS_SELECTOR).each do |cite|
      rank_count += 1

      result = cite.attr('href')
      # puts result
      # Keep only absolute links; relative hrefs are Google navigation chrome.
      results << result if result.start_with? 'http'
    end

    return results if rank_count >= top

    # Each result page should yield exactly ten hits; anything else usually
    # means the selector broke or Google served an unexpected page.
    unless rank_count % 10 == 0
      puts "WARNING: There were #{rank_count.to_s} results instead of 10 near page #{page_num} for '#{keyword}'."
    end

    # Get next search result page.
    page_num += 1

    # Add random sleep to prevent blocking of IP.
    # TODO: Replace this with proxy swap.
    # rand(8)

    # Google may not render a link for the next page; previously this
    # crashed with NoMethodError on nil. Return what we have so far.
    next_link = page.link_with(:text => page_num.to_s)
    break unless next_link

    page = next_link.click
  end

  results
end