Class: GScraper::Search::WebQuery

Inherits:
Query
  • Object
show all
Includes:
HasPages
Defined in:
lib/gscraper/search/web_query.rb

Constant Summary collapse

PATH =

Web Search path

'/search'
RESULTS_PER_PAGE =

Default results per-page

10
LICENSES =

Web Search licenses

{
  '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)' => Licenses::CC_BY_NC_ND,
  '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)' => Licenses::CC_BY_SA,
  '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)' => Licenses::CC_BY_NC,
  '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)' => Licenses::CC_BY
}

Constants inherited from Query

Query::DEFAULT_HOST, Query::SUB_DOMAIN

Instance Attribute Summary collapse

Attributes inherited from Query

#allintext, #allintitle, #allinurl, #define, #exact_phrase, #filetype, #info, #intext, #intitle, #inurl, #language, #link, #numeric_range, #query, #related, #search_host, #site, #with_words, #without_words

Class Method Summary collapse

Instance Method Summary collapse

Methods included from HasPages

#[], #each, #each_on_page, #each_on_pages, #each_page, #first_page, #page_cache, #page_index_of, #pages, #result_index_of, #result_offset_of

Methods inherited from Query

#expression, #format_modifier, #format_options

Constructor Details

#initialize(options = {}) {|query| ... } ⇒ WebQuery

Creates a new Web query.

Examples:

WebQuery.new(:query => 'ruby', :with_words => 'sow rspec')
WebQuery.new(:exact_phrase => 'fluent interfaces') do |q|
  q.within_past_week = true
end

Parameters:

  • options (Hash) (defaults to: {})

    Additional options.

Options Hash (options):

  • :search_host (String) — default: www.google.com

    The host to submit queries to.

  • :results_per_page (Integer)

    Specifies the number of results for each page.

  • :language (String, Symbol) — default: Languages.native

    Search for results in the specified language.

  • :region (String)

    Search for results from the specified region.

  • :within_past_day (Boolean)

    Search for results that were created within the past day.

  • :within_past_week (Boolean)

    Search for results that were created within the past week.

  • :within_past_months (Integer)

    Search for results that were created within the past number of months (1, 2, 3 or 6).

  • :within_past_year (Boolean)

    Search for results that were created within the past year.

  • :occurs_within (:title, :body, :url, :links)

    Searches for results where the keywords occur within a specific part of the result page.

  • :rights (Symbol)

    Search for results licensed under the specified license.

  • :filtered (Boolean)

    Specifies whether or not to use SafeSearch.

Yields:

  • (query)

    If a block is given, it will be passed the new Web query.

Yield Parameters:

  • query (WebQuery)

    The new Web query.



149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
# File 'lib/gscraper/search/web_query.rb', line 149

# Creates a new Web query.
#
# At most one of the "within past ..." time windows is enabled. When several
# are given, the first truthy one wins in the order day, week, months, year;
# every other window is stored as +false+.
#
# @param [Hash] options
#   Additional options. The same hash is also handed to
#   GScraper.web_agent and to the superclass constructor.
#
# @yield [query]
#   The block, if given, is forwarded to the superclass constructor.
def initialize(options={},&block)
  @agent            = GScraper.web_agent(options)
  @results_per_page = options.fetch(:results_per_page,RESULTS_PER_PAGE)
  @region           = options[:region]

  # Start with every time window disabled ...
  @within_past_day    = false
  @within_past_week   = false
  @within_past_months = false
  @within_past_year   = false

  # ... then enable only the highest-priority window that was requested.
  if (past_day = options[:within_past_day])
    @within_past_day = past_day
  elsif (past_week = options[:within_past_week])
    @within_past_week = past_week
  elsif (past_months = options[:within_past_months])
    @within_past_months = past_months
  elsif (past_year = options[:within_past_year])
    @within_past_year = past_year
  end

  @occurs_within = options[:occurs_within]
  @rights        = options[:rights]
  @filtered      = options[:filtered]

  super(options,&block)
end

Instance Attribute Details

#filteredObject

Filter the search results



90
91
92
# File 'lib/gscraper/search/web_query.rb', line 90

# Reader for the SafeSearch filtering flag.
#
# @return [Object] the value stored in @filtered
def filtered
  return @filtered
end

#in_formatObject

Search for results in the format



60
61
62
# File 'lib/gscraper/search/web_query.rb', line 60

# Reader for the format that results should be in.
#
# @return [Object] the value stored in @in_format
def in_format
  return @in_format
end

#inside_domainObject

Search for results inside the domain



81
82
83
# File 'lib/gscraper/search/web_query.rb', line 81

# Reader for the domain that results should come from.
#
# @return [Object] the value stored in @inside_domain
def inside_domain
  return @inside_domain
end

#not_in_formatObject

Search for results not in the format



63
64
65
# File 'lib/gscraper/search/web_query.rb', line 63

# Reader for the format that results should not be in.
#
# @return [Object] the value stored in @not_in_format
def not_in_format
  return @not_in_format
end

#occurs_withinObject

Search for results where the query occurs within the area



78
79
80
# File 'lib/gscraper/search/web_query.rb', line 78

# Reader for the part of the result page the query must occur within.
#
# @return [Object] the value stored in @occurs_within
def occurs_within
  return @occurs_within
end

#outside_domainObject

Search for results outside the domain



84
85
86
# File 'lib/gscraper/search/web_query.rb', line 84

# Reader for the domain that results should not come from.
#
# @return [Object] the value stored in @outside_domain
def outside_domain
  return @outside_domain
end

#regionObject

Search for results from the region



57
58
59
# File 'lib/gscraper/search/web_query.rb', line 57

# Reader for the region that results should come from.
#
# @return [Object] the value stored in @region
def region
  return @region
end

#results_per_pageObject

Results per-page



54
55
56
# File 'lib/gscraper/search/web_query.rb', line 54

# Reader for the number of results requested per page.
#
# @return [Object] the value stored in @results_per_page
def results_per_page
  return @results_per_page
end

#rightsObject

Search for results which have the rights



87
88
89
# File 'lib/gscraper/search/web_query.rb', line 87

# Reader for the license (rights) that results should have.
#
# @return [Object] the value stored in @rights
def rights
  return @rights
end

#within_past_dayObject

Search for results within the past day



66
67
68
# File 'lib/gscraper/search/web_query.rb', line 66

# Reader for the "within the past day" time window.
#
# @return [Object] the value stored in @within_past_day
def within_past_day
  return @within_past_day
end

#within_past_monthsObject

Search for results within the past months



72
73
74
# File 'lib/gscraper/search/web_query.rb', line 72

# Reader for the "within the past months" time window.
#
# @return [Object] the value stored in @within_past_months
def within_past_months
  return @within_past_months
end

#within_past_weekObject

Search for results within the past week



69
70
71
# File 'lib/gscraper/search/web_query.rb', line 69

# Reader for the "within the past week" time window.
#
# @return [Object] the value stored in @within_past_week
def within_past_week
  return @within_past_week
end

#within_past_yearObject

Search for results within the past year



75
76
77
# File 'lib/gscraper/search/web_query.rb', line 75

# Reader for the "within the past year" time window.
#
# @return [Object] the value stored in @within_past_year
def within_past_year
  return @within_past_year
end

Class Method Details

.from_url(url, options = {}) {|query| ... } ⇒ WebQuery

Creates a new Web query from a search URL.

Examples:

WebQuery.from_url('http://www.google.com/search?q=ruby+zen')
WebQuery.from_url('http://www.google.com/search?q=ruby') do |q|
  q.within_past_months = 1
  q.occurs_within = :title
end

Parameters:

  • url (URI::HTTP, String)

    The search URL.

  • options (Hash) (defaults to: {})

    Additional options.

Yields:

  • (query)

    If a block is given, it will be passed the new Web query.

Yield Parameters:

  • query (WebQuery)

    The new web query.

Returns:



223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
# File 'lib/gscraper/search/web_query.rb', line 223

# Creates a new Web query from a search URL.
#
# The query parameters of the URL are translated into the options
# understood by WebQuery.new.
#
# @param [URI::HTTP, String] url
#   The search URL.
#
# @param [Hash] options
#   Additional options. Values parsed from the URL are written into this
#   same hash, overwriting any entries with the same keys.
#
# @yield [query]
#   If a block is given, it is forwarded to WebQuery.new.
#
# @return [WebQuery]
#   The new Web query.
def WebQuery.from_url(url,options={},&block)
  url    = URI(url.to_s)
  params = url.query_params

  options[:search_host] = url.host

  options[:results_per_page] = if params['num']
                                 params['num'].to_i
                               else
                                 RESULTS_PER_PAGE
                               end

  options[:query]         = params['q']
  options[:exact_phrase]  = params['as_epq']
  options[:with_words]    = params['as_oq']
  options[:without_words] = params['as_eq']

  options[:language] = params['lr']
  options[:region]   = params['cr']

  if params['as_filetype']
    options[:filetype] = params['as_filetype']
  end

  # 'as_qdr' encodes the time window; months carry their count as a suffix.
  case params['as_qdr']
  when 'd'
    options[:within_past_day] = true
  when 'w'
    options[:within_past_week] = true
  when 'm'
    options[:within_past_months] = 1
  when 'm2'
    options[:within_past_months] = 2
  when 'm3'
    options[:within_past_months] = 3
  when 'm6'
    options[:within_past_months] = 6
  when 'y'
    options[:within_past_year] = true
  end

  # If only one bound is present, the missing one becomes 0 (nil.to_i).
  if (params['as_nlo'] || params['as_nhi'])
    options[:numeric_range] = Range.new(
      params['as_nlo'].to_i,
      params['as_nhi'].to_i
    )
  end

  if params['as_occt']
    options[:occurs_within] = params['as_occt'].to_sym
  end

  options[:site] = params['as_sitesearch']

  options[:rights] = LICENSES[params['as_rights']]

  # Query parameter keys are Strings; a Symbol key never matches, which
  # previously left :filtered permanently false.
  options[:filtered] = (params['safe'] == 'active')

  if params['as_rq']
    options[:related] = params['as_rq']
  elsif params['as_lq']
    options[:link] = params['as_lq']
  end

  return WebQuery.new(options,&block)
end

Instance Method Details

#each_sponsored_link {|ad| ... } ⇒ Enumerator

Iterates over the sponsored ads on the first page.

Yields:

  • (ad)

    The given block will be passed each sponsored ad.

Yield Parameters:

  • ad (SponsoredAd)

    A sponsored ad on the first page.

Returns:

  • (Enumerator)

    If no block is given, an Enumerator object will be returned.



499
500
501
# File 'lib/gscraper/search/web_query.rb', line 499

# Iterates over the sponsored links of the query.
#
# @yield [ad]
#   The given block is passed to #each on the result of #sponsored_links.
#
# @return [Object]
#   Whatever #each on the sponsored links returns.
def each_sponsored_link(&block)
  ads = sponsored_links

  return ads.each(&block)
end

#page(page_index) ⇒ Page<Result>

Returns a page containing results at the specific page index.

Parameters:

  • page_index (Integer)

    The page index to query.

Returns:

  • (Page<Result>)

    The page at the given index for the query.



387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
# File 'lib/gscraper/search/web_query.rb', line 387

# Fetches and parses the page of results at the given page index.
#
# @param [Integer] page_index
#   The page index to query.
#
# @return [Page<Result>]
#   The page at the given index for the query.
#
# @raise [Blocked]
#   Raised when the response contains the link to Google's
#   "answer=86640" support page.
def page(page_index)
  Page.new do |new_page|
    doc = @agent.get(page_url(page_index))

    blocked_notice = doc.at('//div/a[@href="http://www.google.com/support/bin/answer.py?answer=86640"]')
    raise(Blocked,"Google has temporarily blocked our IP Address",caller) if blocked_notice

    nodes = doc.search('//li[@class="g"]')

    # Never read more results than were requested per page.
    count     = [@results_per_page, nodes.length].min
    base_rank = result_offset_of(page_index)

    count.times do |position|
      node   = nodes[position]
      rank   = base_rank + (position + 1)
      anchor = node.at('.//h3/a')
      title  = anchor.inner_text

      # The result URL is taken from the 'q' parameter of the anchor's href.
      target = URI(anchor.get_attribute('href')).query_params['q']
      url    = URI(target)

      summary = ''
      snippet = node.at('.//div[@class="s"]','.//td[@class="j"]//font')

      if snippet
        snippet.children.each do |child|
          # The summary ends at the first <br> element.
          break unless (child.text? || child.name != 'br')

          summary << child.inner_text
        end
      end

      cached_url  = nil
      similar_url = nil

      if (link_box = node.at('.//div[@class="s"]'))
        cached_anchor = link_box.at('.//a[1]')

        if cached_anchor
          cached_url = URI("http://#{search_host}" + cached_anchor.get_attribute('href'))
        end

        similar_anchor = link_box.at('.//a[2]')

        if similar_anchor
          similar_url = URI("http://#{search_host}" + similar_anchor.get_attribute('href'))
        end
      end

      new_page << Result.new(rank,title,url,summary,cached_url,similar_url)
    end
  end
end

#page_url(page_index) ⇒ URI::HTTP

Returns the URL that represents the query at a specific page index.

Parameters:

  • page_index (Integer)

    The page index to create the URL for.

Returns:

  • (URI::HTTP)

    The URL for a query at the given page index.



369
370
371
372
373
374
375
376
# File 'lib/gscraper/search/web_query.rb', line 369

# Builds the URL that represents the query at a specific page index.
#
# Starts from #search_url and adds the 'start' result offset and the
# 'sa' parameter.
#
# @param [Integer] page_index
#   The page index to create the URL for.
#
# @return [URI::HTTP]
#   The URL for the query at the given page index.
def page_url(page_index)
  search_url.tap do |url|
    params = url.query_params

    params['start'] = result_offset_of(page_index)
    params['sa']    = 'N'
  end
end

#result_at(index) ⇒ Object

Returns the result at the specified index.

Parameters:

  • index (Integer)

    The index of the result.



453
454
455
# File 'lib/gscraper/search/web_query.rb', line 453

# Returns the result at the specified index.
#
# @param [Integer] index
#   The index of the result.
#
# @return [Object]
#   The element found at the result's position within its page.
def result_at(index)
  containing_page = page(page_index_of(index))

  return containing_page[result_index_of(index)]
end

#search_url ⇒ URI::HTTP

The URL that represents the query.

Returns:

  • (URI::HTTP)

    The URL for the query.



294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
# File 'lib/gscraper/search/web_query.rb', line 294

# Builds the URL that represents the query.
#
# Only options that are set contribute a query parameter.
#
# @return [URI::HTTP]
#   The URL for the query.
def search_url
  url = URI::HTTP.build(:host => search_host, :path => PATH)

  # Adds a query parameter only when a value was actually given.
  set_param = lambda { |param,value|
    url.query_params[param.to_s] = value if value
  }

  set_param.call('num',@results_per_page)
  set_param.call('q',expression)
  set_param.call('as_epq',@exact_phrase)
  set_param.call('as_oq',@with_words)
  set_param.call('as_eq',@without_words)

  set_param.call('lr',@language)
  set_param.call('cr',@region)

  set_param.call('as_filetype',@filetype)

  # Time window: the first active one wins, in the order day, week,
  # months, year.
  if @within_past_day
    url.query_params['as_qdr'] = 'd'
  elsif @within_past_week
    url.query_params['as_qdr'] = 'w'
  elsif @within_past_months
    # Only 1, 2, 3 and 6 months are mapped; any other value adds nothing.
    months_code = case @within_past_months
                  when 1 then 'm'
                  when 2 then 'm2'
                  when 3 then 'm3'
                  when 6 then 'm6'
                  end

    url.query_params['as_qdr'] = months_code if months_code
  elsif @within_past_year
    url.query_params['as_qdr'] = 'y'
  end

  if @numeric_range.kind_of?(Range)
    url.query_params['as_nlo'] = @numeric_range.begin
    url.query_params['as_nhi'] = @numeric_range.end
  end

  # Both Symbol and String forms of the supported areas are accepted.
  case @occurs_within
  when :title, 'title', :body, 'body', :url, 'url', :links, 'links'
    url.query_params['as_occt'] = @occurs_within.to_s
  end

  set_param.call('as_sitesearch',@site)

  if @rights
    # LICENSES maps 'as_rights' values to licenses; Hash#invert gives the
    # license => 'as_rights' lookup needed here (Hash has no #reverse).
    url.query_params['as_rights'] = LICENSES.invert[@rights]
  end

  if @filtered
    url.query_params['safe'] = 'active'
  end

  return url
end

#sponsored_links ⇒ SponsoredLinks

Returns the sponsored links for the query.

Returns:



463
464
465
466
467
468
469
470
471
472
473
474
475
# File 'lib/gscraper/search/web_query.rb', line 463

# Collects the sponsored links for the query.
#
# Fetches #search_url and wraps every matching ad anchor in a
# SponsoredAd.
#
# @return [SponsoredLinks]
#   The sponsored links for the query.
def sponsored_links
  SponsoredLinks.new do |links|
    response = @agent.get(search_url)

    # top and side ads
    ad_anchors = response.search('//h3/a[starts-with(@id,"pa")]')

    ad_anchors.each do |anchor|
      ad_title = anchor.inner_text
      ad_url   = URI("http://#{search_host}" + anchor.get_attribute('href'))

      links << SponsoredAd.new(ad_title,ad_url)
    end
  end
end

#top_result ⇒ Result

Returns the first result on the first page.

Returns:

  • (Result)

    The first result.



443
444
445
# File 'lib/gscraper/search/web_query.rb', line 443

# Returns the first result on the first page.
#
# @return [Result]
#   The first result.
def top_result
  opening_page = first_page

  return opening_page.first
end

#top_sponsored_link ⇒ SponsoredAd

Returns the first sponsored ad on the first page of results.

Returns:



483
484
485
# File 'lib/gscraper/search/web_query.rb', line 483

# Returns the first sponsored ad on the first page of results.
#
# Reads from #sponsored_links; the previously referenced
# +top_sponsored_links+ is not among the methods visible for this class.
#
# @return [SponsoredAd, nil]
#   The first sponsored ad, or whatever #first yields when there are none.
def top_sponsored_link
  sponsored_links.first
end