Class: Scruber::FetcherAdapters::AbstractAdapter

Inherits:
Object
Defined in:
lib/scruber/fetcher_adapters/abstract_adapter.rb

Direct Known Subclasses

TyphoeusFetcher

Instance Attribute Summary

Instance Method Summary

Constructor Details

#initialize(options = {}) ⇒ AbstractAdapter

Returns a new instance of AbstractAdapter.

# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 11

def initialize(options={})
  @options = options
  @max_concurrency = options.fetch(:max_concurrency) { 1 }
  @max_retry_times = options.fetch(:max_retry_times) { 5 }
  @retry_delays = options.fetch(:retry_delays) { [1,2,2,4,4] }
  @followlocation = options.fetch(:followlocation) { false }
  @request_timeout = options.fetch(:request_timeout) { 15 }
end
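
For illustration, options can be passed when constructing a concrete adapter such as TyphoeusFetcher; the values below are examples, not defaults:

require 'scruber'

adapter = Scruber::FetcherAdapters::TyphoeusFetcher.new(
  max_concurrency: 10,      # run up to 10 requests in parallel
  max_retry_times: 3,       # give up on a page after 3 failed attempts
  retry_delays: [1, 5, 10], # seconds to wait before the 1st, 2nd, 3rd retry
  followlocation: true,     # expect the HTTP client to follow redirects
  request_timeout: 30       # per-request timeout in seconds
)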

Instance Attribute Details

#followlocation ⇒ Object

Whether the HTTP client should follow redirects (default: false).

# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 4

def followlocation
  @followlocation
end

#max_concurrency ⇒ Object

Maximum number of requests to run concurrently (default: 1).

# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 4

def max_concurrency
  @max_concurrency
end

#max_retry_times ⇒ Object

Default maximum number of retries per page (default: 5).

# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 4

def max_retry_times
  @max_retry_times
end

#options ⇒ Object

All options passed to the constructor.

# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 4

def options
  @options
end

#request_timeout ⇒ Object

Request timeout in seconds (default: 15).

# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 4

def request_timeout
  @request_timeout
end

#retry_delays ⇒ Object

Delays, in seconds, applied between successive retries (default: [1, 2, 2, 4, 4]).

# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 4

def retry_delays
  @retry_delays
end

Instance Method Details

#after_request_callback(page) ⇒ Object

# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 28

def after_request_callback(page)
  if bad_response?(page)
    page.retry_at = determine_retry_at(page)
    page.retry_count += 1
    if page.max_retry_times.nil?
      page.max_retry_times = @max_retry_times
    end
    if page.max_retry_times && page.retry_count >= page.max_retry_times.to_i
      # Retry budget exhausted: push retry_at a year out so the
      # page is effectively never fetched again
      page.retry_at = 1.year.from_now.to_i
    end
  else
    # Monkey patch to prevent redownloading of 404 pages
    # and processing 404 pages by regular parsers
    if page.response_code == 404
      page.retry_count = 1 if page.retry_count.nil? || page.retry_count.zero?
      page.max_retry_times = page.retry_count
    else
      page.fetched_at = Time.now.to_i
    end
  end
  page
end
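
A minimal sketch of the retry bookkeeping, using a Struct as a stand-in for Scruber's page object (the real page class is not shown in this file):

require 'scruber'

Page = Struct.new(:response_code, :retry_at, :retry_count,
                  :max_retry_times, :fetched_at)

adapter = Scruber::FetcherAdapters::AbstractAdapter.new
page = adapter.after_request_callback(Page.new(500, nil, 0, nil, nil))
page.retry_count     # => 1
page.max_retry_times # => 5 (adapter default, copied onto the page)
page.retry_at        # => Time.now.to_i + 1 (first entry of retry_delays)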

#bad_response?(page) ⇒ Boolean

Returns:

  • (Boolean)

# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 100

def bad_response?(page)
  case page.response_code
  when 0..1
    true  # no usable HTTP response (e.g. connection failure or timeout)
  when 200..299
    false # success
  when 300..399
    # Only a failure when the client was expected to follow redirects
    @options.fetch(:followlocation) { false }
  when 404
    false # valid terminal response (see after_request_callback)
  when 407
    raise "RejectedByProxy"
  else
    true  # all other 4xx/5xx and unexpected codes
  end
end
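
Note that the followlocation option inverts the meaning of a 3xx response: when the client is expected to follow redirects, receiving a raw 3xx is treated as a failure. A quick illustration, again with a Struct standing in for the page object:

require 'scruber'

page = Struct.new(:response_code).new(301)
Scruber::FetcherAdapters::AbstractAdapter.new.bad_response?(page)
# => false (redirect is a valid terminal response)
Scruber::FetcherAdapters::AbstractAdapter.new(followlocation: true).bad_response?(page)
# => true (the client should have followed it, so retry)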

#before_request_callback(page) ⇒ Object

# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 24

def before_request_callback(page)
  page
end
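
This is a no-op hook; subclasses can override it to inspect or mutate the page just before it is fetched. A hypothetical override (VerboseFetcher is not part of Scruber):

require 'scruber'

class VerboseFetcher < Scruber::FetcherAdapters::AbstractAdapter
  def before_request_callback(page)
    puts "Fetching #{page.url}"
    super
  end
end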


#cookie_for(page) ⇒ Object

# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 66

def cookie_for(page)
  if page.fetcher_agent
    cookie = page.fetcher_agent.cookie_for(page.url)
    cookie.blank? ? nil : cookie
  else
    nil
  end
end
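
Without a fetcher agent there is no cookie jar to consult, so the method returns nil (OpenStruct used here as a page stand-in):

require 'scruber'
require 'ostruct'

adapter = Scruber::FetcherAdapters::AbstractAdapter.new
adapter.cookie_for(OpenStruct.new(fetcher_agent: nil)) # => nil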

#determine_retry_at(page) ⇒ Object

# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 95

def determine_retry_at(page)
  # Pick the delay for this attempt; reuse the last configured
  # delay once retry_count runs past the end of the list
  delay = @retry_delays[page.retry_count] || @retry_delays.last
  Time.now.to_i + delay
end
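
For example, with the default delays [1, 2, 2, 4, 4], a page on its eighth retry reuses the last delay:

require 'scruber'

adapter = Scruber::FetcherAdapters::AbstractAdapter.new
page = Struct.new(:retry_count).new(7)
adapter.determine_retry_at(page) - Time.now.to_i # => 4 (last delay reused)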

#headers_for(page) ⇒ Object

# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 51

def headers_for(page)
  # Agent-level headers take precedence over page-level headers
  headers = if page.fetcher_agent
    page.fetcher_agent.headers
  else
    page.headers
  end
  headers = {} unless headers.is_a?(Hash)
  headers["User-Agent"] = user_agent_for(page)
  cookie = cookie_for(page)
  headers["Cookie"] = cookie if cookie
  headers
end
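
A sketch of the merge with no fetcher agent (OpenStruct as a page stand-in; the header values are examples):

require 'scruber'
require 'ostruct'

adapter = Scruber::FetcherAdapters::AbstractAdapter.new
page = OpenStruct.new(fetcher_agent: nil,
                      headers: { "Accept" => "text/html" },
                      user_agent: "MyBot/1.0")
adapter.headers_for(page)
# => { "Accept" => "text/html", "User-Agent" => "MyBot/1.0" }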

#proxy_for(page) ⇒ Object

# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 85

def proxy_for(page)
  if page.proxy
    page.proxy
  elsif page.fetcher_agent && page.fetcher_agent.proxy
    page.fetcher_agent.proxy
  else
    Scruber::Helpers::ProxyRotator.next
  end
end
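
Resolution order is the page's own proxy, then the agent's proxy, then Scruber::Helpers::ProxyRotator. A page-level proxy always wins (a plain string stands in for Scruber's proxy object here):

require 'scruber'
require 'ostruct'

adapter = Scruber::FetcherAdapters::AbstractAdapter.new
page = OpenStruct.new(proxy: "http://127.0.0.1:8080", fetcher_agent: nil)
adapter.proxy_for(page) # => "http://127.0.0.1:8080"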

#run(queue) ⇒ Object

Raises:

  • (NotImplementedError)

# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 20

def run(queue)
  raise NotImplementedError
end
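
run is the single method a concrete adapter must implement; the abstract class only provides the callbacks and helpers around it. A do-nothing subclass is enough to satisfy the contract (NoopFetcher is illustrative, and a real adapter would drain the queue and fetch each page):

require 'scruber'

class NoopFetcher < Scruber::FetcherAdapters::AbstractAdapter
  def run(queue)
    queue # accept the queue and do nothing
  end
end

NoopFetcher.new.run([]) # no longer raises NotImplementedError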

#user_agent_for(page) ⇒ Object

# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 75

def user_agent_for(page)
  if page.user_agent
    page.user_agent
  elsif page.fetcher_agent && page.fetcher_agent.user_agent
    page.fetcher_agent.user_agent
  else
    Scruber::Helpers::UserAgentRotator.next
  end
end
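
Like proxy_for, this resolves in order: the page's own user agent, then the agent's, then Scruber::Helpers::UserAgentRotator. For example:

require 'scruber'
require 'ostruct'

adapter = Scruber::FetcherAdapters::AbstractAdapter.new
page = OpenStruct.new(user_agent: "MyBot/1.0", fetcher_agent: nil)
adapter.user_agent_for(page) # => "MyBot/1.0" (page-level value wins)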