Class: Scruber::FetcherAdapters::AbstractAdapter
- Defined in:
- lib/scruber/fetcher_adapters/abstract_adapter.rb
Direct Known Subclasses
Instance Attribute Summary
- #followlocation ⇒ Object
  Whether HTTP redirects should be followed (default: false).
- #max_concurrency ⇒ Object
  Maximum number of concurrent requests (default: 1).
- #max_retry_times ⇒ Object
  Maximum number of retry attempts for a failed page (default: 5).
- #options ⇒ Object
  All passed options.
- #request_timeout ⇒ Object
  Timeout for a single request (default: 15).
- #retry_delays ⇒ Object
  Delays in seconds before retrying a failed page (default: [1, 2, 2, 4, 4]).
Instance Method Summary
- #after_request_callback(page) ⇒ Object
- #bad_response?(page) ⇒ Boolean
- #before_request_callback(page) ⇒ Object
- #cookie_for(page) ⇒ Object
- #determine_retry_at(page) ⇒ Object
- #headers_for(page) ⇒ Object
- #initialize(options = {}) ⇒ AbstractAdapter (constructor)
  A new instance of AbstractAdapter.
- #proxy_for(page) ⇒ Object
- #run(queue) ⇒ Object
- #user_agent_for(page) ⇒ Object
Constructor Details
#initialize(options = {}) ⇒ AbstractAdapter
Returns a new instance of AbstractAdapter.
    # File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 11

    def initialize(options = {})
      @options = options
      @max_concurrency = options.fetch(:max_concurrency) { 1 }
      @max_retry_times = options.fetch(:max_retry_times) { 5 }
      @retry_delays = options.fetch(:retry_delays) { [1,2,2,4,4] }
      @followlocation = options.fetch(:followlocation) { false }
      @request_timeout = options.fetch(:request_timeout) { 15 }
    end
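For illustration, a minimal sketch of configuring an adapter with these options. An AbstractAdapter is constructed directly here only to show the defaults; in practice you would instantiate a concrete subclass, since #run raises NotImplementedError.

    adapter = Scruber::FetcherAdapters::AbstractAdapter.new(
      max_concurrency: 10,        # default is 1
      max_retry_times: 3,         # default is 5
      retry_delays:    [2, 4, 8], # default is [1, 2, 2, 4, 4]
      followlocation:  true,      # default is false; also read by #bad_response? for 3xx codes
      request_timeout: 30         # default is 15
    )

    adapter.max_concurrency  # => 10
    adapter.options          # => the full options hash passed above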
Instance Attribute Details
#followlocation ⇒ Object
Whether HTTP redirects should be followed (default: false).

    # File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 4

    def followlocation
      @followlocation
    end
#max_concurrency ⇒ Object
Maximum number of concurrent requests (default: 1).

    # File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 4

    def max_concurrency
      @max_concurrency
    end
#max_retry_times ⇒ Object
Maximum number of retry attempts for a failed page (default: 5).

    # File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 4

    def max_retry_times
      @max_retry_times
    end
#options ⇒ Object
all passed options
    # File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 4

    def options
      @options
    end
#request_timeout ⇒ Object
Timeout for a single request (default: 15).

    # File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 4

    def request_timeout
      @request_timeout
    end
#retry_delays ⇒ Object
Delays in seconds before retrying a failed page (default: [1, 2, 2, 4, 4]).

    # File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 4

    def retry_delays
      @retry_delays
    end
Instance Method Details
#after_request_callback(page) ⇒ Object
    # File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 28

    def after_request_callback(page)
      if bad_response?(page)
        page.retry_at = determine_retry_at(page)
        page.retry_count += 1
        if page.max_retry_times.nil?
          page.max_retry_times = @max_retry_times
        end
        if page.max_retry_times && page.retry_count >= page.max_retry_times.to_i
          page.retry_at = 1.year.from_now.to_i
        end
      else
        # Monkey patch to prevent redownloading of 404 pages
        # and processing 404 pages by regular parsers
        if page.response_code == 404
          page.retry_count = 1 if page.retry_count.nil? || page.retry_count.zero?
          page.max_retry_times = page.retry_count
        else
          page.fetched_at = Time.now.to_i
        end
      end
      page
    end
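A minimal sketch of the retry bookkeeping, assuming a Page-like object that responds to the accessors used above (OpenStruct stands in for a real queue page here):

    require 'ostruct'

    adapter = Scruber::FetcherAdapters::AbstractAdapter.new
    page = OpenStruct.new(url: 'http://example.com', response_code: 503, retry_count: 0)

    page = adapter.after_request_callback(page)
    page.retry_count      # => 1
    page.max_retry_times  # => 5, copied from the adapter's default
    page.retry_at         # => Time.now.to_i + 1, the first entry of retry_delays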
#bad_response?(page) ⇒ Boolean
    # File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 100

    def bad_response?(page)
      case page.response_code
      when 0..1
        true
      when 200..299
        false
      when 300..399
        @options.fetch(:followlocation) { false }
      when 404
        false
      when 407
        raise "RejectedByProxy"
      else
        true
      end
    end
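For illustration, how different response codes are classified. PageStub is a hypothetical stand-in that only provides #response_code:

    PageStub = Struct.new(:response_code)
    adapter  = Scruber::FetcherAdapters::AbstractAdapter.new(followlocation: false)

    adapter.bad_response?(PageStub.new(0))    # => true, no response at all
    adapter.bad_response?(PageStub.new(200))  # => false
    adapter.bad_response?(PageStub.new(301))  # => false, the value of the :followlocation option
    adapter.bad_response?(PageStub.new(404))  # => false, handled specially in #after_request_callback
    adapter.bad_response?(PageStub.new(503))  # => true, the page will be retried
    # A 407 raises "RejectedByProxy".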
#before_request_callback(page) ⇒ Object
    # File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 24

    def before_request_callback(page)
      page
    end
#cookie_for(page) ⇒ Object
    # File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 66

    def cookie_for(page)
      if page.fetcher_agent
        cookie = page.fetcher_agent.cookie_for(page.url)
        cookie.blank? ? nil : cookie
      else
        nil
      end
    end
#determine_retry_at(page) ⇒ Object
    # File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 95

    def determine_retry_at(page)
      delay = @retry_delays[page.retry_count] || @retry_delays.last
      Time.now.to_i + delay
    end
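With the default retry_delays of [1, 2, 2, 4, 4], the next attempt is scheduled the given number of seconds in the future, and retry counts past the end of the array reuse the last delay. A small sketch, where RetryStub is a hypothetical stand-in providing #retry_count:

    RetryStub = Struct.new(:retry_count)
    adapter   = Scruber::FetcherAdapters::AbstractAdapter.new

    adapter.determine_retry_at(RetryStub.new(0)) - Time.now.to_i  # => 1
    adapter.determine_retry_at(RetryStub.new(3)) - Time.now.to_i  # => 4
    adapter.determine_retry_at(RetryStub.new(9)) - Time.now.to_i  # => 4, falls back to retry_delays.last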
#headers_for(page) ⇒ Object
    # File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 51

    def headers_for(page)
      if page.fetcher_agent
        headers = page.fetcher_agent.headers
      else
        headers = page.headers
      end
      headers = {} unless headers.is_a?(Hash)
      headers["User-Agent"] = user_agent_for(page)
      cookie = cookie_for(page)
      if cookie
        headers["Cookie"] = cookie
      end
      headers
    end
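A quick sketch of the resulting headers for a page without a fetcher agent (OpenStruct stands in for a real page; the field names mirror the accessors used above):

    require 'ostruct'

    adapter = Scruber::FetcherAdapters::AbstractAdapter.new
    page = OpenStruct.new(url: 'http://example.com',
                          user_agent: 'MyBot/1.0',
                          headers: { "Accept" => "text/html" })

    adapter.headers_for(page)
    # => { "Accept" => "text/html", "User-Agent" => "MyBot/1.0" }
    # No "Cookie" key, because #cookie_for returns nil when there is no fetcher agent.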
#proxy_for(page) ⇒ Object
    # File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 85

    def proxy_for(page)
      if page.proxy
        page.proxy
      elsif page.fetcher_agent && page.fetcher_agent.proxy
        page.fetcher_agent.proxy
      else
        Scruber::Helpers::ProxyRotator.next
      end
    end
#run(queue) ⇒ Object
    # File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 20

    def run(queue)
      raise NotImplementedError
    end
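Subclasses are expected to override #run to drain the queue, calling #before_request_callback and #after_request_callback around each request. A minimal, single-threaded sketch using Net::HTTP follows; the queue interface shown here (fetch_pending, page.save) and the response accessors on the page are assumptions for illustration, not the documented Scruber queue API:

    require 'net/http'
    require 'uri'

    class SimpleNetHttpAdapter < Scruber::FetcherAdapters::AbstractAdapter
      def run(queue)
        while (page = queue.fetch_pending)                   # hypothetical queue method
          page = before_request_callback(page)
          response = Net::HTTP.get_response(URI(page.url))   # ignores proxy_for/headers_for for brevity
          page.response_code    = response.code.to_i
          page.response_body    = response.body
          page.response_headers = response.to_hash
          page = after_request_callback(page)
          page.save                                          # hypothetical persistence call
        end
      end
    end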
#user_agent_for(page) ⇒ Object
    # File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 75

    def user_agent_for(page)
      if page.user_agent
        page.user_agent
      elsif page.fetcher_agent && page.fetcher_agent.user_agent
        page.fetcher_agent.user_agent
      else
        Scruber::Helpers::UserAgentRotator.next
      end
    end