Class: Scruber::FetcherAdapters::AbstractAdapter
- Defined in:
- lib/scruber/fetcher_adapters/abstract_adapter.rb
Direct Known Subclasses
Instance Attribute Summary
- #followlocation ⇒ Object : all passed options.
- #max_concurrency ⇒ Object : all passed options.
- #max_retry_times ⇒ Object : all passed options.
- #options ⇒ Object : all passed options.
- #request_timeout ⇒ Object : all passed options.
- #retry_delays ⇒ Object : all passed options.
Instance Method Summary
- #after_request_callback(page) ⇒ Object
- #bad_response?(page) ⇒ Boolean
- #before_request_callback(page) ⇒ Object
- #convert_to_utf8(text) ⇒ Object
- #cookie_for(page) ⇒ Object
- #determine_retry_at(page) ⇒ Object
- #headers_for(page) ⇒ Object
- #initialize(options = {}) ⇒ AbstractAdapter (constructor): A new instance of AbstractAdapter.
- #proxy_for(page) ⇒ Object
- #run(queue) ⇒ Object
- #user_agent_for(page) ⇒ Object
Constructor Details
#initialize(options = {}) ⇒ AbstractAdapter
Returns a new instance of AbstractAdapter.
# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 13

def initialize(options={})
  @options = options
  @max_concurrency = options.fetch(:max_concurrency) { 1 }
  @max_retry_times = options.fetch(:max_retry_times) { 5 }
  @retry_delays = options.fetch(:retry_delays) { [1,2,2,4,4] }
  @followlocation = options.fetch(:followlocation) { false }
  @request_timeout = options.fetch(:request_timeout) { 15 }
end
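A minimal usage sketch (not from the Scruber docs): instantiating the adapter directly just to inspect the defaults; real crawls use a concrete subclass.

require 'scruber'

adapter = Scruber::FetcherAdapters::AbstractAdapter.new(max_concurrency: 10, request_timeout: 30)
adapter.max_concurrency # => 10
adapter.request_timeout # => 30
adapter.max_retry_times # => 5 (default)
adapter.retry_delays    # => [1, 2, 2, 4, 4] (default)
adapter.followlocation  # => false (default)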
Instance Attribute Details
#followlocation ⇒ Object
all passed options
# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 6

def followlocation
  @followlocation
end
#max_concurrency ⇒ Object
all passed options
# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 6

def max_concurrency
  @max_concurrency
end
#max_retry_times ⇒ Object
all passed options
# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 6

def max_retry_times
  @max_retry_times
end
#options ⇒ Object
all passed options
# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 6

def options
  @options
end
#request_timeout ⇒ Object
all passed options
# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 6

def request_timeout
  @request_timeout
end
#retry_delays ⇒ Object
all passed options
# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 6

def retry_delays
  @retry_delays
end
Instance Method Details
#after_request_callback(page) ⇒ Object
# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 30

def after_request_callback(page)
  if bad_response?(page)
    page.retry_at = determine_retry_at(page)
    page.retry_count += 1
    if page.max_retry_times.nil?
      page.max_retry_times = @max_retry_times
    end
    if page.max_retry_times && page.retry_count >= page.max_retry_times.to_i
      page.retry_at = 1.year.from_now.to_i
    end
  else
    # Monkey patch to prevent redownloading of 404 pages
    # and processing 404 pages by regular parsers
    if page.response_code == 404
      page.retry_count = 1 if page.retry_count.nil? || page.retry_count.zero?
      page.max_retry_times = page.retry_count
    else
      page.fetched_at = Time.now.to_i
    end
  end
  if page.response_headers
    page.response_headers = page.response_headers.inject({}) {|acc, (k,v)| acc[k.gsub('.', '_')] = v.is_a?(Array) ? v.map{|v1| convert_to_utf8(v1) } : convert_to_utf8(v); acc }
  end
  page.response_body = convert_to_utf8(page.response_body)
  page
end
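A rough sketch of the retry bookkeeping above, using OpenStruct as a stand-in for Scruber's page object (an assumption made purely for illustration):

require 'scruber'
require 'ostruct'

adapter = Scruber::FetcherAdapters::AbstractAdapter.new
page = OpenStruct.new(response_code: 503, retry_count: 0, response_body: nil)

page = adapter.after_request_callback(page)
page.retry_count     # => 1
page.max_retry_times # => 5 (copied from the adapter's default)
page.retry_at        # => Time.now.to_i + 1 (first entry of retry_delays)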
#bad_response?(page) ⇒ Boolean
# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 117

def bad_response?(page)
  case page.response_code
  when 0..1
    true
  when 200..299
    false
  when 300..399
    @options.fetch(:followlocation) { false }
  when 404
    false
  when 407
    raise "RejectedByProxy"
  else
    true
  end
end
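The classification can be exercised directly; OpenStruct again stands in for a page (illustration only):

require 'scruber'
require 'ostruct'

adapter = Scruber::FetcherAdapters::AbstractAdapter.new
adapter.bad_response?(OpenStruct.new(response_code: 200)) # => false
adapter.bad_response?(OpenStruct.new(response_code: 404)) # => false (handled specially in #after_request_callback)
adapter.bad_response?(OpenStruct.new(response_code: 500)) # => true
begin
  adapter.bad_response?(OpenStruct.new(response_code: 407))
rescue RuntimeError => e
  e.message # => "RejectedByProxy"
end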
#before_request_callback(page) ⇒ Object
# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 26

def before_request_callback(page)
  page
end
#convert_to_utf8(text) ⇒ Object
# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 57

def convert_to_utf8(text)
  unless text.to_s.empty?
    detection = CharlockHolmes::EncodingDetector.detect(text)
    if detection && detection[:encoding].present?
      text = CharlockHolmes::Converter.convert(text, detection[:encoding], 'UTF-8') rescue text
    end
  end
  text
end
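For reference, a small sketch of what the charlock_holmes gem does here (assuming the gem is installed; detection is heuristic and may misidentify very short strings):

require 'charlock_holmes'

raw = "\xCF\xF0\xE8\xE2\xE5\xF2 \xEC\xE8\xF0".b  # "Привет мир" encoded as Windows-1251
detection = CharlockHolmes::EncodingDetector.detect(raw)
if detection
  utf8 = CharlockHolmes::Converter.convert(raw, detection[:encoding], 'UTF-8')
end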
#cookie_for(page) ⇒ Object
# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 83

def cookie_for(page)
  if page.fetcher_agent
    cookie = page.fetcher_agent.cookie_for(page.url)
    cookie.blank? ? nil : cookie
  else
    nil
  end
end
#determine_retry_at(page) ⇒ Object
# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 112

def determine_retry_at(page)
  delay = @retry_delays[page.retry_count] || @retry_delays.last
  Time.now.to_i + delay
end
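With the default retry_delays of [1, 2, 2, 4, 4], the delay grows with retry_count and then stays at the last value. A quick sketch (OpenStruct as a hypothetical page stand-in):

require 'scruber'
require 'ostruct'

adapter = Scruber::FetcherAdapters::AbstractAdapter.new
[0, 1, 4, 10].map do |n|
  adapter.determine_retry_at(OpenStruct.new(retry_count: n)) - Time.now.to_i
end
# => [1, 2, 4, 4] (approximately; Time.now is sampled twice)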
#headers_for(page) ⇒ Object
# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 68

def headers_for(page)
  if page.fetcher_agent
    headers = page.fetcher_agent.headers
  else
    headers = page.headers
  end
  headers = {} unless headers.is_a?(Hash)
  headers["User-Agent"] = user_agent_for(page)
  cookie = cookie_for(page)
  if cookie
    headers["Cookie"] = cookie
  end
  headers
end
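A sketch of the header assembly (OpenStruct as a hypothetical page): the User-Agent header is always set, and a Cookie header is added only when #cookie_for returns one:

require 'scruber'
require 'ostruct'

adapter = Scruber::FetcherAdapters::AbstractAdapter.new
page = OpenStruct.new(headers: { 'Accept' => 'text/html' }, user_agent: 'MyBot/1.0')
adapter.headers_for(page)
# => {"Accept"=>"text/html", "User-Agent"=>"MyBot/1.0"}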
#proxy_for(page) ⇒ Object
# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 102

def proxy_for(page)
  if page.proxy
    page.proxy
  elsif page.fetcher_agent && page.fetcher_agent.proxy
    page.fetcher_agent.proxy
  else
    Scruber::Helpers::ProxyRotator.next
  end
end
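Resolution order sketch (OpenStruct as a hypothetical page): an explicit page proxy wins over the fetcher agent's proxy; only when both are absent is Scruber::Helpers::ProxyRotator asked:

require 'scruber'
require 'ostruct'

adapter = Scruber::FetcherAdapters::AbstractAdapter.new
adapter.proxy_for(OpenStruct.new(proxy: 'http://127.0.0.1:8080'))
# => "http://127.0.0.1:8080"
agent = OpenStruct.new(proxy: 'http://10.0.0.2:3128')
adapter.proxy_for(OpenStruct.new(fetcher_agent: agent))
# => "http://10.0.0.2:3128"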
#run(queue) ⇒ Object
# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 22

def run(queue)
  raise NotImplementedError
end
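Concrete adapters override #run to drain the queue. A skeletal sketch; the queue accessor (fetch_pending) and the way the response fields are filled in are assumptions for illustration, not Scruber's documented API:

class SequentialAdapter < Scruber::FetcherAdapters::AbstractAdapter
  def run(queue)
    while (page = queue.fetch_pending) # hypothetical queue accessor
      page = before_request_callback(page)
      # ... perform the HTTP request here and fill in page.response_code,
      #     page.response_headers and page.response_body ...
      after_request_callback(page)
    end
  end
end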
#user_agent_for(page) ⇒ Object
# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 92

def user_agent_for(page)
  if page.user_agent
    page.user_agent
  elsif page.fetcher_agent && page.fetcher_agent.user_agent
    page.fetcher_agent.user_agent
  else
    Scruber::Helpers::UserAgentRotator.next
  end
end
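#user_agent_for resolves in the same order as #proxy_for: the page's own user agent, then the fetcher agent's, then Scruber::Helpers::UserAgentRotator. A one-line sketch (OpenStruct as a hypothetical page):

require 'scruber'
require 'ostruct'

adapter = Scruber::FetcherAdapters::AbstractAdapter.new
adapter.user_agent_for(OpenStruct.new(user_agent: 'MyBot/1.0')) # => "MyBot/1.0"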