Class: Scruber::FetcherAdapters::AbstractAdapter

Inherits: Object
Defined in:
lib/scruber/fetcher_adapters/abstract_adapter.rb

Direct Known Subclasses

TyphoeusFetcher

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ AbstractAdapter

Returns a new instance of AbstractAdapter.



13
14
15
16
17
18
19
20
# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 13

# Builds an adapter, resolving fetch settings from the given options
# with sensible defaults.
#
# @param options [Hash] adapter configuration; all keys optional:
#   :max_concurrency, :max_retry_times, :retry_delays,
#   :followlocation, :request_timeout
def initialize(options={})
  @options         = options
  @max_concurrency = options.fetch(:max_concurrency, 1)
  @max_retry_times = options.fetch(:max_retry_times, 5)
  @retry_delays    = options.fetch(:retry_delays, [1, 2, 2, 4, 4])
  @followlocation  = options.fetch(:followlocation, false)
  @request_timeout = options.fetch(:request_timeout, 15)
end

Instance Attribute Details

#followlocationObject

Whether HTTP 3xx redirects should be followed (default: false); consulted by #bad_response?



6
7
8
# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 6

# Whether HTTP redirects should be followed (resolved in #initialize, default false).
def followlocation
  @followlocation
end

#max_concurrencyObject

Maximum number of concurrent requests (default: 1)



6
7
8
# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 6

# Maximum number of concurrent requests (resolved in #initialize, default 1).
def max_concurrency
  @max_concurrency
end

#max_retry_timesObject

Default maximum number of retry attempts for a failed page (default: 5); used when the page itself has none



6
7
8
# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 6

# Default maximum retry attempts (resolved in #initialize, default 5);
# applied to a page in #after_request_callback when the page has no own limit.
def max_retry_times
  @max_retry_times
end

#optionsObject

All options passed to the constructor



6
7
8
# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 6

# The full options hash passed to #initialize.
def options
  @options
end

#request_timeoutObject

Request timeout in seconds (default: 15)



6
7
8
# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 6

# Request timeout in seconds (resolved in #initialize, default 15).
def request_timeout
  @request_timeout
end

#retry_delaysObject

Delays in seconds between successive retries, indexed by retry count (default: [1, 2, 2, 4, 4]); the last value repeats



6
7
8
# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 6

# Per-attempt retry delays in seconds (resolved in #initialize,
# default [1, 2, 2, 4, 4]); see #determine_retry_at.
def retry_delays
  @retry_delays
end

Instance Method Details

#after_request_callback(page) ⇒ Object



30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 30

# Post-processes a fetched page and returns it (mutated in place):
# schedules a retry when the response is bad, marks good pages as fetched,
# and normalizes response headers and body to UTF-8.
def after_request_callback(page)
  if bad_response?(page)
    # Schedule the next attempt and count this failure.
    page.retry_at = determine_retry_at(page)
    page.retry_count += 1
    if page.max_retry_times.nil?
      page.max_retry_times = @max_retry_times
    end
    # Retries exhausted: push retry_at ~1 year ahead so the page is
    # effectively never refetched. NOTE: 1.year requires ActiveSupport.
    if page.max_retry_times && page.retry_count >= page.max_retry_times.to_i
      page.retry_at = 1.year.from_now.to_i
    end
  else
    # Monkey patch to prevent redownloading of 404 pages
    # and processing 404 pages by regular parsers
    if page.response_code == 404
      # Force retry_count == max_retry_times so the 404 page is treated
      # as exhausted without being marked fetched.
      page.retry_count = 1 if page.retry_count.nil? || page.retry_count.zero?
      page.max_retry_times = page.retry_count
    else
      page.fetched_at = Time.now.to_i
    end
  end
  # Rewrite '.' in header keys to '_' and force all values to UTF-8.
  # NOTE(review): presumably the '.'-to-'_' rewrite is for storage-key
  # compatibility (e.g. document stores) — confirm against the queue backend.
  if page.response_headers
    page.response_headers = page.response_headers.inject({}) {|acc, (k,v)| acc[k.gsub('.', '_')] = v.is_a?(Array) ? v.map{|v1| convert_to_utf8(v1) } : convert_to_utf8(v); acc }
  end
  page.response_body = convert_to_utf8(page.response_body)
  page
end

#bad_response?(page) ⇒ Boolean

Returns:

  • (Boolean)


117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 117

# Decides whether a response is "bad" (i.e. the page should be retried).
#
# @param page [#response_code] the fetched page
# @return [Boolean] true when the response code indicates a retryable failure
# @raise [RuntimeError] "RejectedByProxy" when the proxy returned HTTP 407
def bad_response?(page)
  case page.response_code
  when 0..1
    # No HTTP response was received at all (network error / timeout).
    true
  when 200..299
    false
  when 300..399
    # A redirect is only a failure when we are not following redirects.
    # Use @followlocation (resolved once in #initialize) instead of
    # re-fetching from @options, keeping this consistent with the
    # exposed #followlocation attribute.
    @followlocation
  when 404
    # 404 is final; #after_request_callback prevents refetching it.
    false
  when 407
    raise "RejectedByProxy"
  else
    true
  end
end

#before_request_callback(page) ⇒ Object



26
27
28
# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 26

# Hook invoked before a request is performed. The default implementation
# is a no-op returning the page unchanged; subclasses may override.
def before_request_callback(page)
  page
end

#convert_to_utf8(text) ⇒ Object



57
58
59
60
61
62
63
64
65
66
# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 57

# Converts text to UTF-8 using CharlockHolmes encoding detection.
# Blank/nil input and conversion failures return the input unchanged.
def convert_to_utf8(text)
  return text if text.to_s.empty?

  detection = CharlockHolmes::EncodingDetector.detect(text)
  if detection && detection[:encoding].present?
    text = CharlockHolmes::Converter.convert(text, detection[:encoding], 'UTF-8') rescue text
  end
  text
end


83
84
85
86
87
88
89
90
# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 83

# Returns the cookie string for the page's URL from its fetcher agent,
# or nil when there is no agent or the cookie is blank.
def cookie_for(page)
  agent = page.fetcher_agent
  return nil unless agent

  cookie = agent.cookie_for(page.url)
  cookie.blank? ? nil : cookie
end

#determine_retry_at(page) ⇒ Object



112
113
114
115
# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 112

# Computes the next retry timestamp (epoch seconds) for the page,
# using the delay for its current retry count; past the end of the
# delay table, the last delay repeats.
def determine_retry_at(page)
  Time.now.to_i + (@retry_delays[page.retry_count] || @retry_delays.last)
end

#headers_for(page) ⇒ Object



68
69
70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 68

# Assembles request headers for the page: the agent's headers (or the
# page's own), plus a User-Agent and, when available, a Cookie header.
def headers_for(page)
  agent = page.fetcher_agent
  headers = agent ? agent.headers : page.headers
  headers = {} unless headers.is_a?(Hash)

  headers["User-Agent"] = user_agent_for(page)
  cookie = cookie_for(page)
  headers["Cookie"] = cookie if cookie
  headers
end

#proxy_for(page) ⇒ Object



102
103
104
105
106
107
108
109
110
# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 102

# Picks the proxy for a request: the page's own proxy wins, then the
# fetcher agent's, then the next proxy from the global rotator.
def proxy_for(page)
  return page.proxy if page.proxy

  agent = page.fetcher_agent
  (agent && agent.proxy) || Scruber::Helpers::ProxyRotator.next
end

#run(queue) ⇒ Object

Raises:

  • (NotImplementedError)


22
23
24
# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 22

# Runs the fetch loop over the given queue of pages.
# Abstract: concrete adapters (e.g. TyphoeusFetcher) must implement this.
#
# @raise [NotImplementedError] always, in this abstract base class
def run(queue)
  raise NotImplementedError
end

#user_agent_for(page) ⇒ Object



92
93
94
95
96
97
98
99
100
# File 'lib/scruber/fetcher_adapters/abstract_adapter.rb', line 92

# Picks the User-Agent for a request: the page's own value wins, then
# the fetcher agent's, then the next value from the global rotator.
def user_agent_for(page)
  return page.user_agent if page.user_agent

  agent = page.fetcher_agent
  (agent && agent.user_agent) || Scruber::Helpers::UserAgentRotator.next
end