Class: Scruber::QueueAdapters::AbstractAdapter::Page

Inherits:
Object
  • Object
show all
Defined in:
lib/scruber/queue_adapters/abstract_adapter.rb

Overview

Queue page wrapper

Author:

  • Ivan Goncharov

Direct Known Subclasses

Memory::Page

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(queue, options = {}) ⇒ Page

Returns a new instance of Page.



60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 60

# Builds a page wrapper bound to +queue+.
#
# Each attribute is read from +options+ (after +with_indifferent_access+)
# and falls back to the default shown below when its key is absent.
#
# @param queue [Object] queue this page belongs to
# @param options [Hash] page attributes; +:url+ is mandatory
# @raise [RuntimeError] "URL not provided" when +:url+ is missing
def initialize(queue, options={})
  @queue = queue
  @options = options = options.with_indifferent_access

  # Identity and target. The id is generated only when none was supplied,
  # and a missing URL aborts construction.
  @id = options.fetch(:id) { generate_page_id }
  @url = options.fetch(:url) { raise "URL not provided" }

  # Request description.
  @method = options.fetch(:method, :get)
  @user_agent = options.fetch(:user_agent, nil)
  @body = options.fetch(:body, nil)
  @headers = options.fetch(:headers, {})
  @fetcher_agent_id = options.fetch(:fetcher_agent_id, nil)
  @proxy_id = options.fetch(:proxy_id, nil)

  # Response data.
  @response_body = options.fetch(:response_body, nil)
  @response_code = options.fetch(:response_code, nil)
  @response_headers = options.fetch(:response_headers, {})
  @response_total_time = options.fetch(:response_total_time, nil)

  # Queue bookkeeping; timestamps and counters start at 0.
  @retry_at = options.fetch(:retry_at, 0)
  @fetched_at = options.fetch(:fetched_at, 0)
  @retry_count = options.fetch(:retry_count, 0)
  @max_retry_times = options.fetch(:max_retry_times, nil)
  @enqueued_at = options.fetch(:enqueued_at, 0)
  @page_type = options.fetch(:page_type, :seed)
  @priority = options.fetch(:priority, 0)
  @processed_at = options.fetch(:processed_at, 0)

  # +false+ means "not looked up yet" for the two lazy lookups
  # (see #fetcher_agent and #proxy); @_redownload is the redownload flag.
  @_fetcher_agent = false
  @_proxy = false
  @_redownload = false
end

Instance Attribute Details

#body ⇒ Object

Returns the value of attribute body.



37
38
39
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 37

# Reader for the +body+ attribute.
#
# @return [Object] the current value of @body
def body
  @body
end

#enqueued_at ⇒ Integer

Timestamp when the page was added to the queue

Returns:

  • (Integer)

    the current value of enqueued_at



36
37
38
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36

# Timestamp at which the page was added to the queue.
#
# @return [Integer] the current value of @enqueued_at
def enqueued_at
  @enqueued_at
end

#fetched_at ⇒ Integer

Download completion timestamp

Returns:

  • (Integer)

    the current value of fetched_at



36
37
38
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36

# Download completion timestamp.
#
# @return [Integer] the current value of @fetched_at
def fetched_at
  @fetched_at
end

#fetcher_agent_id ⇒ Object

ID of FetcherAgent, assigned to this page

Returns:

  • (Object)

    the current value of fetcher_agent_id



36
37
38
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36

# ID of the FetcherAgent assigned to this page.
#
# @return [Object] the current value of @fetcher_agent_id
def fetcher_agent_id
  @fetcher_agent_id
end

#headers ⇒ Hash

Headers for requesting this page

Returns:

  • (Hash)

    the current value of headers



36
37
38
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36

# Headers used when requesting this page.
#
# @return [Hash] the current value of @headers
def headers
  @headers
end

#id ⇒ Object

ID of page. Will be autogenerated if not passed

Returns:

  • (Object)

    the current value of id



36
37
38
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36

# ID of the page (autogenerated by the constructor when not passed).
#
# @return [Object] the current value of @id
def id
  @id
end

#max_retry_times ⇒ Integer

Max number of download attempts

Returns:

  • (Integer)

    the current value of max_retry_times



36
37
38
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36

# Maximum number of download attempts.
#
# @return [Integer, nil] the current value of @max_retry_times
#   (the constructor leaves it nil when the option is absent)
def max_retry_times
  @max_retry_times
end

#method ⇒ String

Request method, post, get, head

Returns:

  • (String)

    the current value of method



36
37
38
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36

# Request method (post, get, head).
#
# NOTE: this zero-argument reader replaces Object#method on page instances,
# so the usual `page.method(:name)` reflection call raises ArgumentError.
#
# @return [String, Symbol] the current value of @method
#   (the constructor defaults it to the Symbol :get)
def method
  @method
end

#options ⇒ Hash

All options

Returns:

  • (Hash)

    the current value of options



36
37
38
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36

# All options the page was built with.
#
# @return [Hash] the current value of @options
def options
  @options
end

#page_type ⇒ String

Page type

Returns:

  • (String)

    the current value of page_type



36
37
38
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36

# Page type.
#
# @return [String, Symbol] the current value of @page_type
#   (the constructor defaults it to the Symbol :seed)
def page_type
  @page_type
end

#priority ⇒ Integer

Priority of page in queue for fetcher

Returns:

  • (Integer)

    the current value of priority



36
37
38
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36

# Priority of the page in the queue for the fetcher.
#
# @return [Integer] the current value of @priority
def priority
  @priority
end

#processed_at ⇒ Integer

Timestamp when the page was processed by the parser

Returns:

  • (Integer)

    the current value of processed_at



36
37
38
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36

# Timestamp at which the page was processed by the parser
# (0 until #processed! is called, and again after #redownload!).
#
# @return [Integer] the current value of @processed_at
def processed_at
  @processed_at
end

#proxy_id ⇒ Object

ID of proxy, assigned to this page

Returns:

  • (Object)

    the current value of proxy_id



36
37
38
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36

# ID of the proxy assigned to this page.
#
# @return [Object] the current value of @proxy_id
def proxy_id
  @proxy_id
end

#queue ⇒ Scruber::QueueAdapters::AbstractAdapter::Page

Queue object

Returns:



36
37
38
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36

# Queue object passed to the constructor.
#
# NOTE(review): the generated docs give the type as
# Scruber::QueueAdapters::AbstractAdapter::Page; that looks like it should be
# the queue adapter itself — confirm.
#
# @return [Object] the current value of @queue
def queue
  @queue
end

#response_body ⇒ String

Response body

Returns:

  • (String)

    the current value of response_body



36
37
38
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36

# Response body (reset to nil by #redownload!).
#
# @return [String, nil] the current value of @response_body
def response_body
  @response_body
end

#response_code ⇒ Integer

Response code

Returns:

  • (Integer)

    the current value of response_code



36
37
38
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36

# Response code.
#
# @return [Integer, nil] the current value of @response_code
#   (the constructor leaves it nil when the option is absent)
def response_code
  @response_code
end

#response_headers ⇒ Hash

Response headers

Returns:

  • (Hash)

    the current value of response_headers



36
37
38
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36

# Response headers; #response_cookies reads 'Set-Cookie' from this hash.
#
# @return [Hash] the current value of @response_headers
def response_headers
  @response_headers
end

#response_total_time ⇒ Float

Response total time

Returns:

  • (Float)

    the current value of response_total_time



36
37
38
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36

# Response total time.
#
# @return [Float, nil] the current value of @response_total_time
#   (the constructor leaves it nil when the option is absent)
def response_total_time
  @response_total_time
end

#retry_at ⇒ Integer

Minimal timestamp of next retry

Returns:

  • (Integer)

    the current value of retry_at



36
37
38
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36

# Minimal timestamp of the next retry.
#
# @return [Integer] the current value of @retry_at
def retry_at
  @retry_at
end

#retry_count ⇒ Integer

Number of download attempts

Returns:

  • (Integer)

    the current value of retry_count



36
37
38
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36

# Number of download attempts (incremented or reset by #redownload!).
#
# @return [Integer] the current value of @retry_count
def retry_count
  @retry_count
end

#url ⇒ String

URL of page

Returns:

  • (String)

    the current value of url



36
37
38
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36

# URL of the page; used as the base URL by #url_join.
#
# @return [String] the current value of @url
def url
  @url
end

#user_agent ⇒ String

Fixed User-Agent for requesting this page

Returns:

  • (String)

    the current value of user_agent



36
37
38
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36

# Fixed User-Agent for requesting this page.
#
# @return [String, nil] the current value of @user_agent
#   (the constructor leaves it nil when the option is absent)
def user_agent
  @user_agent
end

Instance Method Details

#[](k) ⇒ Object



147
148
149
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 147

# Hash-style read access to the page's instance variables.
#
# The key is interpolated after an "@" prefix, so +page[:url]+ and
# +page["url"]+ both read +@url+. A variable that was never set yields nil.
#
# @param k [#to_s] attribute name without the leading "@"
# @return [Object, nil] value of the matching instance variable
def [](k)
  ivar_name = "@#{k}"
  instance_variable_get(ivar_name)
end

#delete ⇒ void

This method returns an undefined value.

Delete page from queue

Raises:

  • (NotImplementedError)


155
156
157
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 155

# Delete page from queue.
#
# This base implementation only raises; presumably subclasses such as
# Memory::Page provide the real behaviour — confirm against the adapters.
# NotImplementedError descends from ScriptError, so a bare +rescue+
# (StandardError) does not catch it.
#
# @raise [NotImplementedError] always
# @return [void]
def delete
  raise NotImplementedError
end

#fetcher_agent ⇒ Scruber::Helpers::FetcherAgent

Returns the FetcherAgent assigned to this page

Returns:



96
97
98
99
100
101
102
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 96

# Lazily resolves the FetcherAgent referenced by @fetcher_agent_id.
#
# @_fetcher_agent holds +false+ until the first lookup. After that it holds
# the lookup result (nil when no id is set) and is returned without calling
# +find+ again.
#
# @return [Object, nil] result of Scruber::Helpers::FetcherAgent.find, or
#   nil when the page has no fetcher agent id
def fetcher_agent
  return @_fetcher_agent unless @_fetcher_agent == false

  @_fetcher_agent =
    if @fetcher_agent_id
      Scruber::Helpers::FetcherAgent.find(@fetcher_agent_id)
    end
end

#processed! ⇒ void

This method returns an undefined value.

Mark page as processed by parser and save it



163
164
165
166
167
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 163

# Marks the page as processed by the parser and persists it.
#
# Clears the redownload flag, stamps @processed_at with the current Unix
# time in seconds, then delegates to #save.
#
# @return [void]
def processed!
  @_redownload = false
  @processed_at = Time.now.to_i
  save
end

#proxy ⇒ Proxy

Returns the proxy assigned to this page

Returns:

  • (Proxy)

    proxy object



108
109
110
111
112
113
114
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 108

# Lazily resolves the proxy referenced by @proxy_id.
#
# @_proxy holds +false+ until the first lookup. After that it holds the
# lookup result (nil when no id is set) and is returned without calling
# +find+ again.
#
# @return [Object, nil] result of Scruber::Helpers::ProxyRotator.find, or
#   nil when the page has no proxy id
def proxy
  return @_proxy unless @_proxy == false

  @_proxy =
    if @proxy_id
      Scruber::Helpers::ProxyRotator.find(@proxy_id)
    end
end

#redownload!(new_retry_count = nil) ⇒ void

This method returns an undefined value.

Mark page as pending and return to queue

Parameters:

  • new_retry_count (Integer) (defaults to: nil)

    new retry count; pass a value to reset the number of retries



175
176
177
178
179
180
181
182
183
184
185
186
187
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 175

# Marks the page as pending again and returns it to the queue.
#
# Sets the redownload flag, zeroes the processed/fetched timestamps, drops
# the stored response body, and persists the page via #save.
#
# @param new_retry_count [Integer, nil] explicit retry count to store; when
#   nil (the default) the current retry count is incremented by one
# @return [void]
def redownload!(new_retry_count=nil)
  @_redownload = true
  @processed_at = 0

  # An explicit count (including 0, which is truthy in Ruby) replaces the
  # counter; otherwise this counts as one more attempt.
  @retry_count = new_retry_count || @retry_count + 1

  @fetched_at = 0
  @response_body = nil
  save
end

#response_cookies ⇒ Array

Returns cookies from response headers

Returns:

  • (Array)

    array of cookies



120
121
122
123
124
125
126
127
128
129
130
131
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 120

# Returns the cookies found in the response headers.
#
# Reads the 'Set-Cookie' entry of #response_headers. A blank entry yields
# an empty array, and a single value is wrapped so callers always get an
# Array.
#
# @return [Array] array of cookies
def response_cookies
  set_cookie = response_headers['Set-Cookie']
  return [] if set_cookie.blank?

  set_cookie.is_a?(Array) ? set_cookie : [set_cookie]
end

#save ⇒ Object

Raises:

  • (NotImplementedError)


133
134
135
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 133

# Persistence hook called by #processed! and #redownload!.
#
# This base implementation only raises; presumably subclasses such as
# Memory::Page provide the real behaviour — confirm against the adapters.
# NotImplementedError descends from ScriptError, so a bare +rescue+
# (StandardError) does not catch it.
#
# @raise [NotImplementedError] always
def save
  raise NotImplementedError
end

#sent_to_redownload? ⇒ Boolean

Whether the page has been marked for redownloading

Returns:

  • (Boolean)

    true if need to redownload



193
194
195
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 193

# Whether the page has been marked for redownloading.
#
# The flag is set by #redownload! and cleared by #processed!.
#
# @return [Boolean] true if the page needs to be redownloaded
def sent_to_redownload?
  @_redownload
end

#url_join(link_url) ⇒ String

Join url of current page with another path or url

Parameters:

  • link_url (String)

    link

Returns:

  • (String)

    joined url



143
144
145
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 143

# Joins the URL of the current page with another path or URL.
#
# The page's #url is the base; +link_url+ is resolved against it with
# URI.join, so an absolute link replaces the base entirely.
#
# @param link_url [String] link to resolve
# @return [String] joined url
def url_join(link_url)
  absolute = URI.join(url, link_url)
  absolute.to_s
end