Class: Ubi::Aranea
- Inherits:
-
Object
- Object
- Ubi::Aranea
- Defined in:
- lib/ubi/aranea.rb
Overview
Base for araneas (spiders)
Constant Summary collapse
- OPTIONS =
{
  user_agent: "Ubi v#{Ubi::VERSION}",
  depth_limit: 3,
  logger: Logger.new(STDOUT),
  # redis_options: {
  #   host: 'localhost',
  #   db: 5,
  #   driver: 'hiredis'
  # },
}
Instance Attribute Summary collapse
-
#datum ⇒ Object
redis_options: { host: ‘localhost’, db: 5, driver: ‘hiredis’ },.
-
#html ⇒ Object
redis_options: { host: ‘localhost’, db: 5, driver: ‘hiredis’ },.
-
#opts ⇒ Object
redis_options: { host: ‘localhost’, db: 5, driver: ‘hiredis’ },.
-
#text ⇒ Object
redis_options: { host: ‘localhost’, db: 5, driver: ‘hiredis’ },.
-
#thema ⇒ Object
redis_options: { host: ‘localhost’, db: 5, driver: ‘hiredis’ },.
-
#url ⇒ Object
redis_options: { host: ‘localhost’, db: 5, driver: ‘hiredis’ },.
Instance Method Summary collapse
- #crawl! ⇒ Object
-
#initialize(thema, url, opts = {}) ⇒ Aranea
constructor
A new instance of Aranea.
- #parse(chunk) ⇒ Object
- #to_s ⇒ Object
- #work ⇒ Object
Constructor Details
Instance Attribute Details
#datum ⇒ Object
redis_options: {
  host: 'localhost',
  db: 5,
  driver: 'hiredis'
},
16 17 18 |
# File 'lib/ubi/aranea.rb', line 16
def datum
  @datum
end
#html ⇒ Object
redis_options: {
  host: 'localhost',
  db: 5,
  driver: 'hiredis'
},
16 17 18 |
# File 'lib/ubi/aranea.rb', line 16
def html
  @html
end
#opts ⇒ Object
redis_options: {
  host: 'localhost',
  db: 5,
  driver: 'hiredis'
},
16 17 18 |
# File 'lib/ubi/aranea.rb', line 16
def opts
  @opts
end
#text ⇒ Object
redis_options: {
  host: 'localhost',
  db: 5,
  driver: 'hiredis'
},
16 17 18 |
# File 'lib/ubi/aranea.rb', line 16
def text
  @text
end
#thema ⇒ Object
redis_options: {
  host: 'localhost',
  db: 5,
  driver: 'hiredis'
},
16 17 18 |
# File 'lib/ubi/aranea.rb', line 16
def thema
  @thema
end
#url ⇒ Object
redis_options: {
  host: 'localhost',
  db: 5,
  driver: 'hiredis'
},
16 17 18 |
# File 'lib/ubi/aranea.rb', line 16
def url
  @url
end
Instance Method Details
#crawl! ⇒ Object
29 30 31 32 33 34 35 36 37 |
# File 'lib/ubi/aranea.rb', line 29
def crawl!
  @last_run = Time.now
  puts "Crawler start #{name} #{url}"
  Retriever::PageIterator.new(url, opts) do |page|
    parse page.source
    p [page.title, page.h1, page.h2]
  end
end
#parse(chunk) ⇒ Object
39 40 41 42 43 |
# File 'lib/ubi/aranea.rb', line 39
def parse(chunk)
  @datum << chunk
  @html << Nokogiri::HTML(chunk)
  @text << html.last.text
end
#to_s ⇒ Object
50 51 52 |
# File 'lib/ubi/aranea.rb', line 50
def to_s
  "#{thema} html: #{html.size} txt: #{text.size}"
end
#work ⇒ Object
45 46 47 48 |
# File 'lib/ubi/aranea.rb', line 45
def work
  crawl! unless @last_run
  true
end