Class: Ubi::Aranea

Inherits:
Object
Defined in:
lib/ubi/aranea.rb

Overview

Base for araneas (spiders)

Constant Summary collapse

# Default settings for a spider. #initialize layers caller-supplied options
# on top of these with Hash#merge, which returns a new hash, so the constant
# itself can be frozen to guard against accidental in-place mutation.
# Note: the freeze is shallow; the Logger instance is still shared and mutable.
OPTIONS =
{
  user_agent: "Ubi v#{Ubi::VERSION}",
  depth_limit: 3,
  logger: Logger.new(STDOUT),
  # redis_options: {
  #   host: 'localhost',
  #   db: 5,
  #   driver: 'hiredis'
  # },
}.freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(thema, url, opts = {}) ⇒ Aranea

Returns a new instance of Aranea.



18
19
20
21
22
23
24
25
# File 'lib/ubi/aranea.rb', line 18

# Sets up a spider for one subject/URL pair with empty result buffers.
#
# @param thema [Object] subject this spider works for; stored as given
# @param url [Object] address stored as given and later passed to the
#   page iterator by #crawl!
# @param opts [Hash] overrides merged on top of OPTIONS
def initialize(thema, url, opts = {})
  @thema = thema
  @url   = url
  @opts  = OPTIONS.merge(opts)
  @datum = []
  @html  = []
  # #parse appends to this buffer in place, so it must not be a frozen
  # literal; unary plus returns an unfrozen string even when the file
  # enables `# frozen_string_literal: true`.
  @text  = +''
end

Instance Attribute Details

#datum ⇒ Object

Raw chunks passed to #parse, in the order they were received. Starts as an empty array.



16
17
18
# File 'lib/ubi/aranea.rb', line 16

# Reader for the collected raw chunks.
#
# @return [Array] the array created empty in #initialize; #parse appends
#   each chunk it is given
def datum
  @datum
end

#html ⇒ Object

Parsed documents, one per chunk passed to #parse (the result of Nokogiri::HTML). Starts as an empty array.



16
17
18
# File 'lib/ubi/aranea.rb', line 16

# Reader for the parsed documents.
#
# @return [Array] the array created empty in #initialize; #parse appends
#   the result of Nokogiri::HTML for each chunk
def html
  @html
end

#opts ⇒ Object

Effective options: OPTIONS merged with the options given to the constructor.



16
17
18
# File 'lib/ubi/aranea.rb', line 16

# Reader for the effective options.
#
# @return [Hash] OPTIONS merged with the overrides given to #initialize
def opts
  @opts
end

#text ⇒ Object

Text accumulated from every parsed document, appended by #parse. Starts as an empty string.



16
17
18
# File 'lib/ubi/aranea.rb', line 16

# Reader for the accumulated text.
#
# @return [String] the buffer created empty in #initialize; #parse appends
#   the text of each parsed document to it
def text
  @text
end

#thema ⇒ Object

The thema (subject) passed as the first constructor argument.



16
17
18
# File 'lib/ubi/aranea.rb', line 16

# Reader for the subject this spider was built for.
#
# @return [Object] the first argument given to #initialize, unchanged
def thema
  @thema
end

#url ⇒ Object

The URL passed as the second constructor argument; #crawl! starts from it.



16
17
18
# File 'lib/ubi/aranea.rb', line 16

# Reader for the crawl address.
#
# @return [Object] the second argument given to #initialize, unchanged;
#   #crawl! passes it to Retriever::PageIterator
def url
  @url
end

Instance Method Details

#crawl! ⇒ Object



29
30
31
32
33
34
35
36
37
# File 'lib/ubi/aranea.rb', line 29

# Crawls starting at +url+, passing each visited page's source to #parse
# and printing the page's title, h1 and h2 for inspection.
#
# @last_run is stamped before the crawl starts, so #work will not start
# another crawl afterwards, even if this one raises part-way through.
#
# @return [Object] whatever Retriever::PageIterator.new returns
def crawl!
  @last_run = Time.now

  # No `name` method is visible on this base class, so calling it
  # unconditionally would raise NameError here. Use it when a subclass or
  # mixin provides one (public or private), otherwise fall back to the
  # class name so the start-up line can always be printed.
  label = respond_to?(:name, true) ? name : self.class.name
  puts "Crawler start #{label} #{url}"
  # NOTE(review): presumably the iterator follows links up to
  # opts[:depth_limit] and yields each page — confirm against the
  # retriever gem's documentation.
  Retriever::PageIterator.new(url, opts) do |page|
    parse page.source
    p [page.title, page.h1, page.h2]
  end
end

#parse(chunk) ⇒ Object



39
40
41
42
43
# File 'lib/ubi/aranea.rb', line 39

# Records one chunk of markup in all three buffers: the raw chunk goes to
# @datum, its Nokogiri document to @html, and the document's text is
# appended to @text.
#
# @param chunk [String] markup to parse
# @return [String] the accumulated text buffer
def parse(chunk)
  @datum.push(chunk)
  document = Nokogiri::HTML(chunk)
  @html.push(document)
  @text.concat(document.text)
end

#to_s ⇒ Object



50
51
52
# File 'lib/ubi/aranea.rb', line 50

# One-line summary: the thema followed by the number of parsed documents
# and the length of the accumulated text.
#
# @return [String]
def to_s
  subject = thema
  page_count = html.size
  char_count = text.size
  format('%s html: %s txt: %s', subject, page_count, char_count)
end

#work ⇒ Object



45
46
47
48
# File 'lib/ubi/aranea.rb', line 45

# Runs the crawl unless @last_run is already set, then reports success.
#
# @return [true] always
def work
  @last_run || crawl!
  true
end