Class: Tjcrawler::Crawler

Inherits:
Object
  • Object
show all
Defined in:
lib/tjcrawler/crawler.rb,
lib/tjcrawler/crawler/result.rb

Defined Under Namespace

Classes: Result

Instance Method Summary collapse

Constructor Details

#initialize(css_selector_for_link_tags) ⇒ Crawler

Returns a new instance of Crawler.



8
9
10
# File 'lib/tjcrawler/crawler.rb', line 8

# Creates a crawler configured with the selector used to find link tags.
#
# @param css_selector_for_link_tags [String] CSS selector stored in +@css+;
#   #crawl passes it to +doc.css+ to pick the elements whose +href+ is followed
def initialize(css_selector_for_link_tags)
  @css = css_selector_for_link_tags
end

Instance Method Details

#crawl(url) ⇒ Object



12
13
14
15
16
17
18
# File 'lib/tjcrawler/crawler.rb', line 12

# Fetches +url+, parses it as HTML and collects the targets of every element
# matched by the configured CSS selector.
#
# @param url [String, URI] address of the page to fetch
# @return [Result] built with the normalized URL, the raw body and the
#   absolute URLs of the matched links
def crawl(url)
  uri = URI(url).tap(&:normalize!)
  # Kernel#open no longer accepts URIs (removed in Ruby 3.0); call the
  # open-uri method on the URI itself. The block form closes the handle.
  content = uri.open(&:read)
  doc = Nokogiri::HTML(content)
  # Matched elements without an href yield nil, which URI#merge rejects with
  # an ArgumentError, so drop them before resolving against the page URL.
  hrefs = doc.css(@css).map { |link| link[:href] }.compact
  Result.new url: uri.to_s, content: content, links: hrefs.map { |href| uri.merge(href).to_s }
end

#start ⇒ Object



20
21
22
23
24
25
26
27
28
29
# File 'lib/tjcrawler/crawler.rb', line 20

# Runs the crawl loop. Each iteration takes a page from +Page.dequeue+; when
# nothing is available it sleeps one second and tries again. A dequeued page
# is crawled, its content is saved, +crawled_at+ is touched and every link
# found is handed to +Page.enqueue+. The loop has no exit condition.
def start
  loop do
    page = Page.dequeue
    unless page
      # Queue is empty: wait before polling again.
      sleep 1
      next
    end

    print :'.'
    crawled = crawl(page.url)
    page.update(content: crawled.content)
    page.touch(:crawled_at)
    crawled.links.each do |link_url|
      Page.enqueue(link_url)
    end
  end
end