Class: Apollo::Crawler::SpiderCrawler

Inherits:
BaseCrawler show all
Defined in:
lib/apollo_crawler/crawler/spider_crawler.rb

Instance Method Summary collapse

Methods inherited from BaseCrawler

create_metadoc, #enqueue_url, #etl, fetch, #fetch_document, #initialize, name_re, #process_url, try_get_doc, try_get_url, #url_processed?

Constructor Details

This class inherits a constructor from Apollo::Crawler::BaseCrawler

Instance Method Details

#extract_data(doc) ⇒ Object



34
35
36
# File 'lib/apollo_crawler/crawler/spider_crawler.rb', line 34

# Data-extraction hook for this crawler.
#
# This crawler extracts no structured data: the result is always an empty
# array, whatever document is passed in.
#
# @param doc the document handed to the crawler (not used here)
# @return [Array] always empty
def extract_data(doc)
  []
end


38
39
40
41
42
43
44
45
46
47
48
49
# File 'lib/apollo_crawler/crawler/spider_crawler.rb', line 38

# Collects the link targets of every anchor element in +doc+.
#
# Each anchor's +href+ is handed to BaseCrawler.try_get_url together with
# this crawler's own #url (presumably to resolve relative links against it;
# confirm in BaseCrawler). Anchors for which that call yields nil are
# skipped, and duplicate links are reported only once.
#
# @param doc [#xpath] document whose +xpath+ result is an enumerable of
#   nodes that can be indexed by attribute name
# @return [Array<Hash>] unique hashes of the form <tt>{ :link => String }</tt>
def extract_links(doc)
  links = doc.xpath("//a").map { |node|
    resolved = BaseCrawler.try_get_url(url, node['href'])
    # Check for nil BEFORE stringifying: nil.to_s is "", which would make
    # the check unreachable and let empty links through.
    next if resolved.nil?

    {
      :link => resolved.to_s
    }
  }

  # `next` leaves nil placeholders in the mapped array; drop them before
  # de-duplicating.
  links.compact.uniq
end

#name ⇒ Object



26
27
28
# File 'lib/apollo_crawler/crawler/spider_crawler.rb', line 26

# Human-readable name of this crawler.
#
# @return [String] the literal "Spider"
def name
  "Spider"
end

#url ⇒ Object



30
31
32
# File 'lib/apollo_crawler/crawler/spider_crawler.rb', line 30

# URL associated with this crawler; #extract_links passes it to
# BaseCrawler.try_get_url alongside each anchor's href.
#
# @return [String] the literal "http://www.wikipedia.org/"
def url
  "http://www.wikipedia.org/"
end