Class: Apollo::Crawler::HackerNewsCrawler

Inherits:
BaseCrawler
  • Object
show all
Defined in:
lib/apollo_crawler/crawler/hacker_news_crawler.rb

Constant Summary collapse

@@MATCHER_ITEM =
"(//td[@class = 'title']/a)[not(position() > last() -1)]"

Instance Method Summary collapse

Methods inherited from BaseCrawler

create_metadoc, #enqueue_url, #etl, fetch, #fetch_document, #initialize, name_re, #process_url, try_get_doc, try_get_url, #url_processed?

Constructor Details

This class inherits a constructor from Apollo::Crawler::BaseCrawler

Instance Method Details

#extract_data(doc) ⇒ Object



36
37
38
39
40
41
42
43
44
45
46
47
48
# File 'lib/apollo_crawler/crawler/hacker_news_crawler.rb', line 36

# Builds the list of items found in a parsed page.
#
# Every node selected by the @@MATCHER_ITEM XPath expression is turned into a
# hash holding the node's text and its link.
#
# @param doc [#xpath] parsed document; only +xpath+ is called on it
# @return [Array<Hash>] one hash per usable node, with :text (the node text)
#   and :link (the URL as a String)
def extract_data(doc)
	items = doc.xpath(@@MATCHER_ITEM).map { |node|
		# Check for nil BEFORE converting to a String: nil.to_s is "", so a nil
		# check performed after the conversion can never fire.
		# NOTE(review): assumes BaseCrawler.try_get_url returns nil when it
		# cannot build a URL -- confirm against BaseCrawler.
		url = BaseCrawler.try_get_url(self.url, node['href'])
		next if url.nil?

		{
			:text => node.text,
			:link => url.to_s
		}
	}

	# `next` inside map leaves a nil placeholder for each skipped node.
	items.compact
end


50
51
52
53
54
55
56
57
58
59
60
61
# File 'lib/apollo_crawler/crawler/hacker_news_crawler.rb', line 50

# Collects the follow-up link(s) found in a parsed page.
#
# The XPath expression keeps only the last anchor located directly inside a
# <td class="title"> cell.
#
# @param doc [#xpath] parsed document; only +xpath+ is called on it
# @return [Array<Hash>] unique hashes of the form { :link => String }
def extract_links(doc)
	links = doc.xpath("(//td[@class = 'title']/a)[last()]").map { |node|
		# Check for nil BEFORE converting to a String: nil.to_s is "", so a nil
		# check performed after the conversion can never fire.
		# NOTE(review): assumes BaseCrawler.try_get_url returns nil when it
		# cannot build a URL -- confirm against BaseCrawler.
		url = BaseCrawler.try_get_url(self.url, node['href'])
		next if url.nil?

		{
			:link => url.to_s
		}
	}

	# `next` inside map leaves a nil placeholder for each skipped node; drop
	# those before de-duplicating.
	links.compact.uniq
end

#name ⇒ Object



28
29
30
# File 'lib/apollo_crawler/crawler/hacker_news_crawler.rb', line 28

# Human-readable name of this crawler.
#
# @return [String] always "Hacker News"
def name
	"Hacker News"
end

#url ⇒ Object



32
33
34
# File 'lib/apollo_crawler/crawler/hacker_news_crawler.rb', line 32

# Address returned as this crawler's URL; extract_data and extract_links pass
# it to BaseCrawler.try_get_url together with each node's href.
#
# @return [String] always "http://news.ycombinator.com/"
def url
	"http://news.ycombinator.com/"
end