Class: Rssdump::Scraper

Inherits:
Object
  • Object
show all
Includes:
Logging, Cleaning
Defined in:
lib/rssdump/scraper.rb

Constant Summary

Constants included from Cleaning

Cleaning::URLS

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from Cleaning

#clean_html, #clean_link, #ensure_valid, #parse_pub_date

Instance Attribute Details

#errors ⇒ Object (readonly)

Returns the value of attribute errors.



10
11
12
# File 'lib/rssdump/scraper.rb', line 10

# Read accessor for the @errors instance variable.
#
# @return [Object] whatever is currently stored in @errors; +scrap+ assigns a
#   fresh empty Array to it at the start of every call
def errors
  @errors
end

Instance Method Details

#scrap(feed, feed_name = "_") ⇒ Object



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/rssdump/scraper.rb', line 12

# Reads the feed at +feed+, parses it with SimpleRSS and converts every entry
# into an Item.
#
# An entry whose conversion raises a StandardError is logged and left out of
# the result; the remaining entries are still returned. Errors raised while
# opening, validating or parsing the feed itself are not rescued here and
# propagate to the caller.
#
# @param feed [Object] location handed to +open+ (presumably a URL or a file
#   path - confirm against callers); also stored on each Item as +feed+
# @param feed_name [String] label stored on each Item as +feed_name+
# @return [Array<Item>] the successfully converted entries, in feed order
def scrap(feed, feed_name = "_")
  # Every run starts with an empty error list. Nothing in this method appends
  # to it. NOTE(review): confirm whether the Cleaning helpers record into
  # @errors, otherwise it always stays empty.
  @errors = []

  # Block form, so the handle returned by +open+ is closed as soon as its
  # content has been read instead of being left for the garbage collector.
  # NOTE(review): Kernel#open treats a leading "|" as a command to run; if
  # +feed+ can come from untrusted input, switch to URI.open / File.open.
  raw_feed = open(feed, &:read)
  rss = SimpleRSS.parse(ensure_valid(raw_feed))

  converted = rss.items.map do |item|
    begin
      ritem = Item.new
      ritem.title = clean_html(item.title)
      ritem.category = clean_html(item.category)
      ritem.description = clean_html(item.description)
      # Fall back to +updated+ when the entry carries no +pubDate+.
      ritem.pub_date = item.pubDate || item.updated
      ritem.link = clean_link(item.link)
      ritem.feed = feed
      ritem.feed_name = feed_name
      ritem
    rescue => e
      logger.error "An error occurred during cleaning with item #{item.link}."
      logger.error "#{e}\n#{e.backtrace.join("\n")}"
      logger.warn "Ignoring item #{item.link}."
      # nil marks the entry as failed; it is removed by +compact+ below.
      nil
    end
  end

  converted.compact
end