Class: Rssdump::Scraper

Inherits:
Object
  • Object
show all
Includes:
Logging, Cleaning
Defined in:
lib/rssdump/scraper.rb

Constant Summary

Constants included from Cleaning

Cleaning::URLS

Instance Method Summary collapse

Methods included from Cleaning

#clean_html, #clean_link, #ensure_valid, #parse_pub_date

Instance Method Details

#scrap(feed) ⇒ Object



10
11
12
# File 'lib/rssdump/scraper.rb', line 10

# Fetches the feed found at +feed+ and scrapes the items out of it.
#
# @param feed [String] location of the feed; it is opened for reading and
#   also handed to #scrap_from_body as the feed URL
# @return [Object] whatever #scrap_from_body returns for the fetched body
def scrap(feed)
  # URI.open rather than the bare Kernel#open: Ruby 3.0 removed open-uri's
  # Kernel#open override, so open("http://...") is treated as a file path
  # there. This relies on open-uri being loaded, which the previous call
  # already needed for remote feeds. The block form closes the handle as
  # soon as the body has been read instead of leaving it to the GC.
  body = URI.open(feed, &:read)
  scrap_from_body(body, feed)
end

#scrap_from_body(body, feed_url) ⇒ Object



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# File 'lib/rssdump/scraper.rb', line 14

# Turns an already-fetched feed body into a list of Item records.
#
# The body is passed through ensure_valid and parsed with SimpleRSS. Each
# parsed entry is copied into a fresh Item; an entry that raises while being
# converted is logged, recorded in the error list and left out of the result.
#
# @param body [Object] raw feed content, handed to ensure_valid
# @param feed_url [Object] stored as-is on every item's +feed+ attribute
# @return [Hash] +:status+ (+:ok+, or +:ko+ when at least one entry failed),
#   +:errors+ (the rescued exceptions) and +:items+ (the converted entries)
def scrap_from_body(body, feed_url)
  # NOTE(review): nothing visible in this method reads @errors; kept because
  # other code in the class may depend on it being reset here - confirm.
  @errors = []
  parsed_feed = SimpleRSS.parse(ensure_valid(body))
  outcome = :ok
  failures = []

  converted = parsed_feed.items.map do |entry|
    begin
      Item.new.tap do |record|
        record.title = clean_html(entry.title)
        record.category = clean_html(entry.category)
        record.description = clean_html(entry.description)
        # Prefer the publication date, fall back to the update date.
        record.pub_date = entry.pubDate || entry.updated
        record.link = clean_link(entry.link)
        record.feed = feed_url
      end
    rescue => failure
      # One broken entry must not abort the whole feed: flag the run,
      # remember the exception and yield nil so the entry is dropped below.
      outcome = :ko
      failures << failure
      logger.error "An error occurred during cleaning with item #{entry.link}."
      logger.error "#{failure}\n#{failure.backtrace.join("\n")}"
      logger.warn "Ignoring item #{entry.link}."
      nil
    end
  end

  { status: outcome, errors: failures, items: converted.compact }
end