Module: Rssdump::Cleaning

Included in:
Scraper
Defined in:
lib/rssdump/cleaning.rb

Constant Summary

URLS =
/https?:\/\/[\S]+/

Instance Method Summary

Instance Method Details

#clean_html(txt) ⇒ Object



12
13
14
15
16
17
18
19
20
21
22
# File 'lib/rssdump/cleaning.rb', line 12

# Reduces an HTML fragment to plain, single-spaced text.
#
# Steps, in order: repair the encoding via ensure_valid, unescape HTML
# entities, let Nokogiri strip the markup, drop every http(s) URL, keep only
# what precedes the first "LIRE AUSSI" marker, then collapse whitespace runs
# (newlines, carriage returns, tabs, spaces) into single spaces and trim.
#
# @param txt [String, nil] raw HTML text; nil/false is accepted
# @return [String] the cleaned text, or "" when txt is nil/false
def clean_html(txt)
  return "" unless txt

  # ensure_valid hands back the transcoded string; its return value must be
  # used rather than relying on the in-place side effect on the argument.
  txt = ensure_valid(txt)
  plain = Nokogiri::HTML(CGI.unescapeHTML(txt)).text

  # split returns [] when nothing precedes the marker (or the text is empty),
  # hence the "" fallback.
  lead = plain.gsub(URLS, "").split("LIRE AUSSI").first || ""
  lead.gsub(/[\n\r\t ]+/, " ").strip
end


8
9
10
# File 'lib/rssdump/cleaning.rb', line 8

# Normalizes a link by trimming surrounding whitespace.
#
# @param url [String, nil] the link to clean; nil/false is treated as empty
# @return [String] the stripped link, or "" when url is nil/false
def clean_link(url)
  link = url || ""
  link.strip
end

#ensure_valid(txt) ⇒ Object



28
29
30
31
32
33
34
# File 'lib/rssdump/cleaning.rb', line 28

# Returns text that is valid in its encoding.
#
# A string whose bytes are already valid for its current encoding is returned
# as-is (same object). Otherwise the bytes are reinterpreted as ISO-8859-1 and
# transcoded to UTF-8.
#
# NOTE: the reinterpretation uses force_encoding, which re-tags the string
# passed in (the receiver is modified in place); the UTF-8 result is a
# separate string.
#
# @param txt [String] the text to check
# @return [String] txt itself when valid, otherwise a UTF-8 transcoding
def ensure_valid(txt)
  return txt if txt.valid_encoding?

  txt.force_encoding("ISO-8859-1").encode("UTF-8")
end

#parse_pub_date(str) ⇒ Object



24
25
26
# File 'lib/rssdump/cleaning.rb', line 24

# Parses a publication date given in RFC 2822 format.
#
# Delegates directly to Time.rfc2822, so any error it raises for input it
# cannot parse propagates to the caller.
#
# @param str [String] date string, e.g. "Wed, 05 Oct 2011 22:26:12 -0400"
# @return [Time] the parsed time
def parse_pub_date(str)
  Time.rfc2822(str)
end