Module: Rssdump::Cleaning
- Included in:
- Scraper
- Defined in:
- lib/rssdump/cleaning.rb
Constant Summary
- URLS =
/https?:\/\/[\S]+/
Instance Method Summary
- #clean_html(txt) ⇒ Object
- #clean_link(url) ⇒ Object
- #ensure_valid(txt) ⇒ Object
- #parse_pub_date(str) ⇒ Object
Instance Method Details
#clean_html(txt) ⇒ Object
12 13 14 15 16 17 18 19 20 21 22 |
# File 'lib/rssdump/cleaning.rb', line 12
#
# Reduces an HTML fragment to a single line of plain text.
#
# Processing order: repair the encoding, unescape HTML entities, extract the
# text content with Nokogiri, delete everything matching URLS, keep only the
# part before the first "LIRE AUSSI" marker, then collapse runs of
# whitespace into single spaces and trim the ends.
#
# @param txt [String, nil] raw HTML (possibly entity-escaped)
# @return [String] the cleaned text; "" when +txt+ is nil or false
def clean_html(txt)
  return "" unless txt

  # Operate on a copy and use ensure_valid's return value: the helper
  # re-tags its argument in place, which would otherwise alter the caller's
  # string (and raise on a frozen one).
  source = ensure_valid(txt.dup)
  text = Nokogiri::HTML(CGI.unescapeHTML(source)).text

  # "LIRE AUSSI" is presumably a "read also" trailer appended by the feed
  # (TODO confirm). split returns [] for an empty string, hence the "" fallback.
  text = text.gsub(URLS, "").split("LIRE AUSSI")[0] || ""
  text.gsub(/[\n\r\t ]+/, " ").strip
end
#clean_link(url) ⇒ Object
8 9 10 |
# File 'lib/rssdump/cleaning.rb', line 8
#
# Trims leading and trailing whitespace from a link.
#
# @param url [String, nil] the link as read from the feed
# @return [String] the stripped link; "" when +url+ is nil or false
def clean_link(url)
  return "" unless url

  url.strip
end
#ensure_valid(txt) ⇒ Object
28 29 30 31 32 33 34 |
# File 'lib/rssdump/cleaning.rb', line 28
#
# Returns text that is valid in its encoding, treating invalid input as
# ISO-8859-1 bytes and transcoding them to UTF-8.
#
# Note that force_encoding re-tags the receiver in place, so an invalid
# +txt+ is left tagged as ISO-8859-1 after this call; the transcoded UTF-8
# copy is what gets returned.
#
# @param txt [String] the text to check
# @return [String] +txt+ itself when already valid, otherwise a new UTF-8 string
def ensure_valid(txt)
  return txt if txt.valid_encoding?

  txt.force_encoding("ISO-8859-1").encode("UTF-8")
end
#parse_pub_date(str) ⇒ Object
24 25 26 |
# File 'lib/rssdump/cleaning.rb', line 24
#
# Parses a publication date written in RFC 2822 form
# (e.g. "Wed, 05 Oct 2011 22:26:12 -0400").
#
# Delegates entirely to Time.rfc2822, which comes from the 'time' standard
# library extension (presumably required elsewhere in the project - verify).
#
# @param str [String] the date string
# @return [Time] the parsed time
# @raise [ArgumentError] when +str+ is not a valid RFC 2822 date
def parse_pub_date(str)
  Time.rfc2822(str)
end