Class: FeedYamlizer::HtmlCleaner
- Inherits:
-
Object
- Object
- FeedYamlizer::HtmlCleaner
- Includes:
- FileUtils::Verbose
- Defined in:
- lib/feed_yamlizer/html_cleaner.rb
Class Method Summary collapse
Instance Method Summary collapse
- #decode_entities ⇒ Object
-
#initialize(html) ⇒ HtmlCleaner
constructor
Takes the HTML to clean.
- #output ⇒ Object
- #parse ⇒ Object
Constructor Details
#initialize(html) ⇒ HtmlCleaner
Takes the HTML to clean (the html argument), presumably extracted from feed data generated with FeedParser.
18 19 20 21 22 23 |
# File 'lib/feed_yamlizer/html_cleaner.rb', line 18
#
# Builds a cleaner for one piece of HTML and eagerly computes the cleaned
# text, which is afterwards available through #output.
#
# html - the markup to clean; stored in @html and handed to .tidy.
def initialize(html)
  @html = html
  decode_entities # return value is not used here
  # tidy is run with -asxml, so #parse receives XML for the stream parser.
  @xml = self.class.tidy(@html)
  parsed = parse
  # Drop any "<http...>" spans that remain in the parsed text.
  @result = parsed.gsub(/<http[^>]+>/, "")
end
Class Method Details
.tidy(html) ⇒ Object
40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
# File 'lib/feed_yamlizer/html_cleaner.rb', line 40
#
# Wraps +html+ in a minimal XHTML 1.0 Transitional document, pipes that
# document through the external `tidy` program and returns what tidy writes
# to its standard output.
#
# html - markup interpolated into the <body> of the wrapper document.
#
# Returns the String read from tidy's standard output.
def self.tidy(html)
  # The active command passes -utf8 together with -raw. Earlier variants
  # (previously kept here as commented-out code) used
  # "tidy -q -n -wrap 120 -asxml -latin1" for Latin-1 input.
  # tidy's stderr is discarded by the shell redirection.
  command = "tidy -q -wrap 120 -n -raw -utf8 -asxml 2>/dev/null"
  document = <<-END
      <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
      <html xmlns="http://www.w3.org/1999/xhtml">
      <head><title></title></head><body>#{html}</body></html>
  END
  # IO.popen returns the value of the block, i.e. everything tidy printed.
  IO.popen(command, "r+") do |pipe|
    pipe.puts document
    pipe.close_write # close our write end before reading tidy's output
    pipe.read
  end
end
Instance Method Details
#decode_entities ⇒ Object
35 36 37 38 |
# File 'lib/feed_yamlizer/html_cleaner.rb', line 35
#
# Decodes HTML entities in @html and stores the decoded text back in @html.
#
# The decoded string used to be computed and then dropped, so calling this
# method for its effect (as the constructor does) changed nothing. The result
# is now assigned to @html; the return value is unchanged.
#
# NOTE(review): entity-escaped markup in the input is now turned into real
# markup before tidy runs -- confirm this is the intended pipeline.
#
# Returns the decoded String.
def decode_entities
  @html = HTMLEntities.new.decode(@html)
end
#output ⇒ Object
25 26 27 |
# File 'lib/feed_yamlizer/html_cleaner.rb', line 25
#
# Returns the value stored in @result, which the constructor fills with the
# cleaned text.
def output
  @result
end
#parse ⇒ Object
29 30 31 32 33 |
# File 'lib/feed_yamlizer/html_cleaner.rb', line 29
#
# Stream-parses @xml with REXML, feeding events to a fresh HtmlListener
# (kept in @listener), and returns the listener's result followed by two
# newlines.
def parse
  listener = @listener = HtmlListener.new
  REXML::Document.parse_stream(@xml, listener)
  listener.result + "\n\n"
end