Class: FeedYamlizer::HtmlCleaner

Inherits:
Object
  • Object
show all
Includes:
FileUtils::Verbose
Defined in:
lib/feed_yamlizer/html_cleaner.rb

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(html) ⇒ HtmlCleaner

Takes the HTML to clean as a string. The cleaned text is available afterwards via #output.



18
19
20
21
22
23
# File 'lib/feed_yamlizer/html_cleaner.rb', line 18

# Builds a cleaner for one piece of markup and runs the whole pipeline
# immediately; the finished text is read back through #output.
#
# Steps, in order: store the markup, call decode_entities, pass @html
# through the class-level tidy helper, parse the tidied XML, then strip
# any remaining "<http...>" angle-bracketed spans from the parsed text.
#
# @param html the markup to clean (it is interpolated into the document
#   handed to tidy)
def initialize(html)
  @html = html
  decode_entities
  tidied = self.class.tidy(@html)
  @xml = tidied
  parsed = parse
  @result = parsed.gsub(/<http[^>]+>/, "")
end

Class Method Details

.tidy(html) ⇒ Object



40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# File 'lib/feed_yamlizer/html_cleaner.rb', line 40

# Pipes +html+ through the external `tidy` command and returns whatever
# tidy writes to its standard output.
#
# The fragment is wrapped in a minimal XHTML 1.0 Transitional document
# before being sent to tidy. tidy's stderr is discarded by the
# "2>/dev/null" redirect in the command string.
#
# Earlier (now removed) variants of the command used -latin1; the current
# invocation passes -raw -utf8.
#
# @param html the markup interpolated into the <body> of the wrapper document
# @return [String] the text read back from tidy's stdout
def self.tidy(html)
  command = "tidy -q -wrap 120 -n -raw -utf8 -asxml 2>/dev/null"
  IO.popen(command, "r+") do |pipe|
    input = <<-END
  <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
  <html xmlns="http://www.w3.org/1999/xhtml">
  <head><title></title></head><body>#{html}</body></html>
    END
    pipe.puts input
    # Close our write end so tidy sees EOF and can finish producing output.
    pipe.close_write
    # The block's value (tidy's stdout) is what IO.popen returns.
    pipe.read
  end
end

Instance Method Details

#decode_entities ⇒ Object



35
36
37
38
# File 'lib/feed_yamlizer/html_cleaner.rb', line 35

# Decodes HTML entities in @html and stores the decoded text back into
# @html.
#
# Previously the decoded string was computed and thrown away, so calling
# this method had no effect on the instance.
#
# @return the decoded text (also the new value of @html)
def decode_entities
  coder = HTMLEntities.new
  @html = coder.decode(@html)
end

#output ⇒ Object



25
26
27
# File 'lib/feed_yamlizer/html_cleaner.rb', line 25

# Returns the value stored in @result.
def output
  @result
end

#parse ⇒ Object



29
30
31
32
33
# File 'lib/feed_yamlizer/html_cleaner.rb', line 29

# Stream-parses @xml with a fresh HtmlListener (kept in @listener) and
# returns the listener's result followed by two newlines.
def parse
  listener = HtmlListener.new
  @listener = listener
  REXML::Document.parse_stream(@xml, listener)
  text = listener.result
  text + "\n\n"
end