Class: FeedYamlizer

Inherits:
Object
  • Object
show all
Includes:
FileUtils::Verbose
Defined in:
lib/feed_yamlizer/html_cleaner.rb,
lib/feed_yamlizer.rb,
lib/feed_yamlizer/version.rb,
lib/feed_yamlizer/feed_parser.rb,
lib/feed_yamlizer/feed_listener.rb,
lib/feed_yamlizer/html_listener.rb

Overview

NOTE: requires the HTML Tidy command-line program (`tidy`); see http://tidy.sourceforge.net/docs/Overview.html

Defined Under Namespace

Classes: FeedListener, FeedParser, HtmlCleaner, HtmlListener

Constant Summary collapse

VERSION =
"0.2.0"
NEWLINE_PLACEHOLDER =
'+---NEWLINE---+'
SPACE_PLACEHOLDER =
'+---SPACE---+'
TAB_PLACEHOLDER =
'+---TAB---+'

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(feed) ⇒ FeedYamlizer

Returns a new instance of FeedYamlizer.



38
39
40
41
# File 'lib/feed_yamlizer.rb', line 38

# Stores the feed data and prepares an empty result skeleton.
#
# @param feed [Object] feed data; kept as-is in @feed
def initialize(feed)
  @feed = feed
  # The result starts out with no metadata and no items.
  @result = { meta: {}, items: [] }
end

Class Method Details

.check_for_tidy ⇒ Object



107
108
109
110
111
# File 'lib/feed_yamlizer.rb', line 107

# Aborts the process with an install hint when `which tidy` prints nothing.
#
# @return [nil] when `which tidy` produced some output
def check_for_tidy
  abort "Please install tidy" if `which tidy`.empty?
end

.format(x, indent = 0) ⇒ Object



25
26
27
28
29
30
31
32
33
34
35
36
# File 'lib/feed_yamlizer.rb', line 25

# Re-wraps text with the external `fmt` program and optionally indents it.
#
# The text is wrapped to a width of 75 minus the indent, then every line that
# starts with a non-whitespace character is prefixed with +indent+ spaces.
#
# A nil indent is treated as 0. Previously the `if indent` check could never
# take its else branch: 0 is truthy in Ruby, and nil already raised in
# `75 - indent` before the check was reached.
#
# `fmt` is started from an argument array, so no shell is involved and the
# width can never be interpreted as shell syntax.
#
# @param x [#to_s] text to wrap
# @param indent [Integer, nil] number of leading spaces to add
# @return [String] the wrapped (and indented) text
def self.format(x, indent=0)
  indent ||= 0
  wrapped = IO.popen(["fmt", "-w", (75 - indent).to_s], "r+") do |pipe|
    pipe.puts x
    pipe.close_write
    pipe.read
  end
  return wrapped if indent.zero?
  # Only lines that begin with a non-space character receive the indent.
  wrapped.gsub(/^(?=\S)/, ' ' * indent)
end

.process_url(url) ⇒ Object



126
127
128
129
130
131
132
133
# File 'lib/feed_yamlizer.rb', line 126

# Downloads the feed at +url+ and hands it to run.
#
# The encoding passed to run is the charset reported by the response, else
# the encoding found by xml_encoding, else "UTF-8".
#
# The URL is opened through URI#open (provided by open-uri) instead of
# Kernel#open, so a string beginning with "|" is never executed as a command.
# The block form closes the response once it has been read.
#
# @param url [String] address of the feed
# @return [Object] whatever run returns
def process_url(url)
  URI.parse(url).open do |response|
    xml = response.read
    #STDERR.puts "charset: #{response.charset}"
    encoding = response.charset || xml_encoding(xml) || "UTF-8"
    run xml, encoding
  end
end

.process_xml(xml) ⇒ Object



122
123
124
# File 'lib/feed_yamlizer.rb', line 122

# Converts feed XML that is already in memory.
#
# The encoding comes from xml_encoding. When the document declares none,
# "UTF-8" is used, matching the fallback in process_url and the default of
# run. Previously nil was passed through, which overrode run's default.
#
# @param xml [String] raw feed XML
# @return [Object] whatever run returns
def process_xml(xml)
  run(xml, xml_encoding(xml) || "UTF-8")
end

.run(feed_xml, encoding = 'UTF-8') ⇒ Object

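Main entry point: checks that tidy is installed, converts the feed XML to UTF-8, parses it, and returns the resulting hash.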



114
115
116
117
118
119
120
# File 'lib/feed_yamlizer.rb', line 114

# Main entry point: turns raw feed XML into the FeedYamlizer result.
#
# Steps: verify tidy is available, convert the XML from +encoding+ to UTF-8
# with Iconv (transliterating / ignoring what cannot be converted), parse it
# with FeedParser, then build the final result from the parsed data.
#
# @param feed_xml [String] raw feed XML
# @param encoding [String] encoding of +feed_xml+
# @return [Object] the value of FeedYamlizer#result
def run(feed_xml, encoding='UTF-8')
  check_for_tidy
  utf8_xml = Iconv.conv("UTF-8//TRANSLIT//IGNORE", encoding, feed_xml)
  parsed = FeedYamlizer::FeedParser.new(utf8_xml).result
  FeedYamlizer.new(parsed).result
end

.xml_encoding(rawxml) ⇒ Object



99
100
101
102
103
104
105
# File 'lib/feed_yamlizer.rb', line 99

def xml_encoding(rawxml)
  encoding = rawxml.encode("ascii", invalid: :replace, undef: :replace)[/encoding=["']([^"']+)["']/,1]
  if $debug
    STDERR.puts "xml encoding: #{encoding.inspect}"
  end
  encoding
end

Instance Method Details

#add_feed_metaresult ⇒ Object



53
54
55
56
57
58
59
# File 'lib/feed_yamlizer.rb', line 53

# Fills @result[:meta] from the feed: the title reduced to its inner text,
# plus the feed's link and xml_encoding copied unchanged.
#
# @return [Hash] the meta hash that was stored
def add_feed_metaresult
  plain_title = inner_text(@feed[:title])
  @result[:meta] = {
    title: plain_title,
    link: @feed[:link],
    xml_encoding: @feed[:xml_encoding]
  }
end

#add_item_metaresult(item, index) ⇒ Object



68
69
70
71
72
73
74
75
76
# File 'lib/feed_yamlizer.rb', line 68

# Appends one item's metadata hash to @result[:items].
#
# The title is stored as its inner text (markup removed via inner_text); the
# remaining fields are copied from the item unchanged. Previously :title was
# also part of the copied field list, so the raw title overwrote the cleaned
# one right after it had been computed.
#
# @param item [Hash] parsed feed item, read with symbol keys
# @param index [Integer] position of the item; not used here, kept so the
#   signature stays the same for callers
# @return [Array] @result[:items] after the append
def add_item_metaresult(item, index)
  copied_fields = [:author, :guid, :pub_date, :link, :enclosure, :podcast_image]
  metaresult = {:title => inner_text(item[:title])}
  copied_fields.each { |field| metaresult[field] = item[field] }
  @result[:items] << metaresult
end

#add_items ⇒ Object



61
62
63
64
65
66
# File 'lib/feed_yamlizer.rb', line 61

# Walks the feed's items in order; for each one, records its metadata and
# then its content.
#
# @return [Object] the feed's items collection
def add_items
  @feed[:items].each_with_index do |entry, position|
    add_item_metaresult(entry, position)
    add_raw_content(entry, position)
  end
end

#add_raw_content(item, index) ⇒ Object



78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# File 'lib/feed_yamlizer.rb', line 78

# Attaches the content of +item+ to the most recently added result item.
#
# The item's :content (or :summary, or an empty string) has the leading
# whitespace of every line removed and is stripped; that is stored as
# content[:html]. The same text is run through HtmlCleaner, after which the
# placeholders are expanded (newline -> "\n", space -> " ", tab -> two
# spaces), whitespace-only lines are emptied and a trailing run of three
# newlines is collapsed to two. The outcome is stored as content[:text].
#
# @param item [Hash] parsed feed item, read with symbol keys
# @param index [Integer] position of the item; not used here
# @return [String] the text version that was stored
def add_raw_content(item, index)
  raw_html = item[:content] || item[:summary] || ""
  content = raw_html.gsub(/^\s*/, '').strip
  entry = @result[:items].last
  entry[:content] = { html: content }
  # TODO check if HTML or plain text!
  simplified = HtmlCleaner.new(content).output
  expanded = simplified.
    gsub(FeedYamlizer::NEWLINE_PLACEHOLDER, "\n").
    gsub(SPACE_PLACEHOLDER, " ").
    gsub(TAB_PLACEHOLDER, "  ")
  without_blank_padding = expanded.gsub(/^\s+$/, "")
  # eliminate extra blank lines
  textified = without_blank_padding.gsub(/\n *\n *\n *$/, "\n\n")
  entry[:content][:text] = textified
end

#inner_text(string) ⇒ Object



49
50
51
# File 'lib/feed_yamlizer.rb', line 49

# Parses +string+ as HTML with Nokogiri and returns the document's inner text.
#
# @param string [String] text that may contain HTML markup
# @return [String] the parsed document's inner text
def inner_text(string)
  document = Nokogiri::HTML.parse(string)
  document.inner_text
end

#result ⇒ Object



43
44
45
46
47
# File 'lib/feed_yamlizer.rb', line 43

# Builds and returns the result hash (feed metadata plus all items).
#
# The item list is emptied before it is rebuilt, so calling this method more
# than once returns the same content each time. Previously every additional
# call appended all items again, duplicating them.
#
# @return [Hash] the populated @result
def result
  @result[:items].clear
  add_feed_metaresult
  add_items
  @result
end