Module: FeedParser
- Defined in:
- lib/feedparser/sgml-parser.rb,
lib/feedparser/version.rb,
lib/feedparser/feedparser.rb,
lib/feedparser/html-output.rb,
lib/feedparser/text-output.rb,
lib/feedparser/html2text-parser.rb
Overview
A parser for SGML, using the derived class as a static DTD. From raa.ruby-lang.org/project/html-parser
Defined Under Namespace
Classes: AtomItem, Feed, FeedItem, HTML2TextParser, RSSItem, SGMLParser, UnknownFeedTypeException
Constant Summary collapse
- VERSION =
"0.10.0"
- STYLESHEET =
<<~EOF
  <style type="text/css">
  body {
    margin: 2em auto;
    max-width: 960px;
  }
  table.header {
    margin-bottom: 1em;
  }
  table.header, table.metadata, table.attachments {
    font-family: Helvetica, Verdana, sans-serif;
  }
  table.header, table.metadata, table.attachments, pre {
    width: 100%;
    padding: 0.5em;
    background: #eeeeec;
    border: 1px solid #babdb6;
  }
  table.header th, table.metadata th, table.attachments th {
    text-align: right;
    width: 50px;
  }
  blockquote {
    font-style: italic;
    color: #2e3436;
    border-left: 2px solid #babdb6;
    padding-left: 0.5em;
  }
  hr {
    border: none;
    border-top: 1px solid #babdb6;
    margin: 1em auto
  }
  </style>
EOF
Class Method Summary collapse
Class Method Details
.getcontent(e, feed = nil) ⇒ Object
393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 |
# File 'lib/feedparser/feedparser.rb', line 393

# Extracts the content of the REXML element +e+ as a converted string.
#
# The target encoding is +feed.encoding+ when a feed is given, 'utf-8'
# otherwise. Child nodes that are exactly REXML::Text and whose string form
# is empty after +chomp+ are ignored. What is returned depends on how many
# children remain:
#
# * several children: the +to_s+ of every child that is not exactly a
#   REXML::Comment, concatenated, then passed through
#   +toUTF8+, +rmWhiteSpace!+ and +text2html+;
# * one REXML::Text child: +e.text+, through the same chain;
# * one REXML::CData child: its +to_s+, through the same chain;
# * one REXML::Element child: the result of recursing into that element;
# * any other single child with a truthy +text+: that text through +toUTF8+
#   and +text2html+ only (no whitespace stripping);
# * otherwise: nil.
#
# NOTE(review): +toUTF8+, +rmWhiteSpace!+ and +text2html+ are not defined in
# this block -- presumably String extensions from elsewhere in the project.
#
# @param e [REXML::Element] element whose content is extracted
# @param feed [Object, nil] optional feed; supplies +encoding+ and is passed
#   on to +text2html+
# @return [String, nil]
def FeedParser::getcontent(e, feed = nil)
  encoding = feed ? feed.encoding : 'utf-8'

  # Exact-class checks are deliberate: REXML::CData subclasses REXML::Text
  # and must not be treated as plain text here.
  nodes = e.children.reject do |node|
    node.instance_of?(REXML::Text) && node.to_s.chomp == ''
  end

  return nil if nodes.empty?

  if nodes.length > 1
    markup = nodes
             .reject { |node| node.instance_of?(REXML::Comment) }
             .inject('') { |acc, node| acc + node.to_s }
    return markup.toUTF8(encoding).rmWhiteSpace!.text2html(feed)
  end

  only = nodes[0]
  if only.instance_of?(REXML::Text)
    return e.text.toUTF8(encoding).rmWhiteSpace!.text2html(feed)
  end
  if only.instance_of?(REXML::CData)
    return only.to_s.toUTF8(encoding).rmWhiteSpace!.text2html(feed)
  end
  # A single wrapping element: descend into it.
  return getcontent(only, feed) if only.instance_of?(REXML::Element)

  only.text ? only.text.toUTF8(encoding).text2html(feed) : nil
end
.recode(str) ⇒ Object
17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
# File 'lib/feedparser/feedparser.rb', line 17

# Returns +str+ converted to UTF-8, after guessing its current encoding with
# Magic.guess_string_mime_encoding.
#
# Steps:
# 1. Ask Magic for the encoding of the whole string. A Magic::Exception or
#    Magic::Error is taken to mean "no content" and UTF-8 is assumed; any
#    other error is re-raised.
# 2. If Magic answers 'unknown-8bit', probe growing prefixes of the string
#    until one yields something other than 'us-ascii' / 'unknown-8bit'.
#    When that is 'iso-8859-1', bytes 128..159 are removed first.
# 3. Tag the string with the chosen encoding, drop characters that are
#    invalid in it, and transcode to UTF-8.
#
# If the encoding name is not one Ruby knows (notably 'unknown-8bit' when no
# prefix could be identified), the string is treated as UTF-8 instead of
# letting +force_encoding+ raise ArgumentError; invalid bytes are then
# removed by step 3.
#
# Note: +force_encoding+ is applied to the object passed in, so the caller's
# string has its encoding tag changed (except on the iso-8859-1 path, where
# a filtered copy is used).
#
# @param str [String] raw string of unknown encoding
# @return [String] the UTF-8 encoded result
def self.recode(str)
  begin
    encoding = Magic.guess_string_mime_encoding(str)
  rescue => e
    raise unless e.class.to_s =~ /\AMagic::(?:Exception|Error)\z/
    # this happens when magic does not find any content at all, e.g. with
    # strings that contain only whitespace. In these cases it *should* be
    # safe to assume UTF-8
    encoding = Encoding::UTF_8
  end

  if encoding == 'unknown-8bit'
    # find first substring with a valid encoding that is not us-ascii
    length = 1 # has to start at 1, magic requires at least 2 bytes
    while length < str.length && ['us-ascii', 'unknown-8bit'].include?(encoding)
      encoding = Magic.guess_string_mime_encoding(str[0..length])
      length += 1
    end
    # need to remove iso-8859-1 control characters
    if encoding == 'iso-8859-1'
      str = str.bytes.select { |c| c < 128 || c > 159 }.map(&:chr).join
    end
  end

  begin
    str.force_encoding(encoding)
  rescue ArgumentError
    # Ruby does not know this encoding name (e.g. still 'unknown-8bit'
    # because no prefix was identifiable). Assume UTF-8, as the rescue
    # branch above does; invalid bytes are filtered out just below.
    str.force_encoding(Encoding::UTF_8)
  end

  str = str.chars.select { |c| c.valid_encoding? }.join
  str.encode('UTF-8')
end