Module: FeedParser

Defined in:
lib/feedparser/sgml-parser.rb,
lib/feedparser/version.rb,
lib/feedparser/feedparser.rb,
lib/feedparser/html-output.rb,
lib/feedparser/text-output.rb,
lib/feedparser/html2text-parser.rb

Overview

A parser for SGML, using the derived class as a static DTD. Source: raa.ruby-lang.org/project/html-parser

Defined Under Namespace

Classes: AtomItem, Feed, FeedItem, HTML2TextParser, RSSItem, SGMLParser, UnknownFeedTypeException

Constant Summary collapse

# Version string of the FeedParser library.
VERSION =
"0.10.0"
# Inline CSS, wrapped in a <style> element, covering the page body, the
# header/metadata/attachments tables, <pre> blocks, blockquotes and
# horizontal rules.
# NOTE(review): presumably embedded by the HTML output code
# (lib/feedparser/html-output.rb) -- confirm against the caller.
STYLESHEET =
<<~EOF
<style type="text/css">
body {
  margin: 2em auto;
  max-width: 960px;
}

table.header {
  margin-bottom: 1em;
}

table.header, table.metadata, table.attachments {
  font-family: Helvetica, Verdana, sans-serif;
}

table.header, table.metadata, table.attachments, pre {
  width: 100%;
  padding: 0.5em;
  background: #eeeeec;
  border: 1px solid #babdb6;
}

table.header th, table.metadata th, table.attachments th {
  text-align: right;
  width: 50px;
}

blockquote {
  font-style: italic;
  color: #2e3436;
  border-left: 2px solid #babdb6;
  padding-left: 0.5em;
}

hr {
  border: none;
  border-top: 1px solid #babdb6;
  margin: 1em auto
}
</style>
EOF

Class Method Summary collapse

Class Method Details

.getcontent(e, feed = nil) ⇒ Object



393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
# File 'lib/feedparser/feedparser.rb', line 393

# Extracts the content of the REXML element +e+ as an HTML string.
#
# Text children that are empty (or a lone newline) are ignored. Then:
# * several children: their serialized forms (comments skipped) are
#   concatenated and converted;
# * a single Text or CDATA child: its text is converted;
# * a single Element child: the method recurses into that element;
# * any other single node: its +text+ is converted if the node has one.
#
# Conversion goes through the String helpers +toUTF8+, +rmWhiteSpace!+ and
# +text2html+, which are defined elsewhere in the library (not visible here).
#
# @param e [REXML::Element] element whose content is extracted
# @param feed [Object, nil] optional feed; supplies +encoding+ and is passed
#   on to +text2html+. Without a feed, 'utf-8' is assumed.
# @return [String, nil] the converted content, or nil when +e+ has no usable
#   children (including the case of a lone comment or processing instruction)
def FeedParser::getcontent(e, feed = nil)
  encoding = feed ? feed.encoding : 'utf-8'
  # instance_of? (exact class) on purpose: REXML::CData is a subclass of
  # REXML::Text and must not be treated as ignorable whitespace here.
  children = e.children.reject do |i|
    i.instance_of?(REXML::Text) && i.to_s.chomp.empty?
  end
  return nil if children.empty?

  if children.length > 1
    s = children.reduce('') do |acc, c|
      c.instance_of?(REXML::Comment) ? acc : acc + c.to_s
    end
    return s.toUTF8(encoding).rmWhiteSpace!.text2html(feed)
  end

  c = children.first
  if c.instance_of?(REXML::Text)
    e.text.toUTF8(encoding).rmWhiteSpace!.text2html(feed)
  elsif c.instance_of?(REXML::CData)
    c.to_s.toUTF8(encoding).rmWhiteSpace!.text2html(feed)
  elsif c.instance_of?(REXML::Element)
    # only one element. recurse.
    getcontent(c, feed)
  elsif c.respond_to?(:text) && c.text
    # Comments and processing instructions have no #text; without the
    # respond_to? guard a lone comment child raised NoMethodError.
    c.text.toUTF8(encoding).text2html(feed)
  end
end

.recode(str) ⇒ Object



17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# File 'lib/feedparser/feedparser.rb', line 17

# Returns a UTF-8 copy of +str+, guessing its current encoding with
# Magic.guess_string_mime_encoding.
#
# * If Magic raises one of its own errors (Magic::Exception / Magic::Error),
#   UTF-8 is assumed.
# * If the guess is 'unknown-8bit', growing prefixes of the string are probed
#   until one yields an encoding other than 'us-ascii'/'unknown-8bit'. For
#   'iso-8859-1' the bytes 128..159 (C1 control characters) are dropped.
# * If the resulting encoding name is not known to Ruby (for example when the
#   probe never got past 'unknown-8bit'), UTF-8 is assumed instead of raising.
# * Characters that are invalid in the chosen encoding are removed before
#   transcoding.
#
# The argument is never modified, so frozen strings are accepted.
#
# @param str [String] raw input in an unknown encoding
# @return [String] the content transcoded to UTF-8
# @raise [StandardError] any non-Magic error raised while guessing is re-raised
def self.recode(str)
  encoding = nil
  begin
    encoding = Magic.guess_string_mime_encoding(str)
  rescue => e
    # Matched by class name, not by constant.
    raise unless e.class.to_s =~ /\AMagic::(?:Exception|Error)\z/
    # this happens when magic does not find any content at all, e.g. with
    # strings that contain only whitespace. In these cases it *should* be safe
    # to assume UTF-8
    encoding = Encoding::UTF_8
  end
  if encoding == 'unknown-8bit'
    # find first substring with a valid encoding that is not us-ascii
    length = 1 # has to start at 1, magic requires at least 2 bytes
    while length < str.length && ['us-ascii', 'unknown-8bit'].include?(encoding)
      encoding = Magic.guess_string_mime_encoding(str[0..length])
      length += 1
    end
    # need to remove iso-8859-1 control characters
    if encoding == 'iso-8859-1'
      str = str.bytes.select { |c| c < 128 || c > 159 }.pack('C*')
    end
  end
  # The guess may be a name Ruby does not know (notably 'unknown-8bit' when
  # the prefix scan found nothing better); force_encoding would raise
  # ArgumentError on it, so fall back to UTF-8 in that case.
  begin
    encoding = Encoding.find(encoding)
  rescue ArgumentError
    encoding = Encoding::UTF_8
  end
  # dup: force_encoding mutates its receiver; leave the caller's string alone.
  str = str.dup.force_encoding(encoding)
  str = str.chars.select(&:valid_encoding?).join
  str.encode('UTF-8')
end