Class: String

Inherits:
Object
  • Object
show all
Defined in:
lib/feedparser/textconverters.rb,
lib/feedparser/text-output.rb

Overview

This class provides various converters

Constant Summary collapse

MY_ENTITIES =
{}

Instance Method Summary collapse

Instance Method Details

#escape_html ⇒ Object



17
18
19
20
21
22
# File 'lib/feedparser/textconverters.rb', line 17

# Escape the characters that are special in HTML markup.
#
# '&' is handled first so that the ampersands introduced by the '<' and '>'
# replacements are not escaped a second time.
#
# Returns a new String; the receiver is not modified.
def escape_html
  gsub('&', '&amp;').gsub('<', '&lt;').gsub('>', '&gt;')
end

#escaped_html? ⇒ Boolean

Returns true if the text contains escaped HTML (with HTML entities). Used by String#text2html.

Returns:

  • (Boolean)


13
14
15
# File 'lib/feedparser/textconverters.rb', line 13

# Does this text contain entity-escaped HTML markup?
#
# The patterns are tried in order and the result of the first one that
# matches is returned, so the value is the match offset (truthy Integer)
# rather than a strict boolean. Returns nil when nothing matches.
# String#text2html compares this offset with the one from String#html?.
def escaped_html?
  escaped_tags = [
    /&lt;img src=/i,
    /&lt;a href=/i,
    /&lt;br(\/| \/|)&gt;/i,
    /&lt;p&gt;/i
  ]
  escaped_tags.each do |pattern|
    offset = self =~ pattern
    return offset if offset
  end
  nil
end

#html2text(wrapto = false) ⇒ Object

Convert an HTML text to plain text



6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# File 'lib/feedparser/text-output.rb', line 6

# Convert an HTML text to plain text.
#
# A copy of the receiver is fed to FeedParser::HTML2TextParser and the
# parser's saved data is then cleaned up in place.
#
# wrapto - false (default) to leave lines alone, otherwise the value handed
#          to wrap_text as the wrap width.
#
# Returns the resulting plain text.
def html2text(wrapto = false)
  # parse HTML
  parser = FeedParser::HTML2TextParser.new(true)
  parser.feed(self.clone)
  parser.close
  plain = parser.savedata

  # Clean-up passes, applied strictly in this order.
  cleanups = [
    [/\A\s*/m, ''],        # leading whitespace
    [/\s*\Z/m, ''],        # trailing whitespace
    [/ *\n/m, "\n"],       # spaces before a newline
    [/\n */m, "\n"],       # spaces after a newline
    [/\n\n+/m, "\n\n"],    # runs of blank lines
    [/[ \t]+/, ' ']        # runs of spaces/tabs
  ]
  cleanups.each { |pattern, replacement| plain.gsub!(pattern, replacement) }

  # finally, wrap the text if requested
  wrapto ? wrap_text(plain, wrapto) : plain
end

#html? ⇒ Boolean

Is this text HTML? Searches for tags. Used by String#text2html.

Returns:

  • (Boolean)


8
9
10
# File 'lib/feedparser/textconverters.rb', line 8

# Does this text look like real (unescaped) HTML?
#
# The tag patterns are tried in order and the result of the first one that
# matches is returned, so the value is the match offset (truthy Integer)
# rather than a strict boolean. Returns nil when no pattern matches.
# String#text2html compares this offset with the one from
# String#escaped_html?.
def html?
  tag_patterns = [
    /<p>/i,
    /<\/p>/i,
    /<br>/i,
    /<br\s*(\/)?\s*>/i,
    /<\/a>/i,
    /<img.*>/i
  ]
  tag_patterns.each do |pattern|
    offset = self =~ pattern
    return offset if offset
  end
  nil
end

#rmWhiteSpace! ⇒ Object

Remove white space around the text



95
96
97
# File 'lib/feedparser/textconverters.rb', line 95

# Remove white space around the text, modifying the receiver in place.
#
# Both patterns can match the empty string, so each gsub! always finds a
# match and the receiver itself is what gets returned.
def rmWhiteSpace!
  gsub!(/\A\s*/m, '')
  gsub!(/\s*\Z/m, '')
  self
end

#text2html(feed) ⇒ Object

convert text to HTML



40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# File 'lib/feedparser/textconverters.rb', line 40

# Convert this text to HTML.
#
# Works on a clone of the receiver. Text that already contains HTML tags is
# kept as is, text containing entity-escaped HTML is un-escaped, and any
# other text is wrapped in <p> paragraphs with http/ftp/https URIs turned
# into links. When a feed with a link is given, relative src/href values
# are rewritten against that link.
#
# feed - object responding to #link (used as the base URL); may be nil, in
#        which case no URL rewriting is done.
#
# Returns the converted String.
def text2html(feed)
  text = self.clone
  # Both predicates return the offset of their match (or nil), which is
  # what makes the "<" comparison below possible.
  realhtml = text.html?
  eschtml = text.escaped_html?
  # fix for RSS feeds with both real and escaped html (crazy!):
  # we take the first one
  if (realhtml && eschtml)
    if (realhtml < eschtml)
      eschtml = nil
    else
      realhtml = nil
    end
  end
  if realhtml
    # do nothing
  elsif eschtml
    text = text.unescape_html
  else
    # paragraphs: wrap the whole text, then split it on blank lines
    text.gsub!(/\A\s*(.*)\Z/m, '<p>\1</p>')
    text.gsub!(/\s*\n(\s*\n)+\s*/, "</p>\n<p>")
    # uris: only those not directly preceded by a quote character
    text.gsub!(/([^'"])(#{URI::DEFAULT_PARSER.make_regexp(['http','ftp','https'])})/,
        '\1<a href="\2">\2</a>')
  end
  # Handle broken hrefs in <a> and <img>
  if feed and feed.link
    text.gsub!(/(\s(src|href)=['"])([^'"]*)(['"])/) do |m|
      begin
        # Capture the groups now: the =~ tests below overwrite $1..$4.
        first, url, last = $1, $3, $4
        if (url =~ /^\s*\w+:\/\//) or (url =~ /^\s*\w+:\w/)
          # already has a scheme (scheme:// or scheme:x): leave untouched
          m
        elsif url =~ /^\//
          # starts with "/": prepend the first three "/"-separated parts of
          # the feed link (e.g. "http:", "", "host")
          (first + feed.link.split(/\//)[0..2].join('/') + url + last)
        else
          # anything else: resolve against the feed link
          t = feed.link.split(/\//)
          if t.length == 3 # http://toto with no trailing /
            (first + feed.link + '/' + url + last)
          else
            if feed.link =~ /\/$/
              (first + feed.link + url + last)
            else
              # drop the last segment of the feed link
              (first + t[0...-1].join('/') + '/' + url + last)
            end
          end
        end
      rescue
        # on any error keep the attribute exactly as it was
        m
      end
    end
  end
  text
end

#toUTF8(inputenc) ⇒ Object

Convert a text in inputenc to a text in UTF-8. Must take care of wrong input locales.



101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# File 'lib/feedparser/textconverters.rb', line 101

# Convert a text declared to be in inputenc to UTF-8, allowing for a wrong
# declaration.
#
# inputenc - name of the declared encoding (compared case-insensitively
#            with 'utf-8').
#
# Returns the receiver itself when it is declared as UTF-8, when it
# round-trips through unpack('U*')/pack('U*') unchanged, or when the byte
# conversion fails; otherwise returns a new String built by packing every
# byte value as a code point.
def toUTF8(inputenc)
  return self if inputenc.downcase == 'utf-8'

  # it is said it is not UTF-8. Ensure it is REALLY not UTF-8
  begin
    return self if unpack('U*').pack('U*') == self
  rescue
    # the round trip failed: fall through to the byte conversion
  end

  begin
    unpack('C*').pack('U*')
  rescue
    self # failsafe solution. but a dirty one :-)
  end
end

#unescape_html ⇒ Object

Un-escape HTML in the text. Used by String#text2html.



31
32
33
34
35
36
37
# File 'lib/feedparser/textconverters.rb', line 31

# Un-escape HTML in the text.
#
# Every key of MY_ENTITIES is replaced by its value, one pair after the
# other in the table's iteration order. The receiver is not modified; when
# the table is empty the receiver itself is returned.
def unescape_html
  MY_ENTITIES.reduce(self) do |current, (entity, replacement)|
    current.gsub(entity, replacement)
  end
end

#wrap_text(text, wrapto = 72) ⇒ Object



29
30
31
# File 'lib/feedparser/text-output.rb', line 29

# Wrap text so that lines break after at most wrapto characters where
# possible.
#
# Each run of 1..wrapto characters that is followed by spaces or an end of
# line is re-emitted (together with those spaces) followed by a newline;
# an optional newline directly after the run is consumed.
#
# text   - the String to wrap (not modified).
# wrapto - maximum number of characters per run (default 72).
#
# Returns a new String.
def wrap_text(text, wrapto = 72)
  line_pattern = /(.{1,#{wrapto}})( +|$)\n?/
  rewrapped = '\1\2' + "\n"
  text.gsub(line_pattern, rewrapped)
end