Class: String
- Inherits:
-
Object
- Object
- String
- Defined in:
- lib/feedparser/textconverters.rb,
lib/feedparser/text-output.rb
Overview
This class provides various converters
Constant Summary collapse
- MY_ENTITIES =
{}
Instance Method Summary collapse
- #escape_html ⇒ Object
-
#escaped_html? ⇒ Boolean
returns true if the text contains escaped HTML (with HTML entities).
-
#html2text(wrapto = false) ⇒ Object
Convert an HTML text to plain text.
-
#html? ⇒ Boolean
is this text HTML ? search for tags.
-
#rmWhiteSpace! ⇒ Object
Remove white space around the text.
-
#text2html(feed) ⇒ Object
convert text to HTML.
-
#toUTF8(inputenc) ⇒ Object
Convert a text in inputenc to a text in UTF-8. Must take care of wrong input locales.
-
#unescape_html ⇒ Object
un-escape HTML in the text.
- #wrap_text(text, wrapto = 72) ⇒ Object
Instance Method Details
#escape_html ⇒ Object
17 18 19 20 21 22 |
# File 'lib/feedparser/textconverters.rb', line 17
# Escape the HTML-significant characters of the text.
#
# "&", "<" and ">" are replaced by their entity forms. All three are handled
# in a single pass, so the "&" of a freshly produced entity is never escaped
# a second time.
#
# @return [String] a new, escaped string; the receiver is left untouched
def escape_html
  gsub(/[&<>]/, { '&' => '&amp;', '<' => '&lt;', '>' => '&gt;' })
end
#escaped_html? ⇒ Boolean
Returns a truthy value (the match position) if the text contains escaped HTML (with HTML entities), nil otherwise. Used by String#text2html.
13 14 15 |
# File 'lib/feedparser/textconverters.rb', line 13
# Does the text contain entity-escaped HTML?
#
# Looks for a few tell-tale escaped tags: "&lt;img src=", "&lt;a href=",
# "&lt;br&gt;" (also "&lt;br/&gt;" and "&lt;br /&gt;") and "&lt;p&gt;",
# case-insensitively. A single alternation is used so that the result is the
# position of the earliest such tag in the text.
#
# @return [Integer, nil] position of the first escaped tag, or nil when none
#   is present
def escaped_html?
  self =~ /&lt;(?:img src=|a href=|br(?:\/| \/|)&gt;|p&gt;)/i
end
#html2text(wrapto = false) ⇒ Object
Convert an HTML text to plain text
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 |
# File 'lib/feedparser/text-output.rb', line 6
# Convert an HTML text to plain text.
#
# A clone of the receiver is fed to FeedParser::HTML2TextParser; the parser's
# saved data is then tidied with a fixed series of in-place substitutions.
#
# @param wrapto [Integer, false] when truthy, the tidied text is handed to
#   #wrap_text together with this value; otherwise it is returned as is
# @return the tidied text, or the result of #wrap_text when wrapto is truthy
def html2text(wrapto = false)
  # NOTE(review): the meaning of the `true` constructor argument is not
  # visible from here - confirm against HTML2TextParser.
  parser = FeedParser::HTML2TextParser::new(true)
  parser.feed(self.clone)
  parser.close
  plain = parser.savedata

  # Applied in order, directly on the string returned by the parser.
  [
    [/\A\s*/m, ''],      # leading whitespace
    [/\s*\Z/m, ''],      # trailing whitespace
    [/ *\n/m, "\n"],     # spaces before a newline
    [/\n */m, "\n"],     # spaces after a newline
    [/\n\n+/m, "\n\n"],  # runs of newlines collapse to a single blank line
    [/[ \t]+/, ' ']      # runs of spaces/tabs collapse to one space
  ].each { |pattern, replacement| plain.gsub!(pattern, replacement) }

  wrapto ? wrap_text(plain, wrapto) : plain
end
#html? ⇒ Boolean
Is this text HTML? Searches for tags. Used by String#text2html.
8 9 10 |
# File 'lib/feedparser/textconverters.rb', line 8
# Is this text HTML? Searches for a few common tags.
#
# Recognised (case-insensitively): <p>, </p>, <br> in any of its spellings
# (<br>, <br/>, <br />), </a> and <img ...>. A single alternation is used so
# that the result is the position of the earliest such tag in the text.
#
# @return [Integer, nil] position of the first tag found, or nil when the
#   text contains none of them
def html?
  self =~ /<\/?p>|<br\s*\/?\s*>|<\/a>|<img.*>/i
end
#rmWhiteSpace! ⇒ Object
Remove white space around the text
95 96 97 |
# File 'lib/feedparser/textconverters.rb', line 95
# Remove white space around the text, in place.
#
# The two substitutions are applied one after the other rather than chained:
# bang methods may return nil, so their return value is never used as a
# receiver. Both patterns are anchored, hence a single sub! each is enough.
#
# @return [String] the receiver itself
def rmWhiteSpace!
  sub!(/\A\s*/m, '')
  sub!(/\s*\Z/m, '')
  self
end
#text2html(feed) ⇒ Object
convert text to HTML
40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
# File 'lib/feedparser/textconverters.rb', line 40
# Convert the text to HTML.
#
# Works on a copy of the receiver (dup, so a frozen receiver is fine) and
# returns that copy:
# * text that already looks like HTML (#html?) is kept as is;
# * text that looks like entity-escaped HTML (#escaped_html?) is passed
#   through #unescape_html;
# * anything else is treated as plain text: wrapped in <p>...</p>, split into
#   paragraphs on blank lines, and http/ftp/https URIs are turned into links.
# Finally, when a feed with a link is given, relative src/href attribute
# values are made absolute against that link.
#
# @param feed an object responding to #link (may be nil); the link is used as
#   the base for relative URLs
# @return [String] the HTML version of the text
def text2html(feed)
  html = dup
  real_at = html.html?
  escaped_at = html.escaped_html?

  # Some RSS feeds carry both real and escaped HTML: whichever was reported
  # at the lower position wins (escaped wins a tie).
  flavour =
    if real_at && escaped_at
      real_at < escaped_at ? :real : :escaped
    elsif real_at
      :real
    elsif escaped_at
      :escaped
    else
      :plain
    end

  case flavour
  when :escaped
    html = html.unescape_html
  when :plain
    # paragraphs: one <p> around everything, then split on blank lines
    html.gsub!(/\A\s*(.*)\Z/m, '<p>\1</p>')
    html.gsub!(/\s*\n(\s*\n)+\s*/, "</p>\n<p>")
    # uris: linkify, unless preceded by a quote
    html.gsub!(/([^'"])(#{URI::DEFAULT_PARSER.make_regexp(['http','ftp','https'])})/, '\1<a href="\2">\2</a>')
  end

  # Handle relative URLs in src= / href= attributes.
  if feed && feed.link
    html.gsub!(/(\s(src|href)=['"])([^'"]*)(['"])/) do |attribute|
      begin
        found = Regexp.last_match
        opening, url, closing = found[1], found[3], found[4]
        # Already absolute ("scheme://..." or "scheme:x..."): leave untouched.
        next attribute if url =~ /^\s*\w+:\/\// || url =~ /^\s*\w+:\w/

        base = feed.link
        segments = base.split(/\//)
        if url =~ /^\//
          # host-relative: keep only "scheme://host" of the base
          opening + segments[0..2].join('/') + url + closing
        elsif base =~ /\/$/
          # The base is a directory. This must be tested before the segment
          # count: "http://host/" also splits into three segments and would
          # otherwise yield "http://host//url".
          opening + base + url + closing
        elsif segments.length == 3
          # "http://host" with no trailing /
          opening + base + '/' + url + closing
        else
          # The base is a document: resolve against its directory.
          opening + segments[0...-1].join('/') + '/' + url + closing
        end
      rescue StandardError
        # best effort: on any failure keep the attribute as it was
        attribute
      end
    end
  end
  html
end
#toUTF8(inputenc) ⇒ Object
Convert a text in inputenc to a text in UTF-8. Must take care of wrong input locales.
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 |
# File 'lib/feedparser/textconverters.rb', line 101
# Convert a text declared to be in inputenc to UTF-8, taking care of wrong
# declarations.
#
# * Declared UTF-8 (any letter case): the receiver is returned unchanged.
# * Declared anything else, or nothing (nil): the bytes are checked first; if
#   they already form valid UTF-8 the declaration is considered wrong and the
#   receiver is returned unchanged. The check is done on the raw bytes so the
#   encoding the receiver happens to be tagged with does not matter.
# * Otherwise each byte is taken as one code point (i.e. the text is treated
#   as ISO-8859-1, whatever inputenc says) and re-encoded as UTF-8.
#
# @param inputenc [String, nil] the encoding name announced for the text
# @return [String] the receiver, or a new UTF-8 string
def toUTF8(inputenc)
  return self if inputenc.to_s.downcase == 'utf-8'
  # It is said not to be UTF-8: make sure it REALLY is not.
  return self if dup.force_encoding(Encoding::UTF_8).valid_encoding?

  begin
    unpack('C*').pack('U*')
  rescue StandardError
    self # failsafe: hand the text back untouched
  end
end
#unescape_html ⇒ Object
un-escape HTML in the text. used by String#text2html
31 32 33 34 35 36 37 |
# File 'lib/feedparser/textconverters.rb', line 31
# Un-escape HTML in the text. Used by String#text2html.
#
# Every key/value pair of MY_ENTITIES is applied with gsub, in the constant's
# iteration order; each substitution works on the result of the previous one.
# The receiver is not modified (and is itself returned when MY_ENTITIES is
# empty).
def unescape_html
  MY_ENTITIES.inject(self) do |unescaped, (entity, replacement)|
    unescaped.gsub(entity, replacement)
  end
end
#wrap_text(text, wrapto = 72) ⇒ Object
29 30 31 |
# File 'lib/feedparser/text-output.rb', line 29
# Wrap text so that lines are broken at spaces after at most wrapto
# characters.
#
# A chunk of 1..wrapto characters followed by spaces or an end of line is
# re-emitted followed by a newline; the spaces it ended on are kept. A word
# longer than wrapto is not split.
#
# @param text [String] the text to wrap (the receiver itself is not used)
# @param wrapto [Integer] maximum number of characters per chunk
# @return [String] a new, wrapped string
def wrap_text(text, wrapto = 72)
  chunk = /(.{1,#{wrapto}})( +|$)\n?/
  text.gsub(chunk) do
    "#{Regexp.last_match(1)}#{Regexp.last_match(2)}\n"
  end
end