Module: HtmlToPlainText

Included in:
Premailer
Defined in:
lib/html_to_plain_text.rb

Overview

Support functions for Premailer

Instance Method Summary

Instance Method Details

#convert_to_text(html, line_length, from_charset = 'UTF-8', shorten = false) ⇒ Object

Returns the text in UTF-8 format with all HTML tags removed

TODO:

- add support for DL, OL


12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/html_to_plain_text.rb', line 12

# Converts an HTML fragment into plain text.
#
# Headings become text with rules of '*' (H1) or '-' (H2 above and below,
# H3+ below only), links are rewritten as "text [url]", list items are
# prefixed with "  * ", closing paragraph tags become blank lines, all
# remaining tags are removed and HTML entities are decoded. The result is
# then passed through Text::Reform with a template of +line_length+ '['
# characters.
#
# TODO: add support for DL, OL
#
# @param html [String] HTML source; the caller's string is not modified
# @param line_length [Integer] width of the Text::Reform template and the
#   maximum length of heading rules
# @param from_charset [String] accepted for interface compatibility; this
#   method does not currently use it
# @param shorten [Boolean] when true, link URLs are passed through Bitly4R
# @return [String] the plain-text rendering
def convert_to_text(html, line_length, from_charset = 'UTF-8', shorten = false)
  reformer = Text::Reform.new(:trim => true,
                              :squeeze => false,
                              :break => Text::Reform.break_wrap)
  entities = HTMLEntities.new

  # SECURITY(review): API credentials are hard-coded here. They are kept so
  # existing behaviour is unchanged, but should be moved to configuration
  # and rotated.
  bitly = shorten ? Bitly4R.Keyed("mikedamage", "R_abb45e99634386334b7ed6c8d081e80e") : nil

  # Work on a private copy: the in-place substitutions below must not leak
  # into the caller's string (and must tolerate frozen or nil input).
  txt = html.to_s.dup

  # Headings. The lazy quantifier plus the \1 back-reference keep adjacent
  # headings on one line from being merged into a single match.
  txt.gsub!(/<h([0-9]+)[^>]*>(.*?)<\/h\1>/i) do
    match = Regexp.last_match
    hlevel = match[1].to_i
    htext = match[2].gsub(/<\/?[^>]*>/i, '')            # remove tags inside headings
    rule_length = [htext.length, line_length].min       # never wider than a line

    case hlevel
    when 1                                              # H1: asterisks above and below
      rule = '*' * rule_length
      "#{rule}\n#{htext}\n#{rule}\n"
    when 2                                              # H2: dashes above and below
      rule = '-' * rule_length
      "#{rule}\n#{htext}\n#{rule}\n"
    else                                                # H3-H6: dashes below
      "#{htext}\n#{'-' * rule_length}\n"
    end
  end

  # Links. Lazy matching handles each anchor separately; requiring
  # whitespace after "<a" avoids matching tags such as <abbr>.
  txt.gsub!(/<a\s[^>]*?href="([^"]*)"[^>]*>(.*?)<\/a>/i) do
    match = Regexp.last_match
    url = match[1]
    # Entities are decoded later for the whole text, so the shortener needs
    # the decoded URL explicitly.
    url = bitly.shorten(entities.decode(url)) if bitly
    "#{match[2]} [#{url}]"
  end

  txt.gsub!(/(<li[\s]+[^>]*>|<li>)/i, '  * ')           # unordered lists
  txt.gsub!(/<\/p>/i, "\n\n")                           # paragraphs

  txt.gsub!(/<\/?[^>]*>/, '')                           # strip remaining tags

  # Decode entities only after tags are gone, so escaped markup such as
  # "&lt;b&gt;" survives as literal text instead of being stripped as a tag.
  txt = entities.decode(txt)

  txt.gsub!(/\A[\s]+|[\s]+\Z|^[ \t]+/m, '')             # strip extra spaces
  txt.gsub!(/[\n]{3,}/m, "\n\n")                        # tighten line breaks

  txt = reformer.format('[' * line_length, txt)         # wrap text
  txt.gsub!(/^[\*][\s]/m, '  * ')                       # add spaces back to lists
  txt
end