Class: Html2Text

Inherits:
Object
  • Object
show all
Defined in:
lib/html2text.rb,
lib/html2text/version.rb

Constant Summary collapse

DO_NOT_TOUCH_WHITESPACE =
"<do-not-touch-whitespace>"
VERSION =
"0.3.0"

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(doc) ⇒ Html2Text

Returns a new instance of Html2Text.



6
7
8
# File 'lib/html2text.rb', line 6

# Builds a converter around an already-parsed document.
#
# @param doc [Object] stored unchanged in @doc; Html2Text.convert passes the
#   result of Nokogiri::HTML here (other callers, if any, are not visible
#   from this method — confirm before relying on a specific type)
def initialize(doc)
  @doc = doc
end

Instance Attribute Details

#doc ⇒ Object (readonly)

Returns the value of attribute doc.



4
5
6
# File 'lib/html2text.rb', line 4

# Reader for the document this converter was constructed with.
#
# @return [Object] the current value of @doc
def doc
  @doc
end

Class Method Details

.convert(html) ⇒ Object



10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# File 'lib/html2text.rb', line 10

# Converts an HTML string (or anything responding to #to_s) to plain text.
#
# The markup is pre-processed before parsing: Office-specific tags are
# rewritten when is_office_document? says so, fragments are wrapped in a
# <div>, a few entities are replaced and line endings are normalised.
#
# @param html [#to_s] the markup to convert
# @return [Object] whatever Html2Text#convert returns for the parsed document
def self.convert(html)
  markup = html.to_s

  if is_office_document?(markup)
    # Emulate the CSS rendering of Office documents
    markup = markup
      .gsub("<p class=MsoNormal>", "<br>")
      .gsub("<o:p>&nbsp;</o:p>", "<br>")
      .gsub("<o:p></o:p>", "")
  end

  # Stop Nokogiri from inserting in <p> tags
  markup = "<div>#{markup}</div>" unless markup.include?("<html")

  normalized = fix_newlines(replace_entities(markup))
  Html2Text.new(Nokogiri::HTML(normalized)).convert
end

.fix_newlines(text) ⇒ Object



31
32
33
# File 'lib/html2text.rb', line 31

# Normalises line endings to "\n".
#
# @param text [String] text that may contain "\r\n" or bare "\r"
# @return [String] a new string using only "\n" as the line terminator
def self.fix_newlines(text)
  # CRLF pairs must be collapsed first; otherwise the bare-CR pass below
  # would turn each "\r\n" into two newlines.
  without_crlf = text.gsub("\r\n", "\n")
  without_crlf.gsub("\r", "\n")
end

.replace_entities(text) ⇒ Object



35
36
37
# File 'lib/html2text.rb', line 35

# Replaces non-breaking spaces (entity and literal U+00A0) with a plain
# space and removes "&zwnj;" entities.
#
# @param text [String] raw markup
# @return [String] a new string with the substitutions applied
def self.replace_entities(text)
  # Applied strictly in this order, one pass per entry.
  substitutions = [
    ["&nbsp;", " "],
    ["\u00a0", " "],
    ["&zwnj;", ""]
  ]

  substitutions.reduce(text) do |current, (needle, replacement)|
    current.gsub(needle, replacement)
  end
end

Instance Method Details

#convert ⇒ Object



39
40
41
42
43
44
# File 'lib/html2text.rb', line 39

# Renders the wrapped document to text: walks the document with
# iterate_over, tidies whitespace around line breaks, drops unnecessary
# empty lines and finally strips the result.
#
# @return [Object] the result of calling #strip on the cleaned-up output
def convert
  raw = iterate_over(doc)
  trimmed = remove_leading_and_trailing_whitespace(raw)
  remove_unnecessary_empty_lines(trimmed).strip
end

#remove_leading_and_trailing_whitespace(text) ⇒ Object



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/html2text.rb', line 48

# Strips spaces and tabs surrounding each newline, and spaces surrounding
# each tab, everywhere except inside regions delimited by
# DO_NOT_TOUCH_WHITESPACE markers. The markers themselves are removed from
# the result.
#
# @param text [String] text with optional DO_NOT_TOUCH_WHITESPACE markers
# @return [String] the cleaned-up text
def remove_leading_and_trailing_whitespace(text)
  segments = text.split(DO_NOT_TOUCH_WHITESPACE)

  cleaned = segments.each_with_index.map do |segment, position|
    # Splitting on the marker leaves protected (<pre>) content at the odd
    # positions; those pass through untouched.
    next segment unless position.even?

    segment
      .gsub(/[ \t]*\n[ \t]*/im, "\n")
      .gsub(/ *\t */im, "\t")
  end

  cleaned.join("")
end