Class: Html2Text

Inherits:

Object

Object
Html2Text

Defined in: lib/html2text.rb,
lib/html2text/version.rb

Constant Summary collapse

DO_NOT_TOUCH_WHITESPACE =

"<do-not-touch-whitespace>"

VERSION =

"0.3.0"

Instance Attribute Summary collapse

#doc ⇒ Object readonly

Returns the value of attribute doc.

Class Method Summary collapse

Instance Method Summary collapse

#convert ⇒ Object
#initialize(doc) ⇒ Html2Text constructor

A new instance of Html2Text.
#remove_leading_and_trailing_whitespace(text) ⇒ Object

Constructor Details

#initialize(doc) ⇒ `Html2Text`

Returns a new instance of Html2Text.



6
7
8

# File 'lib/html2text.rb', line 6

# Builds a converter around an already-parsed document.
#
# @param doc [Object] the document to convert; stored as-is in @doc.
#   Html2Text.convert passes the result of Nokogiri::HTML here.
def initialize(doc)
  @doc = doc
end

Instance Attribute Details

#doc ⇒ `Object` (readonly)

Returns the value of attribute doc.



4
5
6

# File 'lib/html2text.rb', line 4

# Read-only accessor for the document handed to the constructor.
#
# @return [Object] the value stored in @doc
def doc
  @doc
end

Class Method Details

.convert(html) ⇒ `Object`

# File 'lib/html2text.rb', line 10

# Converts an HTML string to plain text.
#
# The input is coerced with #to_s, pre-processed (Office markup fix-ups,
# optional <div> wrapping, entity and newline normalisation), parsed with
# Nokogiri::HTML and then handed to a new Html2Text instance.
#
# @param html [#to_s] the HTML markup to convert
# @return [Object] whatever Html2Text#convert returns for the parsed document
def self.convert(html)
  markup = html.to_s

  if is_office_document?(markup)
    # Emulate the CSS rendering of Office documents.
    # Applied in this order, each on the result of the previous one.
    office_fixups = [
      ["<p class=MsoNormal>", "<br>"],
      ["<o:p>&nbsp;</o:p>", "<br>"],
      ["<o:p></o:p>", ""]
    ]
    markup = office_fixups.reduce(markup) { |acc, (from, to)| acc.gsub(from, to) }
  end

  # Stop Nokogiri from inserting in <p> tags when given a bare fragment
  markup = "<div>#{markup}</div>" unless markup.include?("<html")

  normalized = fix_newlines(replace_entities(markup))
  parsed = Nokogiri::HTML(normalized)

  Html2Text.new(parsed).convert
end

.fix_newlines(text) ⇒ `Object`



31
32
33

# File 'lib/html2text.rb', line 31

# Normalises line endings to "\n".
#
# "\r\n" pairs are collapsed first so that the subsequent lone-"\r"
# replacement does not turn one line break into two.
#
# @param text [String] text that may contain "\r\n" or "\r" line endings
# @return [String] a new string using only "\n" as line terminator
def self.fix_newlines(text)
  line_endings = ["\r\n", "\r"]
  line_endings.reduce(text) { |acc, ending| acc.gsub(ending, "\n") }
end

.replace_entities(text) ⇒ `Object`



35
36
37

# File 'lib/html2text.rb', line 35

# Replaces a few entities/characters before the markup is parsed.
#
# "&nbsp;" and the U+00A0 character each become a plain space, and
# "&zwnj;" is removed. Substitutions run in the listed order.
#
# @param text [String] raw markup
# @return [String] a new string with the substitutions applied
def self.replace_entities(text)
  substitutions = [
    ["&nbsp;", " "],
    ["\u00a0", " "],
    ["&zwnj;", ""]
  ]
  substitutions.reduce(text) { |acc, (needle, replacement)| acc.gsub(needle, replacement) }
end

Instance Method Details

#convert ⇒ `Object`

# File 'lib/html2text.rb', line 39

# Produces the text output for the wrapped document.
#
# Runs iterate_over on the document, then passes the result through
# remove_leading_and_trailing_whitespace and
# remove_unnecessary_empty_lines, and finally strips the outer whitespace.
#
# @return [Object] the result of calling #strip on the cleaned-up output
def convert
  raw = iterate_over(doc)
  trimmed = remove_leading_and_trailing_whitespace(raw)
  remove_unnecessary_empty_lines(trimmed).strip
end

#remove_leading_and_trailing_whitespace(text) ⇒ `Object`

# File 'lib/html2text.rb', line 48

# Trims spaces and tabs around newlines, and spaces around tabs, while
# leaving the segments delimited by DO_NOT_TOUCH_WHITESPACE untouched.
#
# The text is split on the marker; segments at even positions are cleaned
# and segments at odd positions (the ones between a pair of markers, i.e.
# <pre> content) are passed through unchanged. The markers themselves are
# dropped from the result.
#
# @param text [String] intermediate output containing optional markers
# @return [String] the re-joined text
def remove_leading_and_trailing_whitespace(text)
  segments = text.split(DO_NOT_TOUCH_WHITESPACE)

  cleaned = segments.each_with_index.map do |segment, position|
    # odd positions sit between two markers: keep them verbatim
    next segment if position.odd?

    segment.gsub(/[ \t]*\n[ \t]*/im, "\n").gsub(/ *\t */im, "\t")
  end

  cleaned.join("")
end

Class: Html2Text

Constant Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(doc) ⇒ Html2Text

Instance Attribute Details

#doc ⇒ Object (readonly)

Class Method Details

.convert(html) ⇒ Object

.fix_newlines(text) ⇒ Object

.replace_entities(text) ⇒ Object

Instance Method Details

#convert ⇒ Object

#remove_leading_and_trailing_whitespace(text) ⇒ Object

#initialize(doc) ⇒ `Html2Text`

#doc ⇒ `Object` (readonly)

.convert(html) ⇒ `Object`

.fix_newlines(text) ⇒ `Object`

.replace_entities(text) ⇒ `Object`

#convert ⇒ `Object`

#remove_leading_and_trailing_whitespace(text) ⇒ `Object`