Class: Html2Text

Inherits:

Object

Object
Html2Text

show all

Defined in: lib/html2text.rb,
lib/html2text/version.rb

Constant Summary collapse

VERSION =

"0.2.1"

Instance Attribute Summary collapse

#doc ⇒ Object readonly

Returns the value of attribute doc.

Class Method Summary collapse

Instance Method Summary collapse

#convert ⇒ Object
#image_text(node) ⇒ Object
#initialize(doc) ⇒ Html2Text constructor

A new instance of Html2Text.
#iterate_over(node) ⇒ Object
#next_node_name(node) ⇒ Object
#prefix_whitespace(node) ⇒ Object
#remove_leading_and_trailing_whitespace(text) ⇒ Object
#remove_unnecessary_empty_lines(text) ⇒ Object
#suffix_whitespace(node) ⇒ Object
#trimmed_whitespace(text) ⇒ Object
#wrap_link(node, output) ⇒ Object

links are returned in [text](link) format.

Constructor Details

#initialize(doc) ⇒ `Html2Text`

Returns a new instance of Html2Text.



6
7
8

# File 'lib/html2text.rb', line 6

# Builds a converter around an already-parsed document.
#
# @param doc [Object] the document to convert; the class-level +convert+
#   passes the result of +Nokogiri::HTML+ here
def initialize(doc)
  @doc = doc
end

Instance Attribute Details

#doc ⇒ `Object` (readonly)

Returns the value of attribute doc.



4
5
6

# File 'lib/html2text.rb', line 4

# Reader for the document handed to the constructor.
#
# @return [Object] the value stored in +@doc+
def doc
  @doc
end

Class Method Details

.convert(html) ⇒ `Object`

# File 'lib/html2text.rb', line 10

# Converts HTML source into plain text.
#
# The input is coerced with +to_s+, non-breaking spaces and line endings are
# normalised, the result is parsed with Nokogiri, and the parsed document is
# rendered by a fresh Html2Text instance.
#
# @param html [#to_s] the HTML source
# @return [String] the rendered text
def self.convert(html)
  normalised = fix_newlines(replace_entities(html.to_s))
  parsed = Nokogiri::HTML(normalised)

  Html2Text.new(parsed).convert
end

.fix_newlines(text) ⇒ `Object`



18
19
20

# File 'lib/html2text.rb', line 18

# Normalises line endings to "\n".
#
# CRLF pairs are handled first so that each pair becomes a single newline
# rather than two; any carriage return left over is then converted as well.
#
# @param text [String] text with arbitrary line endings
# @return [String] a new string using only "\n" line endings
def self.fix_newlines(text)
  without_crlf = text.gsub("\r\n", "\n")
  without_crlf.gsub("\r", "\n")
end

.replace_entities(text) ⇒ `Object`



22
23
24

# File 'lib/html2text.rb', line 22

# Replaces non-breaking spaces with ordinary spaces.
#
# Both the literal "&nbsp;" entity and the U+00A0 character are replaced,
# in that order.
#
# @param text [String] raw HTML source
# @return [String] a new string with every non-breaking space turned into " "
def self.replace_entities(text)
  ["&nbsp;", "\u00a0"].reduce(text) { |acc, nbsp| acc.gsub(nbsp, " ") }
end

Instance Method Details

#convert ⇒ `Object`

# File 'lib/html2text.rb', line 26

# Renders the wrapped document as plain text.
#
# The document tree is walked to build the raw text, spaces and tabs around
# line breaks (and spaces around tabs) are removed, runs of blank lines are
# collapsed, and the result is stripped.
#
# @return [String] the rendered text
def convert
  raw = iterate_over(doc)
  tidied = remove_leading_and_trailing_whitespace(raw)
  remove_unnecessary_empty_lines(tidied).strip
end

#image_text(node) ⇒ `Object`

# File 'lib/html2text.rb', line 169

# Builds the textual stand-in for an image node.
#
# The "title" attribute wins over "alt"; whichever is present is wrapped in
# square brackets. Presence is decided by the attribute lookup being truthy,
# so an attribute that exists but is empty still yields "[]".
#
# @param node [#attribute] the image node
# @return [String] "[title]", "[alt]" or "" when neither attribute is present
def image_text(node)
  label = node.attribute("title") || node.attribute("alt")
  label ? "[#{label}]" : ""
end

#iterate_over(node) ⇒ `Object`

# File 'lib/html2text.rb', line 58

# Recursively renders a node and its descendants as text.
#
# Text nodes are returned with their whitespace runs collapsed. The style,
# head, title, meta and script elements contribute nothing. For any other
# node the result is the node's prefix whitespace, the rendering of each
# child, and the node's suffix whitespace, concatenated. Anchors are then
# passed through wrap_link, and images are replaced by image_text.
#
# @param node [Object] a node responding to text?, text, name and children
# @return [String] the rendered text for this subtree
def iterate_over(node)
  return trimmed_whitespace(node.text) if node.text?

  tag = node.name.downcase
  return "" if ["style", "head", "title", "meta", "script"].include?(tag)

  pieces = [prefix_whitespace(node)]
  node.children.each { |child| pieces << iterate_over(child) }
  pieces << suffix_whitespace(node)

  # The whitespace helpers return nil for tags they do not handle.
  rendered = pieces.compact.join("")

  case tag
  when "a" then wrap_link(node, rendered)
  when "img" then image_text(node)
  else rendered
  end
end

#next_node_name(node) ⇒ `Object`

# File 'lib/html2text.rb', line 46

# Finds the lower-cased name of the next sibling that is an element.
#
# Non-element siblings are skipped.
#
# @param node [#next_sibling] the node whose following siblings are examined
# @return [String, nil] the lower-cased element name, or nil when no element
#   sibling follows
def next_node_name(node)
  candidate = node.next_sibling
  candidate = candidate.next_sibling until candidate.nil? || candidate.element?

  candidate.name.downcase if candidate
end

#prefix_whitespace(node) ⇒ `Object`

# File 'lib/html2text.rb', line 85

# Text emitted before a node's content, chosen by tag name
# (case-insensitive).
#
# @param node [#name] the node being rendered
# @return [String, nil] a rule line for hr, a newline for headings, lists,
#   table rows, paragraphs and divs, a tab for table cells, "- " for list
#   items, and nil for every other tag
def prefix_whitespace(node)
  tag = node.name.downcase

  if tag == "hr"
    "---------------------------------------------------------------\n"
  elsif ["h1", "h2", "h3", "h4", "h5", "h6", "ol", "ul", "tr", "p", "div"].include?(tag)
    "\n"
  elsif ["td", "th"].include?(tag)
    "\t"
  elsif tag == "li"
    "- "
  end
end

#remove_leading_and_trailing_whitespace(text) ⇒ `Object`



33
34
35

# File 'lib/html2text.rb', line 33

# Removes horizontal whitespace that touches a line break or a tab.
#
# Spaces and tabs on either side of a newline are dropped, then spaces on
# either side of a tab are dropped. Whitespace at the very start or end of
# the text that is not next to a newline or tab is left alone.
#
# @param text [String] rendered text
# @return [String] a new string with the surrounding whitespace removed
def remove_leading_and_trailing_whitespace(text)
  around_newline = /[ \t]*\n[ \t]*/im
  around_tab = / *\t */im

  text.gsub(around_newline, "\n").gsub(around_tab, "\t")
end

#remove_unnecessary_empty_lines(text) ⇒ `Object`



37
38
39

# File 'lib/html2text.rb', line 37

# Collapses every run of two or more newlines into exactly two, so that at
# most one blank line separates pieces of text.
#
# @param text [String] rendered text
# @return [String] a new string with the blank-line runs collapsed
def remove_unnecessary_empty_lines(text)
  newline_run = /\n\n\n*/im
  text.gsub(newline_run, "\n\n")
end

#suffix_whitespace(node) ⇒ `Object`

# File 'lib/html2text.rb', line 104

# Text emitted after a node's content, chosen by tag name
# (case-insensitive).
#
# Headings and list items always get a newline. Paragraphs and line breaks
# get one unless the next element sibling is a div. A div gets one only when
# an element sibling follows and that sibling is not a div.
#
# @param node [#name] the node being rendered
# @return [String, nil] "\n" or nil
def suffix_whitespace(node)
  tag = node.name.downcase

  if ["h1", "h2", "h3", "h4", "h5", "h6", "li"].include?(tag)
    "\n"
  elsif ["p", "br"].include?(tag)
    "\n" unless next_node_name(node) == "div"
  elsif tag == "div"
    following = next_node_name(node)
    "\n" unless following.nil? || following == "div"
  end
end

#trimmed_whitespace(text) ⇒ `Object`

# File 'lib/html2text.rb', line 41

# Collapses each run of tabs, newlines, form feeds, carriage returns and
# spaces into a single space. Other characters, including vertical tab, are
# left untouched.
#
# @param text [String] the content of a text node
# @return [String] a new string with whitespace runs collapsed
def trimmed_whitespace(text)
  whitespace_run = /[\t\n\f\r ]+/im
  text.gsub(whitespace_run, " ")
end

#wrap_link(node, output) ⇒ `Object`

links are returned in [text](link) format

# File 'lib/html2text.rb', line 123

# Formats the rendered content of an anchor node.
#
# Links are returned in [text](link) format. The link text is stripped
# first. Text that is wrapped in square brackets (as produced for a linked
# image) has the brackets removed, or is replaced by the anchor's own title
# when it has one. Empty text also falls back to the anchor's title.
#
# Without an href, the text is bracketed only when the anchor has a name
# attribute. With an href, the text is returned unchanged when the href is
# the text itself or the text prefixed by "mailto:", "http://" or
# "https://"; otherwise empty text yields the bare href and any other text
# yields "[text](href)". A trailing newline is appended when the next
# element sibling is a heading.
#
# @param node [#attribute] the anchor node
# @param output [String] the already-rendered content of the anchor
# @return [String] the formatted link text
def wrap_link(node, output)
  target = node.attribute("href")
  anchor_name = node.attribute("name")
  title = node.attribute("title")
  label = output.strip

  # A bracketed label comes from a linked image: avoid double [[ ]] and let
  # the anchor's title override the image's.
  if label.start_with?("[") && label.end_with?("]")
    label = title ? title.to_s : label[1..-2]
  end

  # No link text at all, but a title to fall back on.
  label = title.to_s if label.empty? && title

  if target.nil?
    label = "[#{label}]" unless anchor_name.nil?
  else
    url = target.to_s
    redundant = [label, "mailto:#{label}", "http://#{label}", "https://#{label}"]

    unless redundant.include?(url)
      label = label.empty? ? url : "[#{label}](#{url})"
    end
  end

  label += "\n" if ["h1", "h2", "h3", "h4", "h5", "h6"].include?(next_node_name(node))
  label
end

Class: Html2Text

Constant Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(doc) ⇒ Html2Text

Instance Attribute Details

#doc ⇒ Object (readonly)

Class Method Details

.convert(html) ⇒ Object

.fix_newlines(text) ⇒ Object

.replace_entities(text) ⇒ Object

Instance Method Details

#convert ⇒ Object

#image_text(node) ⇒ Object

#iterate_over(node) ⇒ Object

#next_node_name(node) ⇒ Object

#prefix_whitespace(node) ⇒ Object

#remove_leading_and_trailing_whitespace(text) ⇒ Object

#remove_unnecessary_empty_lines(text) ⇒ Object

#suffix_whitespace(node) ⇒ Object

#trimmed_whitespace(text) ⇒ Object

#wrap_link(node, output) ⇒ Object

#initialize(doc) ⇒ `Html2Text`

#doc ⇒ `Object` (readonly)

.convert(html) ⇒ `Object`

.fix_newlines(text) ⇒ `Object`

.replace_entities(text) ⇒ `Object`

#convert ⇒ `Object`

#image_text(node) ⇒ `Object`

#iterate_over(node) ⇒ `Object`

#next_node_name(node) ⇒ `Object`

#prefix_whitespace(node) ⇒ `Object`

#remove_leading_and_trailing_whitespace(text) ⇒ `Object`

#remove_unnecessary_empty_lines(text) ⇒ `Object`

#suffix_whitespace(node) ⇒ `Object`

#trimmed_whitespace(text) ⇒ `Object`

#wrap_link(node, output) ⇒ `Object`