Class: Html2Text

Inherits:
Object
  • Object
show all
Defined in:
lib/html2text.rb,
lib/html2text/version.rb

Constant Summary collapse

VERSION =
"0.1.2"

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(doc) ⇒ Html2Text

Returns a new instance of Html2Text.



6
7
8
# File 'lib/html2text.rb', line 6

# Builds a converter around a document object.
#
# @param doc the document to convert; stored as-is in @doc and later walked
#   by #convert (Html2Text.convert passes a Nokogiri::HTML result here)
def initialize(doc)
  @doc = doc
end

Instance Attribute Details

#doc ⇒ Object (readonly)

Returns the value of attribute doc.



4
5
6
# File 'lib/html2text.rb', line 4

# Reader for the document this converter was constructed with.
#
# @return the value held in @doc
def doc
  @doc
end

Class Method Details

.convert(html) ⇒ Object



10
11
12
13
14
15
# File 'lib/html2text.rb', line 10

# Converts an HTML string to plain text.
#
# The raw string is first passed through replace_entities and fix_newlines,
# then parsed with Nokogiri::HTML; the parsed document is handed to a new
# Html2Text instance whose #convert result is returned.
#
# @param html [String] HTML markup
# @return the result of Html2Text#convert for the parsed document
def self.convert(html)
  cleaned = replace_entities(html)
  cleaned = fix_newlines(cleaned)

  Html2Text.new(Nokogiri::HTML(cleaned)).convert
end

.fix_newlines(text) ⇒ Object



17
18
19
# File 'lib/html2text.rb', line 17

# Normalises line endings to "\n".
#
# CRLF pairs are collapsed first so that they yield a single newline; any
# carriage return still left over is then turned into a newline as well.
#
# @param text [String]
# @return [String] a new string containing no "\r"
def self.fix_newlines(text)
  without_crlf = text.gsub("\r\n", "\n")
  without_crlf.tr("\r", "\n")
end

.replace_entities(text) ⇒ Object



21
22
23
# File 'lib/html2text.rb', line 21

# Replaces non-breaking spaces with ordinary spaces before the HTML is parsed.
#
# Both spellings are handled: the literal "&nbsp;" entity and the U+00A0
# character itself. The character is written as an escape so the pattern
# stays visible in editors and diffs instead of looking like a plain space.
# Normalising here matters because the whitespace-collapsing pattern in
# trimmed_whitespace only matches tab, newline, form feed, carriage return
# and the regular space.
#
# @param text [String] raw HTML
# @return [String] a new string with every non-breaking space replaced by " "
def self.replace_entities(text)
  text.gsub("&nbsp;", " ").gsub("\u00A0", " ")
end

Instance Method Details

#convert ⇒ Object



25
26
27
28
29
# File 'lib/html2text.rb', line 25

# Renders the wrapped document as plain text.
#
# The tree is flattened with iterate_over, spaces and tabs around every
# newline are removed, and finally the whole result is stripped.
#
# @return [String]
def convert
  flattened = iterate_over(doc)
  remove_leading_and_trailing_whitespace(flattened).strip
end

#iterate_over(node) ⇒ Object



52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# File 'lib/html2text.rb', line 52

# Recursively renders a node and its descendants as text.
#
# * Text nodes yield their text with whitespace runs collapsed.
# * style, head, title, meta and script elements yield an empty string.
# * Any other node yields its prefix whitespace, the rendering of each child
#   in order, and its suffix whitespace, joined together; for an "a" element
#   the joined text is additionally passed through wrap_link.
#
# @param node a node responding to text?, text, name and children
# @return [String]
def iterate_over(node)
  return trimmed_whitespace(node.text) if node.text?

  tag = node.name.downcase
  return "" if %w[style head title meta script].include?(tag)

  pieces = [prefix_whitespace(node)]
  node.children.each { |child| pieces << iterate_over(child) }
  pieces << suffix_whitespace(node)

  # prefix/suffix helpers return nil for tags they do not handle
  text = pieces.compact.join("")

  tag == "a" ? wrap_link(node, text) : text
end

#next_node_name(node) ⇒ Object



40
41
42
43
44
45
46
47
48
49
50
# File 'lib/html2text.rb', line 40

# Finds the lower-cased tag name of the next sibling that is an element.
#
# Siblings that are not elements are skipped over.
#
# @param node a node responding to next_sibling
# @return [String, nil] the name, or nil when no element sibling follows
def next_node_name(node)
  sibling = node.next_sibling
  sibling = sibling.next_sibling until sibling.nil? || sibling.element?

  sibling && sibling.name.downcase
end

#prefix_whitespace(node) ⇒ Object



76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# File 'lib/html2text.rb', line 76

# Text emitted before an element's content, chosen by tag name
# (compared case-insensitively).
#
# * hr                                  -> a dashed rule and a newline
# * h1-h6, ol, ul, tr, p, div           -> a newline
# * td, th                              -> a tab
# * li                                  -> a "- " bullet
#
# @param node a node responding to name
# @return [String, nil] nil for any other tag
def prefix_whitespace(node)
  tag = node.name.downcase

  if tag == "hr"
    "------\n"
  elsif %w[h1 h2 h3 h4 h5 h6 ol ul tr p div].include?(tag)
    "\n"
  elsif %w[td th].include?(tag)
    "\t"
  elsif tag == "li"
    "- "
  end
end

#remove_leading_and_trailing_whitespace(text) ⇒ Object



31
32
33
# File 'lib/html2text.rb', line 31

# Strips spaces and tabs that sit directly before or after a newline, so
# every line of the result starts and ends without that padding. Whitespace
# at the very start or end of the string that is not adjacent to a newline
# is left alone.
#
# @param text [String]
# @return [String] a new string
def remove_leading_and_trailing_whitespace(text)
  padded_newline = /[ \t]*\n[ \t]*/im
  text.gsub(padded_newline) { "\n" }
end

#suffix_whitespace(node) ⇒ Object



95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# File 'lib/html2text.rb', line 95

# Text emitted after an element's content, chosen by tag name
# (compared case-insensitively).
#
# * h1-h6 and li always get a newline.
# * p and br get a newline unless the next element sibling is a div.
# * div gets a newline only when an element sibling follows and that
#   sibling is not a div.
#
# @param node a node responding to name (and usable with next_node_name)
# @return [String, nil] "\n" or nil
def suffix_whitespace(node)
  tag = node.name.downcase
  return "\n" if %w[h1 h2 h3 h4 h5 h6 li].include?(tag)

  case tag
  when "p", "br"
    "\n" unless next_node_name(node) == "div"
  when "div"
    following = next_node_name(node)
    "\n" unless following.nil? || following == "div"
  end
end

#trimmed_whitespace(text) ⇒ Object



35
36
37
38
# File 'lib/html2text.rb', line 35

# Collapses every run of tab, newline, form feed, carriage return or space
# characters into a single space.
#
# @param text [String]
# @return [String] a new string
def trimmed_whitespace(text)
  whitespace_run = /[\t\n\f\r ]+/im
  text.gsub(whitespace_run) { " " }
end

#wrap_link(node, output) ⇒ Object

Links are returned in [text](link) format.



114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# File 'lib/html2text.rb', line 114

# Formats the rendered text of an anchor element.
#
# * With an href: returns "[text](href)", unless the href merely repeats the
#   text (exactly, or with a "mailto:", "http://" or "https://" prefix), in
#   which case the text is left as it is.
# * Without an href but with a name attribute: returns "[text]".
# * With neither: the text is left as it is.
#
# A newline is appended when the next element sibling is a heading (h1-h6).
#
# @param node the anchor node; must respond to attribute
# @param output [String] the already-rendered content of the anchor
# @return [String]
def wrap_link(node, output)
  href_attr = node.attribute("href")
  name_attr = node.attribute("name")

  if !href_attr.nil?
    target = href_attr.to_s
    redundant = [
      output,
      "mailto:#{output}",
      "http://#{output}",
      "https://#{output}"
    ].include?(target)

    output = "[#{output}](#{target})" unless redundant
  elsif !name_attr.nil?
    output = "[#{output}]"
  end

  heading_follows = %w[h1 h2 h3 h4 h5 h6].include?(next_node_name(node))
  heading_follows ? output + "\n" : output
end