Class: Html2Text
- Inherits:
-
Object
- Object
- Html2Text
- Defined in:
- lib/html2text.rb,
lib/html2text/version.rb
Constant Summary collapse
- VERSION =
"0.1.2"
Instance Attribute Summary collapse
-
#doc ⇒ Object
readonly
Returns the value of attribute doc.
Class Method Summary collapse
Instance Method Summary collapse
- #convert ⇒ Object
-
#initialize(doc) ⇒ Html2Text
constructor
A new instance of Html2Text.
- #iterate_over(node) ⇒ Object
- #next_node_name(node) ⇒ Object
- #prefix_whitespace(node) ⇒ Object
- #remove_leading_and_trailing_whitespace(text) ⇒ Object
- #suffix_whitespace(node) ⇒ Object
- #trimmed_whitespace(text) ⇒ Object
-
#wrap_link(node, output) ⇒ Object
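Links are returned in [text](href) format (or [text] for named anchors without an href); a link whose text already matches its URL is left as-is.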
Constructor Details
#initialize(doc) ⇒ Html2Text
Returns a new instance of Html2Text.
6 7 8 |
# File 'lib/html2text.rb', line 6
#
# Builds a converter around an already-parsed document.
#
# @param doc [Object] the document to convert; Html2Text.convert passes the
#   result of Nokogiri::HTML here
def initialize(doc)
  @doc = doc
end
Instance Attribute Details
#doc ⇒ Object (readonly)
Returns the value of attribute doc.
4 5 6 |
# File 'lib/html2text.rb', line 4
#
# Read-only accessor for the document given to the constructor.
#
# @return [Object] the value stored in @doc
def doc
  @doc
end
Class Method Details
.convert(html) ⇒ Object
10 11 12 13 14 15 |
# File 'lib/html2text.rb', line 10
#
# Converts an HTML string to plain text.
#
# The markup is first passed through replace_entities and fix_newlines,
# then parsed with Nokogiri::HTML and handed to a new Html2Text instance.
#
# @param html [String] the HTML source
# @return [Object] whatever the instance-level #convert returns
def self.convert(html)
  normalized = fix_newlines(replace_entities(html))
  parsed = Nokogiri::HTML(normalized)
  Html2Text.new(parsed).convert
end
.fix_newlines(text) ⇒ Object
17 18 19 |
# File 'lib/html2text.rb', line 17
#
# Normalizes line endings to "\n".
#
# @param text [String] text that may contain "\r\n" or bare "\r"
# @return [String] a copy of text using only "\n" line breaks
def self.fix_newlines(text)
  # CRLF pairs go first so that each pair yields one newline, not two.
  without_crlf = text.gsub("\r\n", "\n")
  without_crlf.gsub("\r", "\n")
end
.replace_entities(text) ⇒ Object
21 22 23 |
# File 'lib/html2text.rb', line 21
#
# Replaces non-breaking spaces with ordinary spaces before parsing.
#
# Both the "&nbsp;" entity and a literal U+00A0 character are mapped to " ".
# The entity is spelled out with an escaped literal so it cannot be decoded
# into a whitespace character when the source is rendered, which would turn
# the substitution into a no-op.
#
# @param text [String] raw HTML
# @return [String] a copy of text with non-breaking spaces replaced
def self.replace_entities(text)
  text.gsub("&nbsp;", " ").gsub("\u00a0", " ")
end
Instance Method Details
#convert ⇒ Object
25 26 27 28 29 |
# File 'lib/html2text.rb', line 25
#
# Renders the wrapped document as text.
#
# Walks the document with iterate_over, removes spaces and tabs around
# line breaks, and strips the result.
#
# @return [String] the converted text
def convert
  raw = iterate_over(doc)
  remove_leading_and_trailing_whitespace(raw).strip
end
#iterate_over(node) ⇒ Object
52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
# File 'lib/html2text.rb', line 52
#
# Recursively converts a node and its descendants to text.
#
# Text nodes yield their whitespace-collapsed text. style, head, title,
# meta and script elements yield an empty string. Any other node yields its
# prefix whitespace, the converted children, and its suffix whitespace
# joined together; "a" elements are additionally passed through wrap_link.
#
# @param node [Object] a node responding to text?, text, name and children
#   (presumably a Nokogiri node -- confirm against callers)
# @return [String] the text for this subtree
def iterate_over(node)
  return trimmed_whitespace(node.text) if node.text?

  tag = node.name.downcase
  return "" if %w[style head title meta script].include?(tag)

  pieces = [prefix_whitespace(node)]
  node.children.each { |child| pieces << iterate_over(child) }
  pieces << suffix_whitespace(node)

  # prefix/suffix helpers return nil for tags they do not handle.
  text = pieces.compact.join("")
  tag == "a" ? wrap_link(node, text) : text
end
#next_node_name(node) ⇒ Object
40 41 42 43 44 45 46 47 48 49 50 |
# File 'lib/html2text.rb', line 40
#
# Finds the lower-cased name of the next sibling that is an element.
#
# Siblings for which element? is false are skipped.
#
# @param node [Object] a node responding to next_sibling
# @return [String, nil] the following element's name in lower case, or nil
#   when no following sibling is an element
def next_node_name(node)
  candidate = node.next_sibling
  candidate = candidate.next_sibling until candidate.nil? || candidate.element?
  candidate.name.downcase if candidate
end
#prefix_whitespace(node) ⇒ Object
76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
# File 'lib/html2text.rb', line 76
#
# Returns the text emitted before an element's content.
#
# @param node [Object] a node responding to name
# @return [String, nil] a rule for hr, a newline for headings, lists, table
#   rows, paragraphs and divs, a tab for table cells, a bullet for list
#   items, and nil for every other tag
def prefix_whitespace(node)
  case node.name.downcase
  when "hr" then "------\n"
  when "li" then "- "
  when "td", "th" then "\t"
  when "h1", "h2", "h3", "h4", "h5", "h6", "ol", "ul", "tr", "p", "div" then "\n"
  end
end
#remove_leading_and_trailing_whitespace(text) ⇒ Object
31 32 33 |
# File 'lib/html2text.rb', line 31
#
# Removes spaces and tabs that directly surround each line break.
#
# @param text [String] multi-line text
# @return [String] a copy of text in which every newline, together with the
#   spaces and tabs touching it, is replaced by a bare "\n"
def remove_leading_and_trailing_whitespace(text)
  padded_newline = /[ \t]*\n[ \t]*/im
  text.gsub(padded_newline, "\n")
end
#suffix_whitespace(node) ⇒ Object
95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 |
# File 'lib/html2text.rb', line 95
#
# Returns the text emitted after an element's content.
#
# @param node [Object] a node responding to name (and, for p, br and div,
#   whatever next_node_name needs)
# @return [String, nil] "\n" for headings and list items; "\n" for p and br
#   unless the next element is a div; "\n" for div only when a next element
#   exists and is not a div; nil otherwise
def suffix_whitespace(node)
  tag = node.name.downcase
  return "\n" if %w[h1 h2 h3 h4 h5 h6 li].include?(tag)

  case tag
  when "p", "br"
    "\n" unless next_node_name(node) == "div"
  when "div"
    # A div adds a line only when another, non-div element follows it.
    following = next_node_name(node)
    "\n" unless following.nil? || following == "div"
  end
end
#trimmed_whitespace(text) ⇒ Object
35 36 37 38 |
# File 'lib/html2text.rb', line 35
#
# Collapses each run of tabs, newlines, form feeds, carriage returns and
# spaces into a single space.
#
# @param text [String] raw text node content
# @return [String] a copy of text with whitespace runs collapsed
def trimmed_whitespace(text)
  whitespace_run = /[\t\n\f\r ]+/im
  text.gsub(whitespace_run, " ")
end
#wrap_link(node, output) ⇒ Object
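Links are returned in [text](href) format (or [text] for named anchors without an href); a link whose text already matches its URL is left as-is.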
114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
# File 'lib/html2text.rb', line 114
#
# Formats the text of an anchor element.
#
# With an href, the result is "[text](href)" unless the href is just the
# text itself, optionally prefixed by mailto:, http:// or https://. Without
# an href but with a name attribute, the result is "[text]". A newline is
# appended when the next element is a heading (h1-h6).
#
# @param node [Object] the anchor node; must respond to attribute
# @param output [String] the already-converted link text
# @return [String] the formatted link text
def wrap_link(node, output)
  href = node.attribute("href")

  if href.nil?
    output = "[#{output}]" unless node.attribute("name").nil?
  else
    target = href.to_s
    # Forms where the link text already shows the destination.
    bare_forms = [output, "mailto:#{output}", "http://#{output}", "https://#{output}"]
    output = "[#{output}](#{target})" unless bare_forms.include?(target)
  end

  heading_follows = %w[h1 h2 h3 h4 h5 h6].include?(next_node_name(node))
  heading_follows ? output + "\n" : output
end