Top Level Namespace

Instance Method Summary collapse

Instance Method Details

#html_to_text(node) ⇒ Object

require 'open-uri'



6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# File 'lib/html_text_gem.rb', line 6

# Converts an HTML node into plain text.
#
# Operates on a copy obtained via +node.dup+: +br+ elements are replaced by a
# newline, +hr+ elements by a 70-dash rule on its own line, and a blank line
# is inserted after every +div+. The text content of the modified copy is
# returned.
#
# @param node [#dup] presumably a Nokogiri node or document; the copy must
#   respond to +xpath+, +css+ and +text+
# @return [String] the text content of the modified copy
def html_to_text(node)
  # Block-level elements that get a blank line appended after them.
  blocks = %w[div]
  # Elements that are replaced outright by a textual separator.
  separator = { 'br' => "\n", 'hr' => "\n#{'-' * 70}\n" }
  copy = node.dup

  # Replaces any ">", whitespace run, "<" sequence found inside a text node
  # with a single space.
  # NOTE(review): text nodes rarely contain literal angle brackets, so this is
  # close to a no-op; /\s+/ (collapse all whitespace runs) may have been
  # intended -- confirm before changing.
  copy.xpath('.//text()').each { |t| t.content = t.text.gsub(/>\s+</, ' ') }

  # Swap each separator element for its textual replacement.
  copy.css(separator.keys.join(',')).each { |n| n.replace(separator[n.name]) }

  # Add a blank line after each block-level element.
  copy.css(blocks.join(',')).each { |n| n.after("\n\n") }

  copy.text
end