Module: Markitdown

Defined in:
lib/markitdown.rb,
lib/markitdown/version.rb

Constant Summary collapse

VERSION =
"0.2.1"

Class Method Summary collapse

Class Method Details

.from_html(html) ⇒ Object



7
8
9
# File 'lib/markitdown.rb', line 7

# Converts a markup string to markdown.
#
# The string is parsed with Nokogiri's XML parser and the document's root
# node is handed to from_nokogiri, whose result is returned unchanged.
#
# @param html [String] markup to convert
# @return [Object] whatever from_nokogiri returns for the root node
def self.from_html(html)
  document = Nokogiri::XML(html)
  from_nokogiri(document.root)
end

.from_nokogiri(node) ⇒ Object



11
12
13
14
15
16
17
# File 'lib/markitdown.rb', line 11

# Renders a node as markdown text.
#
# The fragments returned by parse_node are flattened, stripped of nils and
# joined into one string, which is then tidied by a fixed sequence of
# substitutions. Each step operates on the output of the previous one, so
# the order below is significant.
#
# @param node [Object] node passed straight through to parse_node
# @return [String] the cleaned-up text
def self.from_nokogiri(node)
  joined = self.parse_node(node).flatten.compact.join

  # Turn lines that contain nothing but whitespace into a plain blank line.
  text = joined.gsub(/\n\s+\n/,"\n\n")

  # Collapse any run of two or more newlines down to exactly two.
  text = text.gsub(/\n{2,}/,"\n\n")

  # Replace two or more consecutive " > \n" sequences with "\n > \n > ".
  text = text.gsub(/( > \n){2,}/,"\n > \n > ")

  # Collapse consecutive tabs into a single space. Per the original author's
  # note, tabs are used as padding around divs and spans, so nested ones end
  # up surrounded by just one space.
  text = text.gsub(/\t+/," ")

  # Drop a space that sits directly before a period or question mark, so
  # punctuation following a space-padded element (e.g. a link) stays attached.
  text = text.gsub(/ ([\.\?])/,'\1')

  # Replace each END_TAG(x) marker (x being 1 to 3 characters), together with
  # any whitespace in front of it, by x itself.
  text.gsub(/\s*END_TAG\((.{1,3})\)/, "\\1")
end