Module: Markitdown
- Defined in:
- lib/markitdown.rb,
lib/markitdown/version.rb
Constant Summary collapse
- VERSION =
"0.1.2"
Class Method Summary collapse
Class Method Details
.from_html(html) ⇒ Object
7 8 9 |
# File 'lib/markitdown.rb', line 7
#
# Parses +html+ with Nokogiri's XML parser and hands the resulting
# document's root node to from_nokogiri.
#
# @param html markup to parse; passed unchanged to Nokogiri::XML
# @return [Object] whatever from_nokogiri returns for the root node
def self.from_html(html)
  document = Nokogiri::XML(html)
  from_nokogiri(document.root)
end
.from_nokogiri(node) ⇒ Object
11 12 13 14 15 16 17 |
# File 'lib/markitdown.rb', line 11
#
# Turns +node+ into a single string and tidies its whitespace.
#
# The result of parse_node is flattened, stripped of nils and joined, then
# passed through a fixed sequence of substitutions. The order of the
# substitutions matters, since each one operates on the previous one's output.
#
# @param node the node handed to parse_node (from_html passes a Nokogiri
#   document root)
# @return [String] the joined and cleaned-up text
def self.from_nokogiri(node)
  text = self.parse_node(node).flatten.compact.join

  # Remove lines that contain nothing but whitespace characters.
  text = text.gsub(/\n\s+\n/, "\n\n")

  # Collapse any run of two or more newlines down to exactly two.
  text = text.gsub(/\n{2,}/, "\n\n")

  # Replace a run of two or more consecutive " > \n" sequences with
  # "\n > \n > ".
  text = text.gsub(/( > \n){2,}/, "\n > \n > ")

  # Collapse consecutive tabs down to a single space. Per the original
  # author's note, tabs are used to pad divs and spans, so multiple nested
  # spans and divs end up surrounded by a single space.
  text = text.gsub(/\t+/, " ")

  # Remove a space before a period or question mark. Things like links get
  # surrounded by spaces; if they appear at the end of a sentence, this keeps
  # the punctuation attached to the preceding text.
  text = text.gsub(/ ([\.\?])/, '\1')

  # Replace each END_TAG(...) marker, together with any whitespace in front
  # of it, by the one to three characters inside its parentheses.
  text.gsub(/\s*END_TAG\((.{1,3})\)/, "\\1")
end