Module: Markitdown
- Defined in:
- lib/markitdown.rb,
lib/markitdown/version.rb
Constant Summary collapse
- VERSION =
"0.3.1"
Class Method Summary collapse
- .from_html(html, language_classifier = nil) ⇒ Object
- .from_nokogiri(node, language_classifier = nil) ⇒ Object
Class Method Details
.from_html(html, language_classifier = nil) ⇒ Object
7 8 9 |
# File 'lib/markitdown.rb', line 7
# Parses a markup string and converts the resulting document's root node
# by delegating to from_nokogiri.
#
# NOTE(review): the input is parsed with Nokogiri::XML rather than an HTML
# parser — presumably intentional; confirm before changing.
#
# @param html markup source handed to Nokogiri::XML (presumably a String — confirm against callers)
# @param language_classifier forwarded unchanged to from_nokogiri; its role is not visible here
# @return [Object] whatever from_nokogiri returns for the document root
def self.from_html(html, language_classifier=nil)
  document = Nokogiri::XML(html)
  from_nokogiri(document.root, language_classifier)
end
.from_nokogiri(node, language_classifier = nil) ⇒ Object
11 12 13 14 15 16 17 |
# File 'lib/markitdown.rb', line 11
# Runs parse_node over the given node, joins the pieces it returns into a
# single string, and then applies a fixed sequence of textual clean-ups.
#
# @param node the node handed to parse_node (presumably a Nokogiri node — confirm against callers)
# @param language_classifier forwarded unchanged to parse_node; its role is not visible here
# @return [String] the joined and cleaned-up text
def self.from_nokogiri(node, language_classifier=nil)
  fragments = self.parse_node(node, [], language_classifier)
  raw_text = fragments.flatten.compact.join

  # Ordered (pattern, replacement) pairs. The order matters: each step
  # operates on the output of the previous one.
  cleanup_steps = [
    # Blank out lines that contain nothing but whitespace.
    [/\n\s+\n/, "\n\n"],
    # Collapse any run of two or more newlines down to exactly two.
    [/\n{2,}/, "\n\n"],
    # Replace two or more consecutive " > \n" sequences with "\n > \n > "
    # (these look like empty blockquote lines — TODO confirm against parse_node).
    [/( > \n){2,}/, "\n > \n > "],
    # Collapse consecutive tabs into a single space. Per the original author's
    # note, tabs are used to pad divs and spans, so nested divs/spans end up
    # surrounded by just one space.
    [/\t+/, " "],
    # Drop a space that sits directly before a period or question mark.
    # Per the original author's note, things like links get surrounded by
    # spaces; this keeps the punctuation tight at the end of a sentence.
    [/ ([\.\?])/, '\1'],
    # Replace an END_TAG(...) marker, together with any whitespace before it,
    # by the 1-3 characters captured between its parentheses.
    [/\s*END_TAG\((.{1,3})\)/, "\\1"],
    # Turn non-breaking spaces (U+00A0) into ordinary spaces.
    [/\u00a0/, " "]
  ]

  cleanup_steps.inject(raw_text) do |text, (pattern, replacement)|
    text.gsub(pattern, replacement)
  end
end