Class: WordToMarkdown::Converter
- Inherits:
-
Object
- Object
- WordToMarkdown::Converter
- Defined in:
- lib/word-to-markdown/converter.rb
Constant Summary collapse
- HEADING_DEPTH =
Number of headings to guess, e.g., h6
6
- HEADING_STEP =
Percentile step for each heading
100 / HEADING_DEPTH
- MIN_HEADING_SIZE =
Minimum heading size
20
- UNICODE_BULLETS =
Unicode bullets to strip when processing
['○', 'o', '●', "\u2022", '\\p{C}'].freeze
Instance Attribute Summary collapse
-
#document ⇒ Object
readonly
Returns the value of attribute document.
Instance Method Summary collapse
-
#convert! ⇒ Object
Convert the document.
-
#font_sizes ⇒ Array<Integer>
An array of font-sizes for implicit headings in the document.
-
#guess_heading(node) ⇒ String?
Given a Nokogiri node, guess what heading it represents, if any.
-
#h(num) ⇒ Integer
Minimum font size required for a given heading, e.g., `h(2)` would represent the minimum font size of an implicit h2.
-
#implicit_headings ⇒ Array<Nokogiri::Node>
Return an array of Nokogiri Nodes that are implicit headings.
-
#initialize(document) ⇒ Converter
constructor
A new instance of Converter.
-
#remove_numbering_from_list_items! ⇒ Object
Remove prepended numbers from list items.
-
#remove_paragraphs_from_list_items! ⇒ Object
Remove top-level paragraphs from list items.
-
#remove_paragraphs_from_tables! ⇒ Object
Remove top-level paragraphs from table cells.
-
#remove_unicode_bullets_from_list_items! ⇒ Object
Remove prepended unicode bullets from list items.
-
#remove_whitespace_from_list_items! ⇒ Object
Remove whitespace from list items.
-
#semanticize_font_styles! ⇒ Object
Convert span-based font styles to `strong`s and `em`s.
-
#semanticize_headings! ⇒ Object
Try to guess headings where implicit, based on font size.
-
#semanticize_table_headers! ⇒ Object
Convert table headers to `th`s.
Constructor Details
#initialize(document) ⇒ Converter
Returns a new instance of Converter.
20 21 22 |
# File 'lib/word-to-markdown/converter.rb', line 20 def initialize(document) @document = document end |
Instance Attribute Details
#document ⇒ Object (readonly)
Returns the value of attribute document.
5 6 7 |
# File 'lib/word-to-markdown/converter.rb', line 5 def document @document end |
Instance Method Details
#convert! ⇒ Object
Convert the document
Note: this action is destructive!
27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
# File 'lib/word-to-markdown/converter.rb', line 27
# Run every clean-up pass over the document, in a fixed order.
#
# Note: this action is destructive -- the passes rewrite the document tree.
#
# @return [Object] whatever the final pass returns
def convert!
  passes = [
    # Fonts and headings
    :semanticize_font_styles!,
    :semanticize_headings!,
    # Tables
    :remove_paragraphs_from_tables!,
    :semanticize_table_headers!,
    # List items
    :remove_paragraphs_from_list_items!,
    :remove_unicode_bullets_from_list_items!,
    :remove_whitespace_from_list_items!,
    :remove_numbering_from_list_items!
  ]

  # Order matters; the value of the last pass is handed back to the caller.
  passes.map { |pass| send(pass) }.last
end
#font_sizes ⇒ Array<Integer>
Returns an array of font-sizes for implicit headings in the document.
55 56 57 58 59 60 61 62 63 |
# File 'lib/word-to-markdown/converter.rb', line 55
# Distinct font sizes found on elements that carry a style attribute.
#
# Each size is rounded to the nearest ten before de-duplication, and the
# result is memoised in @font_sizes.
#
# @return [Array<Integer>] unique font sizes in ascending order
def font_sizes
  return @font_sizes if @font_sizes

  styled_elements = @document.tree.css('[style]')
  known_sizes = styled_elements.map(&:font_size).compact
  @font_sizes = known_sizes.map { |size| size.round(-1) }.uniq.sort
end
#guess_heading(node) ⇒ String?
Given a Nokogiri node, guess what heading it represents, if any
69 70 71 72 73 74 75 |
# File 'lib/word-to-markdown/converter.rb', line 69
# Given a node, guess which heading it represents, if any.
#
# Levels 1 up to (but not including) HEADING_DEPTH are tried from the largest
# heading downwards; the first level whose minimum size the node meets wins.
#
# @param node a node responding to #font_size
# @return [String, nil] a tag name such as "h2", or nil when the node has no
#   font size or meets no threshold
def guess_heading(node)
  size = node.font_size
  return nil if size.nil?

  level = (1...HEADING_DEPTH).find { |candidate| size >= h(candidate) }
  level && "h#{level}"
end
#h(num) ⇒ Integer
Minimum font size required for a given heading, e.g., `h(2)` would represent the minimum font size of an implicit h2
83 84 85 |
# File 'lib/word-to-markdown/converter.rb', line 83
# Minimum font size required for a given heading level, e.g. h(2) is the
# smallest font size treated as an implicit h2.
#
# The threshold is a percentile of #font_sizes: each level closer to 1 moves
# HEADING_STEP percentage points higher.
# NOTE(review): Array#percentile is not core Ruby -- presumably supplied by an
# extension loaded elsewhere; confirm.
#
# @param num [Integer] heading level
# @return [Integer] minimum font size for that level
def h(num)
  steps_above_smallest = (HEADING_DEPTH - 1) - num
  font_sizes.percentile(steps_above_smallest * HEADING_STEP)
end
#implicit_headings ⇒ Array<Nokogiri::Node>
Return an array of Nokogiri Nodes that are implicit headings
44 45 46 47 48 49 50 51 52 |
# File 'lib/word-to-markdown/converter.rb', line 44
# Styled elements large enough to be treated as implicit headings.
#
# An element qualifies when it has a font size and that size is not below
# MIN_HEADING_SIZE. The result is memoised in @implicit_headings.
#
# @return [Array] qualifying elements, in document order
def implicit_headings
  return @implicit_headings if @implicit_headings

  styled_elements = @document.tree.css('[style]')
  @implicit_headings = styled_elements.reject do |element|
    element.font_size.nil? || element.font_size < MIN_HEADING_SIZE
  end
end
#remove_numbering_from_list_items! ⇒ Object
Remove prepended numbers from list items
117 118 119 120 121 122 |
# File 'lib/word-to-markdown/converter.rb', line 117
# Remove prepended numbering (e.g. "1." or "a.") from list items.
#
# The span selector depends on the soffice major version: version "5" nests
# the text one span deeper.
#
# @return [Object] the collection returned by the tree search
def remove_numbering_from_list_items!
  selector = if WordToMarkdown.soffice.major_version == '5'
               'li span span'
             else
               'li span'
             end

  # `^` anchors at the start of every line, so a marker after a newline inside
  # the span is removed as well.
  numbering = /^[a-zA-Z0-9]+\./m

  @document.tree.search(selector).each do |span|
    span.inner_html = span.inner_html.gsub(numbering, '')
  end
end
#remove_paragraphs_from_list_items! ⇒ Object
Remove top-level paragraphs from list items
104 105 106 |
# File 'lib/word-to-markdown/converter.rb', line 104
# Turn every paragraph nested inside a list item into a span.
#
# @return [Object] the collection returned by the tree search
def remove_paragraphs_from_list_items!
  paragraphs = @document.tree.search('li p')
  paragraphs.each do |paragraph|
    paragraph.node_name = 'span'
  end
end
#remove_paragraphs_from_tables! ⇒ Object
Remove top-level paragraphs from table cells
99 100 101 |
# File 'lib/word-to-markdown/converter.rb', line 99
# Turn every paragraph nested inside a table cell into a span.
#
# @return [Object] the collection returned by the tree search
def remove_paragraphs_from_tables!
  cell_paragraphs = @document.tree.search('td p')
  cell_paragraphs.each do |paragraph|
    paragraph.node_name = 'span'
  end
end
#remove_unicode_bullets_from_list_items! ⇒ Object
Remove prepended unicode bullets from list items
109 110 111 112 113 114 |
# File 'lib/word-to-markdown/converter.rb', line 109
# Remove prepended bullet characters from list items.
#
# The span selector depends on the soffice major version: version "5" nests
# the text one span deeper. Any leading run of characters listed in
# UNICODE_BULLETS is removed.
# NOTE(review): UNICODE_BULLETS includes the plain letter "o", so text that
# starts with "o" loses that letter too.
#
# @return [Object] the collection returned by the tree search
def remove_unicode_bullets_from_list_items!
  selector = if WordToMarkdown.soffice.major_version == '5'
               'li span span'
             else
               'li span'
             end

  # Build the character class once; it does not vary per span.
  leading_bullets = /^([#{UNICODE_BULLETS.join("")}]+)/

  @document.tree.search(selector).each do |span|
    span.inner_html = span.inner_html.gsub(leading_bullets, '')
  end
end
#remove_whitespace_from_list_items! ⇒ Object
Remove whitespace from list items
125 126 127 |
# File 'lib/word-to-markdown/converter.rb', line 125
# Remove leading and trailing whitespace from list-item spans.
#
# The stripped markup is assigned back to the span. Calling `strip!` on the
# value returned by `inner_html` would only change that returned string and
# leave the node untouched, so the result is written back explicitly, as the
# other list-item passes do.
#
# @return [Object] the collection returned by the tree search
def remove_whitespace_from_list_items!
  @document.tree.search('li span').each do |span|
    span.inner_html = span.inner_html.strip
  end
end
#semanticize_font_styles! ⇒ Object
Convert span-based font styles to `strong`s and `em`s
88 89 90 91 92 93 94 95 96 |
# File 'lib/word-to-markdown/converter.rb', line 88
# Convert span-based font styles to semantic tags.
#
# A bold span becomes `strong`; otherwise an italic span becomes `em`. Bold
# takes precedence when a span is both. Other spans are left unchanged.
#
# @return [Object] the collection returned by the css query
def semanticize_font_styles!
  @document.tree.css('span').each do |span|
    replacement =
      if span.bold?
        'strong'
      elsif span.italic?
        'em'
      end

    span.node_name = replacement unless replacement.nil?
  end
end
#semanticize_headings! ⇒ Object
Try to guess headings where implicit, based on font size
135 136 137 138 139 140 |
# File 'lib/word-to-markdown/converter.rb', line 135
# Rename each implicit heading to the heading tag guessed from its font size.
#
# Elements for which no heading can be guessed keep their current tag.
#
# @return [Object] the collection returned by #implicit_headings
def semanticize_headings!
  implicit_headings.each do |candidate|
    tag = guess_heading(candidate)
    next if tag.nil?

    candidate.node_name = tag
  end
end
#semanticize_table_headers! ⇒ Object
Convert table headers to `th`s
130 131 132 |
# File 'lib/word-to-markdown/converter.rb', line 130
# Promote the cells of each table's first row to header cells (`th`).
#
# @return [Object] the collection returned by the tree search
def semanticize_table_headers!
  first_row_cells = @document.tree.search('table tr:first td')
  first_row_cells.each do |cell|
    cell.node_name = 'th'
  end
end