Class: WordToMarkdown::Converter
- Inherits:
-
Object
- Object
- WordToMarkdown::Converter
- Defined in:
- lib/word-to-markdown/converter.rb
Constant Summary collapse
- HEADING_DEPTH =
Number of headings to guess, e.g., h6
6
- HEADING_STEP =
100/HEADING_DEPTH
- MIN_HEADING_SIZE =
20
- UNICODE_BULLETS =
["○", "o", "●", "\u2022", "\\p{C}"]
Instance Attribute Summary collapse
-
#document ⇒ Object
readonly
Returns the value of attribute document.
Instance Method Summary collapse
- #convert! ⇒ Object
-
#font_sizes ⇒ Object
Returns an array of font-sizes for implicit headings in the document.
-
#guess_heading(node) ⇒ Object
Given a Nokogiri node, guess what heading it represents, if any.
-
#h(n) ⇒ Object
Minimum font size required for a given heading; e.g., h(2) would represent the minimum font size of an implicit h2.
-
#implicit_headings ⇒ Object
Returns an array of Nokogiri nodes that are implicit headings.
-
#initialize(document) ⇒ Converter
constructor
A new instance of Converter.
- #remove_numbering_from_list_items! ⇒ Object
- #remove_paragraphs_from_list_items! ⇒ Object
- #remove_paragraphs_from_tables! ⇒ Object
- #remove_unicode_bullets_from_list_items! ⇒ Object
- #remove_whitespace_from_list_items! ⇒ Object
- #semanticize_font_styles! ⇒ Object
-
#semanticize_headings! ⇒ Object
Try to guess headings, where implicit, based on font size.
- #semanticize_table_headers! ⇒ Object
Constructor Details
#initialize(document) ⇒ Converter
Returns a new instance of Converter.
12 13 14 |
# File 'lib/word-to-markdown/converter.rb', line 12 def initialize(document) @document = document end |
Instance Attribute Details
#document ⇒ Object (readonly)
Returns the value of attribute document.
5 6 7 |
# File 'lib/word-to-markdown/converter.rb', line 5 def document @document end |
Instance Method Details
#convert! ⇒ Object
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
# File 'lib/word-to-markdown/converter.rb', line 16 def convert! # Fonts and headings semanticize_font_styles! semanticize_headings! # Tables remove_paragraphs_from_tables! semanticize_table_headers! # list items remove_paragraphs_from_list_items! remove_unicode_bullets_from_list_items! remove_whitespace_from_list_items! remove_numbering_from_list_items! end |
#font_sizes ⇒ Object
Returns an array of font-sizes for implicit headings in the document
44 45 46 47 48 49 50 51 52 |
# File 'lib/word-to-markdown/converter.rb', line 44
# Collects the distinct font sizes used by styled elements in the document.
#
# Every element carrying a style attribute is inspected; elements whose
# font_size is nil are skipped. Sizes are rounded with round(-1) (i.e. to
# the nearest ten) before de-duplication. The result is memoized.
#
# Returns a sorted array of unique, rounded font sizes.
def font_sizes
  return @font_sizes if @font_sizes

  styled_elements = @document.tree.css("[style]")
  rounded_sizes = styled_elements.map(&:font_size).compact.map { |size| size.round(-1) }
  @font_sizes = rounded_sizes.uniq.sort
end
#guess_heading(node) ⇒ Object
Given a Nokogiri node, guess what heading it represents, if any
node - the Nokogiri node
returns the heading tag (e.g., h1), or nil
59 60 61 62 63 64 65 |
# File 'lib/word-to-markdown/converter.rb', line 59
# Guesses which heading level, if any, a node represents from its font size.
#
# Levels 1 up to (but not including) HEADING_DEPTH are tried in ascending
# order; the first level whose minimum size (see #h) the node meets wins.
#
# node - the node to inspect; must respond to font_size
#
# Returns the heading tag name (e.g. "h1"), or nil when the node has no
# font size or is smaller than every threshold.
def guess_heading(node)
  size = node.font_size
  return if size.nil?

  level = (1...HEADING_DEPTH).find { |candidate| size >= h(candidate) }
  level && "h#{level}"
end
#h(n) ⇒ Object
Minimum font size required for a given heading; e.g., h(2) would represent the minimum font size of an implicit h2
n - the heading number, e.g., 1, 2
returns the minimum font size as an integer
73 74 75 |
# File 'lib/word-to-markdown/converter.rb', line 73
# Minimum font size an element needs to count as an implicit heading of
# level n.
#
# The threshold is the ((HEADING_DEPTH - 1) - n) * HEADING_STEP percentile
# of the document's font sizes, so lower heading numbers map to higher
# percentiles.
#
# NOTE(review): `percentile` is not a core Array method; presumably it is
# supplied by a statistics extension loaded elsewhere - confirm.
#
# n - the heading number, e.g., 1, 2
#
# Returns whatever `percentile` yields for that rank.
def h(n)
  percentile_rank = ((HEADING_DEPTH - 1) - n) * HEADING_STEP
  font_sizes.percentile(percentile_rank)
end
#implicit_headings ⇒ Object
Returns an array of Nokogiri nodes that are implicit headings
33 34 35 36 37 38 39 40 41 |
# File 'lib/word-to-markdown/converter.rb', line 33
# Finds the elements that are candidates for being implicit headings.
#
# A candidate is any element with a style attribute whose font_size is
# present and not smaller than MIN_HEADING_SIZE. The result is memoized.
#
# Returns an array of nodes.
def implicit_headings
  return @implicit_headings if @implicit_headings

  @implicit_headings = @document.tree.css("[style]").reject do |element|
    size = element.font_size
    size.nil? || size < MIN_HEADING_SIZE
  end
end
#remove_numbering_from_list_items! ⇒ Object
102 103 104 105 106 107 |
# File 'lib/word-to-markdown/converter.rb', line 102
# Strips literal list numbering (a run of letters/digits followed by a
# period at the start of a line, e.g. "1." or "a.") from list item spans.
#
# The spans are one level deeper ("li span span") when the soffice major
# version is "5"; otherwise "li span" is used.
def remove_numbering_from_list_items!
  selector = if WordToMarkdown.soffice.major_version == "5"
               "li span span"
             else
               "li span"
             end

  @document.tree.search(selector).each do |item|
    item.inner_html = item.inner_html.gsub(/^[a-zA-Z0-9]+\./m, "")
  end
end
#remove_paragraphs_from_list_items! ⇒ Object
91 92 93 |
# File 'lib/word-to-markdown/converter.rb', line 91
# Renames every paragraph nested in a list item to a span, keeping its
# content in place.
def remove_paragraphs_from_list_items!
  @document.tree.search("li p").each do |paragraph|
    paragraph.node_name = "span"
  end
end
#remove_paragraphs_from_tables! ⇒ Object
87 88 89 |
# File 'lib/word-to-markdown/converter.rb', line 87
# Renames every paragraph nested in a table cell to a span, keeping its
# content in place.
def remove_paragraphs_from_tables!
  @document.tree.search("td p").each do |paragraph|
    paragraph.node_name = "span"
  end
end
#remove_unicode_bullets_from_list_items! ⇒ Object
95 96 97 98 99 100 |
# File 'lib/word-to-markdown/converter.rb', line 95
# Strips leading bullet characters (any run of the characters listed in
# UNICODE_BULLETS at the start of a line) from list item spans.
#
# The spans are one level deeper ("li span span") when the soffice major
# version is "5"; otherwise "li span" is used.
def remove_unicode_bullets_from_list_items!
  selector = if WordToMarkdown.soffice.major_version == "5"
               "li span span"
             else
               "li span"
             end

  # The pattern depends only on a constant, so build it once per call.
  leading_bullets = /^([#{UNICODE_BULLETS.join("")}]+)/

  @document.tree.search(selector).each do |item|
    item.inner_html = item.inner_html.gsub(leading_bullets, "")
  end
end
#remove_whitespace_from_list_items! ⇒ Object
109 110 111 |
# File 'lib/word-to-markdown/converter.rb', line 109
# Trims leading and trailing whitespace from the markup of every span
# inside a list item.
#
# The stripped markup is assigned back to the span. Calling `strip!` on
# the value returned by `inner_html` only mutates a temporary string and
# leaves the node untouched, so the result must be written back, as the
# other list-item passes do.
def remove_whitespace_from_list_items!
  @document.tree.search("li span").each do |span|
    span.inner_html = span.inner_html.strip
  end
end
#semanticize_font_styles! ⇒ Object
77 78 79 80 81 82 83 84 85 |
# File 'lib/word-to-markdown/converter.rb', line 77
# Replaces presentational spans with semantic tags: a span reporting
# bold? becomes <strong>; otherwise a span reporting italic? becomes <em>.
# Bold takes precedence, and spans that are neither are left unchanged.
def semanticize_font_styles!
  @document.tree.css("span").each do |span|
    tag = if span.bold?
            "strong"
          elsif span.italic?
            "em"
          end

    span.node_name = tag if tag
  end
end
#semanticize_headings! ⇒ Object
Try to guess headings, where implicit, based on font size
118 119 120 121 122 123 |
# File 'lib/word-to-markdown/converter.rb', line 118
# Promotes implicit headings to real heading tags.
#
# Each candidate from #implicit_headings is renamed to the tag suggested
# by #guess_heading; candidates with no suggestion are left as they are.
def semanticize_headings!
  implicit_headings.each do |candidate|
    tag = guess_heading(candidate)
    next if tag.nil?

    candidate.node_name = tag
  end
end
#semanticize_table_headers! ⇒ Object
113 114 115 |
# File 'lib/word-to-markdown/converter.rb', line 113
# Renames the cells matched by "table tr:first td" from td to th, marking
# them as table headers.
def semanticize_table_headers!
  @document.tree.search("table tr:first td").each do |cell|
    cell.node_name = "th"
  end
end