Class: WordToMarkdown::Converter

Inherits:
Object
  • Object
show all
Defined in:
lib/word-to-markdown/converter.rb

Constant Summary collapse

HEADING_DEPTH =

Number of headings to guess, e.g., h6

6
HEADING_STEP =

Percentile step for each heading

100 / HEADING_DEPTH
MIN_HEADING_SIZE =

Minimum heading size

20
UNICODE_BULLETS =

Unicode bullets to strip when processing

['', 'o', '', "\u2022", '\\p{C}'].freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(document) ⇒ Converter

Returns a new instance of Converter.

Parameters:



20
21
22
# File 'lib/word-to-markdown/converter.rb', line 20

# Build a converter bound to the document it will rewrite.
#
# @param document [Object] stored as-is in @document; the other methods of
#   this class call `#tree` on it
def initialize(document)
  @document = document
end

Instance Attribute Details

#documentObject (readonly)

Returns the value of attribute document.



5
6
7
# File 'lib/word-to-markdown/converter.rb', line 5

# Read-only accessor for the document held by this converter.
#
# @return [Object] the value currently stored in @document
def document
  @document
end

Instance Method Details

#convert!Object

Convert the document

Note: this action is destructive!



27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/word-to-markdown/converter.rb', line 27

# Run every clean-up pass over the document in a fixed order: font styles
# and headings first, then tables, then list items.
#
# Each step is a bang method of this class; the passes rewrite node names or
# inner HTML of nodes found through @document.tree, which is why the
# conversion is described as destructive.
#
# @return [Object] whatever the final step
#   (remove_numbering_from_list_items!) returns
def convert!
  # Fonts and headings: styled spans become strong/em, large text becomes hN
  semanticize_font_styles!
  semanticize_headings!

  # Tables: paragraphs inside cells become spans, first-row cells become th
  remove_paragraphs_from_tables!
  semanticize_table_headers!

  # List items: drop paragraph wrappers, then leading bullets, whitespace
  # and numbering, in that order
  remove_paragraphs_from_list_items!
  remove_unicode_bullets_from_list_items!
  remove_whitespace_from_list_items!
  remove_numbering_from_list_items!
end

#font_sizesArray<Integer>

Returns an array of font sizes for implicit headings in the document.

Returns:

  • (Array<Integer>)

    An array of font-sizes for implicit headings in the document



55
56
57
58
59
60
61
62
63
# File 'lib/word-to-markdown/converter.rb', line 55

# Distinct font sizes of elements carrying a style attribute, each rounded
# to the nearest ten and returned in ascending order. The result is
# memoized in @font_sizes, so the document is only scanned once.
#
# @return [Array<Integer>] unique rounded font sizes, smallest first
def font_sizes
  @font_sizes ||= @document.tree.css('[style]')
                           .map(&:font_size)
                           .compact # elements without a font size are skipped
                           .map { |size| size.round(-1) }
                           .uniq
                           .sort
end

#guess_heading(node) ⇒ String?

Given a Nokogiri node, guess what heading it represents, if any

Parameters:

  • node (Nokogiri::Node)

    the Nokogiri node

Returns:

  • (String, nil)

    the heading tag (e.g., H1), or nil



69
70
71
72
73
74
75
# File 'lib/word-to-markdown/converter.rb', line 69

# Map a node's font size to a heading tag name.
#
# Levels are tried from 1 up to (but excluding) HEADING_DEPTH; the first
# level whose threshold, h(level), the node's font size meets or exceeds
# wins.
#
# @param node [#font_size] the node to classify
# @return [String, nil] "h1", "h2", ... or nil when the node has no font
#   size or meets no threshold
def guess_heading(node)
  size = node.font_size
  return if size.nil?

  level = (1...HEADING_DEPTH).find { |candidate| size >= h(candidate) }
  level && "h#{level}"
end

#h(num) ⇒ Integer

Minimum font size required for a given heading e.g., H(2) would represent the minimum font size of an implicit h2

Parameters:

  • num (Integer)

    the heading number, e.g., 1, 2

Returns:

  • (Integer)

    the minimum font size



83
84
85
# File 'lib/word-to-markdown/converter.rb', line 83

# Font-size threshold for heading level +num+.
#
# The threshold is the percentile of font_sizes at
# ((HEADING_DEPTH - 1) - num) * HEADING_STEP, so lower heading numbers ask
# for a higher percentile.
#
# NOTE(review): `percentile` is not a core Array method; presumably it comes
# from an extension loaded elsewhere in the project -- confirm.
#
# @param num [Integer] the heading number, e.g. 1 or 2
# @return [Object] whatever font_sizes.percentile returns for that percentile
def h(num)
  steps_below_top = (HEADING_DEPTH - 1) - num
  font_sizes.percentile(steps_below_top * HEADING_STEP)
end

#implicit_headingsArray<Nokogiri::Node>

Return an array of Nokogiri Nodes that are implicit headings

Returns:

  • (Array<Nokogiri::Node>)

    Return an array of Nokogiri Nodes that are implicit headings



44
45
46
47
48
49
50
51
52
# File 'lib/word-to-markdown/converter.rb', line 44

# Elements with a style attribute whose font size is at least
# MIN_HEADING_SIZE. Elements without a font size are left out. The list is
# memoized in @implicit_headings.
#
# @return [Array] the candidate heading nodes, in document search order
def implicit_headings
  @implicit_headings ||= @document.tree.css('[style]').reject do |node|
    size = node.font_size
    size.nil? || size < MIN_HEADING_SIZE
  end
end

#remove_numbering_from_list_items!Object

Remove prepended numbers from list items



117
118
119
120
121
122
# File 'lib/word-to-markdown/converter.rb', line 117

# Strip a leading "<letters-or-digits>." marker from list-item spans.
#
# The selector is 'li span span' when WordToMarkdown.soffice.major_version
# is '5' and 'li span' otherwise. The pattern is anchored with ^, so it is
# applied at the start of every line of the span's inner HTML.
#
# NOTE(review): any leading alphanumeric run followed by a period matches,
# so a line beginning with a word such as "Note." is stripped too.
#
# @return [Object] the collection returned by the tree search
def remove_numbering_from_list_items!
  selector = 'li span'
  selector = 'li span span' if WordToMarkdown.soffice.major_version == '5'

  @document.tree.search(selector).each do |item|
    item.inner_html = item.inner_html.gsub(/^[a-zA-Z0-9]+\./m, '')
  end
end

#remove_paragraphs_from_list_items!Object

Remove top-level paragraphs from list items



104
105
106
# File 'lib/word-to-markdown/converter.rb', line 104

# Rename every node matched by 'li p' to a span, so list items no longer
# contain paragraph elements.
#
# @return [Object] the collection returned by the tree search
def remove_paragraphs_from_list_items!
  paragraphs = @document.tree.search('li p')
  paragraphs.each do |paragraph|
    paragraph.node_name = 'span'
  end
end

#remove_paragraphs_from_tables!Object

Remove top-level paragraphs from table cells



99
100
101
# File 'lib/word-to-markdown/converter.rb', line 99

# Rename every node matched by 'td p' to a span, so table cells no longer
# contain paragraph elements.
#
# @return [Object] the collection returned by the tree search
def remove_paragraphs_from_tables!
  paragraphs = @document.tree.search('td p')
  paragraphs.each do |paragraph|
    paragraph.node_name = 'span'
  end
end

#remove_unicode_bullets_from_list_items!Object

Remove prepended unicode bullets from list items



109
110
111
112
113
114
# File 'lib/word-to-markdown/converter.rb', line 109

# Strip leading bullet characters from list-item spans.
#
# The selector is 'li span span' when WordToMarkdown.soffice.major_version
# is '5' and 'li span' otherwise. The characters to strip come from
# UNICODE_BULLETS, joined into a single character class; a run of them at
# the start of a line of the span's inner HTML is removed.
#
# @return [Object] the collection returned by the tree search
def remove_unicode_bullets_from_list_items!
  selector = WordToMarkdown.soffice.major_version == '5' ? 'li span span' : 'li span'
  # The pattern does not depend on the span, so build it once up front.
  leading_bullets = /^([#{UNICODE_BULLETS.join("")}]+)/

  @document.tree.search(selector).each do |item|
    item.inner_html = item.inner_html.gsub(leading_bullets, '')
  end
end

#remove_whitespace_from_list_items!Object

Remove whitespace from list items



125
126
127
# File 'lib/word-to-markdown/converter.rb', line 125

# Trim leading and trailing whitespace from the inner HTML of every node
# matched by 'li span'.
#
# The stripped markup is assigned back through inner_html=. Calling strip!
# on the value returned by inner_html only changes that returned string,
# not the node, so nothing would be written back to the document.
#
# @return [Object] the collection returned by the tree search
def remove_whitespace_from_list_items!
  @document.tree.search('li span').each do |span|
    span.inner_html = span.inner_html.strip
  end
end

#semanticize_font_styles!Object

Convert span-based font styles to `strong`s and `em`s



88
89
90
91
92
93
94
95
96
# File 'lib/word-to-markdown/converter.rb', line 88

# Rename styled spans to semantic tags: a span reporting bold? becomes
# 'strong'; otherwise a span reporting italic? becomes 'em'. Bold takes
# precedence, and spans that are neither are left untouched.
#
# @return [Object] the collection returned by the tree's css lookup
def semanticize_font_styles!
  @document.tree.css('span').each do |span|
    tag = if span.bold?
            'strong'
          elsif span.italic?
            'em'
          end
    span.node_name = tag if tag
  end
end

#semanticize_headings!Object

Try to guess headings where implicit, based on font size



135
136
137
138
139
140
# File 'lib/word-to-markdown/converter.rb', line 135

# Rename each implicit heading candidate to the tag guess_heading picks for
# it. Candidates for which guess_heading returns nil keep their name.
#
# @return [Object] the collection returned by implicit_headings
def semanticize_headings!
  implicit_headings.each do |candidate|
    tag = guess_heading(candidate)
    next if tag.nil?

    candidate.node_name = tag
  end
end

#semanticize_table_headers!Object

Convert table headers to `th`s



130
131
132
# File 'lib/word-to-markdown/converter.rb', line 130

# Rename every node matched by 'table tr:first td' to 'th', turning the
# cells of a table's first row into header cells.
#
# @return [Object] the collection returned by the tree search
def semanticize_table_headers!
  header_cells = @document.tree.search('table tr:first td')
  header_cells.each do |cell|
    cell.node_name = 'th'
  end
end