Class: WordToMarkdown::Converter

Inherits:
Object
  • Object
show all
Defined in:
lib/word-to-markdown/converter.rb

Constant Summary collapse

HEADING_DEPTH =

Number of headings to guess, e.g., h6

6
HEADING_STEP =
100/HEADING_DEPTH
MIN_HEADING_SIZE =
20
UNICODE_BULLETS =
["", "o", "", "\uF0B7", "\u2022", "\uF0A7"]

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(document) ⇒ Converter

Returns a new instance of Converter.



12
13
14
# File 'lib/word-to-markdown/converter.rb', line 12

# Builds a converter around the given document.
#
# document - the object to convert; the other methods in this file read
#            its parsed markup through `document.tree`.
def initialize(document)
  @document = document
end

Instance Attribute Details

#documentObject (readonly)

Returns the value of attribute document.



5
6
7
# File 'lib/word-to-markdown/converter.rb', line 5

# Returns the document that was handed to the constructor (read-only).
def document
  @document
end

Instance Method Details

#convert!Object



16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# File 'lib/word-to-markdown/converter.rb', line 16

# Runs every clean-up pass over the document tree, in a fixed order:
# font styles and headings first, then tables, then list items.
# Each pass modifies the document's tree in place.
def convert!
  # Fonts and headings: styled spans become <strong>/<em>, and large text
  # is promoted to heading tags.
  semanticize_font_styles!
  semanticize_headings!

  # Tables: paragraphs inside cells are renamed, then first-row cells
  # become header cells.
  remove_paragraphs_from_tables!
  semanticize_table_headers!

  # List items: paragraphs are renamed to spans first, because the
  # remaining passes all select "li span".
  remove_paragraphs_from_list_items!
  remove_unicode_bullets_from_list_items!
  remove_whitespace_from_list_items!
  remove_numbering_from_list_items!
end

#font_sizesObject

Returns an array of font-sizes for implicit headings in the document



44
45
46
47
48
49
50
51
52
# File 'lib/word-to-markdown/converter.rb', line 44

# Returns the distinct font sizes used by styled elements in the document,
# each rounded to the nearest ten, in ascending order.
#
# Elements whose font_size is nil are skipped. The result is memoized in
# @font_sizes.
def font_sizes
  @font_sizes ||= @document.tree.css("[style]")
                           .map(&:font_size)
                           .compact
                           .map { |size| size.round(-1) }
                           .uniq
                           .sort
end

#guess_heading(node) ⇒ Object

Given a Nokogiri node, guess what heading it represents, if any

node - the Nokogiri node

returns the heading tag (e.g., h1), or nil



59
60
61
62
63
64
65
# File 'lib/word-to-markdown/converter.rb', line 59

# Guesses which heading tag a node represents from its font size.
#
# node - a node responding to font_size
#
# Levels 1 through HEADING_DEPTH - 1 are tried in order, and the first
# level whose minimum size (see #h) the node meets wins.
#
# Returns the tag name as a String (e.g. "h1"), or nil when the node has
# no font size or meets none of the thresholds.
def guess_heading(node)
  return nil if node.font_size == nil

  # Exclusive range: level HEADING_DEPTH itself is never tried.
  level = (1...HEADING_DEPTH).find { |candidate| node.font_size >= h(candidate) }
  level ? "h#{level}" : nil
end

#h(n) ⇒ Object

Minimum font size required for a given heading; e.g., h(2) would represent the minimum font size of an implicit h2.

n - the heading number, e.g., 1, 2

returns the minimum font size as an integer



73
74
75
# File 'lib/word-to-markdown/converter.rb', line 73

# Minimum font size an element needs to count as an implicit heading of
# level n.
#
# n - the heading number, e.g. 1, 2
#
# The threshold is the percentile of #font_sizes at
# ((HEADING_DEPTH - 1) - n) * HEADING_STEP, so smaller heading numbers
# require larger sizes.
# NOTE(review): `percentile` is not core Ruby; presumably an Array
# extension loaded elsewhere -- confirm.
#
# Returns whatever font_sizes.percentile returns for that rank.
def h(n)
  percentile_rank = ((HEADING_DEPTH - 1) - n) * HEADING_STEP
  font_sizes.percentile(percentile_rank)
end

#implicit_headingsObject

Returns an array of Nokogiri nodes that are implicit headings



33
34
35
36
37
38
39
40
41
# File 'lib/word-to-markdown/converter.rb', line 33

# Returns an Array of the styled elements that could be implicit headings:
# those with a font size of at least MIN_HEADING_SIZE.
#
# Elements with a nil font_size are excluded. The result is memoized in
# @implicit_headings.
def implicit_headings
  @implicit_headings ||= @document.tree.css("[style]").each_with_object([]) do |element, candidates|
    next if element.font_size.nil? || element.font_size < MIN_HEADING_SIZE

    candidates.push element
  end
end

#remove_numbering_from_list_items!Object



101
102
103
104
105
# File 'lib/word-to-markdown/converter.rb', line 101

# Strips literal list numbering from list items.
#
# For every span inside an <li>, any run of letters or digits followed by
# a period at the start of a line (e.g. "1." or "a.") is deleted from the
# span's inner HTML.
def remove_numbering_from_list_items!
  list_spans = @document.tree.search("li span")
  list_spans.each do |item|
    item.inner_html = item.inner_html.gsub(/^[a-zA-Z0-9]+\./m, "")
  end
end

#remove_paragraphs_from_list_items!Object



91
92
93
# File 'lib/word-to-markdown/converter.rb', line 91

# Renames every <p> nested inside an <li> to <span>, in place.
def remove_paragraphs_from_list_items!
  paragraphs = @document.tree.search("li p")
  paragraphs.each do |paragraph|
    paragraph.node_name = "span"
  end
end

#remove_paragraphs_from_tables!Object



87
88
89
# File 'lib/word-to-markdown/converter.rb', line 87

# Renames every <p> nested inside a <td> to <span>, in place.
def remove_paragraphs_from_tables!
  cell_paragraphs = @document.tree.search("td p")
  cell_paragraphs.each do |paragraph|
    paragraph.node_name = "span"
  end
end

#remove_unicode_bullets_from_list_items!Object



95
96
97
98
99
# File 'lib/word-to-markdown/converter.rb', line 95

# Strips literal bullet characters from the start of list items.
#
# For every span inside an <li>, a leading run of any characters listed in
# UNICODE_BULLETS is deleted from the span's inner HTML.
def remove_unicode_bullets_from_list_items!
  list_spans = @document.tree.search("li span")
  list_spans.each do |item|
    # Character class assembled from UNICODE_BULLETS, anchored at line start.
    leading_bullets = /^([#{UNICODE_BULLETS.join("")}]+)/
    item.inner_html = item.inner_html.gsub(leading_bullets, "")
  end
end

#remove_whitespace_from_list_items!Object



107
108
109
# File 'lib/word-to-markdown/converter.rb', line 107

# Trims leading and trailing whitespace from the inner HTML of every span
# inside an <li>.
#
# The stripped markup is assigned back through inner_html=, matching the
# other list-item passes. Calling strip! on the value returned by
# inner_html would only mutate a throwaway string and leave the document
# untouched.
def remove_whitespace_from_list_items!
  @document.tree.search("li span").each do |span|
    span.inner_html = span.inner_html.strip
  end
end

#semanticize_font_styles!Object



77
78
79
80
81
82
83
84
85
# File 'lib/word-to-markdown/converter.rb', line 77

# Converts styled spans into semantic tags, in place.
#
# A span reporting bold? becomes <strong>; otherwise a span reporting
# italic? becomes <em>. Bold takes precedence, and spans that are neither
# are left alone.
def semanticize_font_styles!
  @document.tree.css("span").each do |node|
    next node.node_name = "strong" if node.bold?

    node.node_name = "em" if node.italic?
  end
end

#semanticize_headings!Object

Try to guess headings where they are implicit, based on font size



116
117
118
119
120
121
# File 'lib/word-to-markdown/converter.rb', line 116

# Promotes implicit headings to real heading tags, in place.
#
# Every element returned by #implicit_headings is renamed to the tag
# #guess_heading picks for it; elements with no guess are left unchanged.
def semanticize_headings!
  implicit_headings.each do |element|
    tag = guess_heading(element)
    next if tag.nil?

    element.node_name = tag
  end
end

#semanticize_table_headers!Object



111
112
113
# File 'lib/word-to-markdown/converter.rb', line 111

def semanticize_table_headers!
  @document.tree.search("table tr:first td").each { |node| node.node_name = "th" }
end