Class: WordToMarkdown::Converter

Inherits:

Object

Object
WordToMarkdown::Converter

show all

Defined in: lib/word-to-markdown/converter.rb

Constant Summary collapse

HEADING_DEPTH = Number of headings to guess, e.g., h6

HEADING_STEP =

100/HEADING_DEPTH

MIN_HEADING_SIZE =

UNICODE_BULLETS =

["○", "o", "●", "\u2022", "\\p{C}"]

Instance Attribute Summary collapse

#document ⇒ Object readonly

Returns the value of attribute document.

Instance Method Summary collapse

#convert! ⇒ Object
#font_sizes ⇒ Object

Returns an array of font-sizes for implicit headings in the document.
#guess_heading(node) ⇒ Object

Given a Nokogiri node, guess what heading it represents, if any.
#h(n) ⇒ Object

Minimum font size required for a given heading; e.g., h(2) is the minimum font size of an implicit h2.
#implicit_headings ⇒ Object

Returns an array of Nokogiri nodes that are implicit headings.
#initialize(document) ⇒ Converter constructor

A new instance of Converter.
#remove_numbering_from_list_items! ⇒ Object
#remove_paragraphs_from_list_items! ⇒ Object
#remove_paragraphs_from_tables! ⇒ Object
#remove_unicode_bullets_from_list_items! ⇒ Object
#remove_whitespace_from_list_items! ⇒ Object
#semanticize_font_styles! ⇒ Object
#semanticize_headings! ⇒ Object

Try to guess headings, where implicit, based on font size.
#semanticize_table_headers! ⇒ Object

Constructor Details

#initialize(document) ⇒ `Converter`

Returns a new instance of Converter.



12
13
14

# File 'lib/word-to-markdown/converter.rb', line 12

# Builds a converter bound to a document.
#
# document - the object to convert; stored unchanged in @document and
#            exposed through #document.
def initialize(document)
  @document = document
end

Instance Attribute Details

#document ⇒ `Object` (readonly)

Returns the value of attribute document.



5
6
7

# File 'lib/word-to-markdown/converter.rb', line 5

# Read-only accessor.
#
# Returns the value currently held in @document.
def document
  @document
end

Instance Method Details

#convert! ⇒ `Object`

# File 'lib/word-to-markdown/converter.rb', line 16

# Runs every conversion pass on the document, one after another, in the
# order listed below.
#
# Returns whatever the final pass (remove_numbering_from_list_items!) returns.
def convert!
  passes = [
    # Fonts and headings
    :semanticize_font_styles!,
    :semanticize_headings!,

    # Tables
    :remove_paragraphs_from_tables!,
    :semanticize_table_headers!,

    # List items
    :remove_paragraphs_from_list_items!,
    :remove_unicode_bullets_from_list_items!,
    :remove_whitespace_from_list_items!,
    :remove_numbering_from_list_items!
  ]

  # Fold over the passes so the value of the last one is what we hand back.
  passes.inject(nil) { |_previous, pass| send(pass) }
end

#font_sizes ⇒ `Object`

Returns an array of font-sizes for implicit headings in the document

# File 'lib/word-to-markdown/converter.rb', line 44

# Collects the distinct font sizes used by styled elements in the document.
#
# Every element matching the "[style]" selector whose font_size is non-nil
# contributes that size rounded with round(-1). The list is de-duplicated,
# sorted ascending and memoized in @font_sizes.
#
# Returns the sorted array of rounded sizes.
def font_sizes
  @font_sizes ||= @document.tree.css("[style]")
                           .map(&:font_size)
                           .compact
                           .map { |size| size.round(-1) }
                           .uniq
                           .sort
end

#guess_heading(node) ⇒ `Object`

Given a Nokogiri node, guess what heading it represents, if any

node - the Nokogiri node

Returns the heading tag (e.g., "h1"), or nil

# File 'lib/word-to-markdown/converter.rb', line 59

# Given a node, guess which heading it represents, if any.
#
# Heading levels 1 up to (but not including) HEADING_DEPTH are tried in
# ascending order; the first level whose threshold h(level) the node's
# font size meets or exceeds wins.
#
# node - the node to inspect; must respond to font_size
#
# Returns the heading tag name (e.g., "h1"), or nil when the node has no
# font size or is smaller than every threshold.
def guess_heading(node)
  size = node.font_size
  return if size.nil?

  level = (1...HEADING_DEPTH).find { |candidate| size >= h(candidate) }
  level && "h#{level}"
end

#h(n) ⇒ `Object`

Minimum font size required for a given heading; e.g., h(2) is the minimum font size of an implicit h2.

n - the heading number, e.g., 1, 2

returns the minimum font size as an integer



73
74
75

# File 'lib/word-to-markdown/converter.rb', line 73

# Minimum font size required for a given heading level.
#
# The level is turned into a percentile: ((HEADING_DEPTH - 1) - n) steps of
# HEADING_STEP each, so lower heading numbers map to higher percentiles.
# The percentile is then looked up on font_sizes.
# NOTE(review): `percentile` is not a core Array method; presumably it is
# supplied by a statistics extension loaded elsewhere - confirm.
#
# n - the heading number, e.g., 1, 2
#
# Returns whatever font_sizes.percentile yields for that percentile.
def h(n)
  steps_from_bottom = (HEADING_DEPTH - 1) - n
  font_sizes.percentile(steps_from_bottom * HEADING_STEP)
end

#implicit_headings ⇒ `Object`

Returns an array of Nokogiri nodes that are implicit headings

# File 'lib/word-to-markdown/converter.rb', line 33

# Finds the styled elements that are large enough to be heading candidates.
#
# An element matching "[style]" is kept when its font_size is non-nil and
# not below MIN_HEADING_SIZE. The result is memoized in @implicit_headings.
#
# Returns an array of the matching elements, in document-query order.
def implicit_headings
  @implicit_headings ||= @document.tree.css("[style]").reject do |candidate|
    size = candidate.font_size
    size.nil? || size < MIN_HEADING_SIZE
  end
end

#remove_numbering_from_list_items! ⇒ `Object`

# File 'lib/word-to-markdown/converter.rb', line 102

# Strips literal list numbering (e.g., "1." or "a.") from list item spans.
#
# The spans are located with "li span span" when the soffice major version
# is "5" and with "li span" otherwise. In each span, every run of ASCII
# letters/digits followed by a period at the start of a line is deleted and
# the result is written back through inner_html=.
#
# Returns the collection produced by the search.
def remove_numbering_from_list_items!
  selector = if WordToMarkdown.soffice.major_version == "5"
               "li span span"
             else
               "li span"
             end

  @document.tree.search(selector).each do |item|
    item.inner_html = item.inner_html.gsub(/^[a-zA-Z0-9]+\./m, "")
  end
end

#remove_paragraphs_from_list_items! ⇒ `Object`



91
92
93

# File 'lib/word-to-markdown/converter.rb', line 91

# Renames every paragraph nested in a list item ("li p") to a span.
#
# Returns the collection produced by the search.
def remove_paragraphs_from_list_items!
  paragraphs = @document.tree.search("li p")
  paragraphs.each do |paragraph|
    paragraph.node_name = "span"
  end
end

#remove_paragraphs_from_tables! ⇒ `Object`



87
88
89

# File 'lib/word-to-markdown/converter.rb', line 87

# Renames every paragraph nested in a table cell ("td p") to a span.
#
# Returns the collection produced by the search.
def remove_paragraphs_from_tables!
  paragraphs = @document.tree.search("td p")
  paragraphs.each do |paragraph|
    paragraph.node_name = "span"
  end
end

#remove_unicode_bullets_from_list_items! ⇒ `Object`

# File 'lib/word-to-markdown/converter.rb', line 95

# Strips literal bullet characters from the start of list item spans.
#
# The spans are located with "li span span" when the soffice major version
# is "5" and with "li span" otherwise. In each span, any run of characters
# drawn from UNICODE_BULLETS at the start of a line is deleted and the
# result is written back through inner_html=.
#
# Returns the collection produced by the search.
def remove_unicode_bullets_from_list_items!
  selector = if WordToMarkdown.soffice.major_version == "5"
               "li span span"
             else
               "li span"
             end

  # The pattern only depends on the constant, so build it once up front.
  leading_bullets = /^([#{UNICODE_BULLETS.join("")}]+)/

  @document.tree.search(selector).each do |item|
    item.inner_html = item.inner_html.gsub(leading_bullets, "")
  end
end

#remove_whitespace_from_list_items! ⇒ `Object`



109
110
111

# File 'lib/word-to-markdown/converter.rb', line 109

# Trims leading and trailing whitespace from the markup of every list item
# span ("li span").
#
# The stripped string is assigned back through inner_html=. Calling strip!
# on the value returned by inner_html only mutates that returned String and
# never reaches the node, so the change would be lost.
#
# Returns the collection produced by the search.
def remove_whitespace_from_list_items!
  @document.tree.search("li span").each do |span|
    span.inner_html = span.inner_html.strip
  end
end

#semanticize_font_styles! ⇒ `Object`

# File 'lib/word-to-markdown/converter.rb', line 77

# Replaces presentational spans with semantic tags.
#
# Each "span" that reports bold? is renamed to "strong"; otherwise, one that
# reports italic? is renamed to "em". Bold takes precedence, and spans that
# are neither are left untouched.
#
# Returns the collection produced by the css query.
def semanticize_font_styles!
  @document.tree.css("span").each do |span|
    tag = if span.bold?
            "strong"
          elsif span.italic?
            "em"
          end
    span.node_name = tag if tag
  end
end

#semanticize_headings! ⇒ `Object`

Try to guess headings, where implicit, based on font size

# File 'lib/word-to-markdown/converter.rb', line 118

# Promotes implicit headings to real heading tags.
#
# Every element from implicit_headings is passed to guess_heading; when a
# tag name comes back the element is renamed to it, otherwise the element is
# left as it was.
#
# Returns the collection returned by implicit_headings.
def semanticize_headings!
  implicit_headings.each do |candidate|
    tag = guess_heading(candidate)
    next if tag.nil?

    candidate.node_name = tag
  end
end

#semanticize_table_headers! ⇒ `Object`



113
114
115

# File 'lib/word-to-markdown/converter.rb', line 113

# Turns the cells matched by "table tr:first td" into header cells by
# renaming each of them to "th".
#
# Returns the collection produced by the search.
def semanticize_table_headers!
  header_cells = @document.tree.search("table tr:first td")
  header_cells.each do |cell|
    cell.node_name = "th"
  end
end

Class: WordToMarkdown::Converter

Constant Summary collapse

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(document) ⇒ Converter

Instance Attribute Details

#document ⇒ Object (readonly)

Instance Method Details

#convert! ⇒ Object

#font_sizes ⇒ Object

#guess_heading(node) ⇒ Object

#h(n) ⇒ Object

#implicit_headings ⇒ Object

#remove_numbering_from_list_items! ⇒ Object

#remove_paragraphs_from_list_items! ⇒ Object

#remove_paragraphs_from_tables! ⇒ Object

#remove_unicode_bullets_from_list_items! ⇒ Object

#remove_whitespace_from_list_items! ⇒ Object

#semanticize_font_styles! ⇒ Object

#semanticize_headings! ⇒ Object

#semanticize_table_headers! ⇒ Object