Class: WordToMarkdown::Converter

Inherits:

Object

Object
WordToMarkdown::Converter

Defined in:: lib/word-to-markdown/converter.rb

Constant Summary collapse

HEADING_DEPTH = Number of headings to guess, e.g., h6

HEADING_STEP = Percentile step for each heading

100 / HEADING_DEPTH

MIN_HEADING_SIZE = Minimum heading size

UNICODE_BULLETS = Unicode bullets to strip when processing

['○', 'o', '●', "\u2022", '\\p{C}'].freeze

Instance Attribute Summary collapse

#document ⇒ Object readonly

Returns the value of attribute document.

Instance Method Summary collapse

#convert! ⇒ Object

Convert the document.
#font_sizes ⇒ Array<Integer>

An array of font-sizes for implicit headings in the document.
#guess_heading(node) ⇒ String?

Given a Nokogiri node, guess what heading it represents, if any.
#h(num) ⇒ Integer

Minimum font size required for a given heading e.g., H(2) would represent the minimum font size of an implicit h2.
#implicit_headings ⇒ Array<Nokogiri::Node>

Return an array of Nokogiri Nodes that are implicit headings.
#initialize(document) ⇒ Converter constructor

A new instance of Converter.
#remove_numbering_from_list_items! ⇒ Object

Remove prepended numbers from list items.
#remove_paragraphs_from_list_items! ⇒ Object

Remove top-level paragraphs from list items.
#remove_paragraphs_from_tables! ⇒ Object

Remove top-level paragraphs from table cells.
#remove_unicode_bullets_from_list_items! ⇒ Object

Remove prepended unicode bullets from list items.
#remove_whitespace_from_list_items! ⇒ Object

Remove whitespace from list items.
#semanticize_font_styles! ⇒ Object

Convert span-based font styles to `strong`s and `em`s.
#semanticize_headings! ⇒ Object

Try to guess headings where implicit, based on font size.
#semanticize_table_headers! ⇒ Object

Convert table headers to `th`s.

Constructor Details

#initialize(document) ⇒ `Converter`

Returns a new instance of Converter.

Parameters:

document (WordToMarkdown::Document) —

The document to convert



20
21
22

# File 'lib/word-to-markdown/converter.rb', line 20

# Create a converter for the given document.
#
# Only stores the reference in @document; no conversion work happens here.
#
# @param document [WordToMarkdown::Document] the document to convert
def initialize(document)
  @document = document
end

Instance Attribute Details

#document ⇒ `Object` (readonly)

Returns the value of attribute document.



5
6
7

# File 'lib/word-to-markdown/converter.rb', line 5

# Read-only accessor for the document this converter operates on.
#
# @return [Object] the value stored in @document by the constructor
def document
  @document
end

Instance Method Details

#convert! ⇒ `Object`

Convert the document

Note: this action is destructive!

# File 'lib/word-to-markdown/converter.rb', line 27

# Run every conversion pass, in a fixed order.
#
# Note: this action is destructive! The passes visible in this class
# rewrite nodes of @document.tree in place.
#
# @return [Object] whatever the final pass
#   (remove_numbering_from_list_items!) returns
def convert!
  # Fonts and headings
  semanticize_font_styles!
  semanticize_headings!

  # Tables
  remove_paragraphs_from_tables!
  semanticize_table_headers!

  # list items
  # Order matters here: bullets and whitespace are handled before the
  # numbering pass, whose pattern only matches at the start of a line.
  remove_paragraphs_from_list_items!
  remove_unicode_bullets_from_list_items!
  remove_whitespace_from_list_items!
  remove_numbering_from_list_items!
end

#font_sizes ⇒ `Array<Integer>`

Returns an array of font sizes for implicit headings in the document.

Returns:

(Array<Integer>) —

An array of font-sizes for implicit headings in the document

# File 'lib/word-to-markdown/converter.rb', line 55

# Distinct font sizes found on elements carrying a style attribute,
# rounded to the nearest ten and sorted ascending. Elements without a
# font size are ignored. The result is memoized in @font_sizes.
#
# @return [Array<Integer>] font sizes for implicit headings in the document
def font_sizes
  @font_sizes ||= @document.tree.css('[style]')
                           .map(&:font_size)
                           .compact
                           .map { |size| size.round(-1) }
                           .uniq
                           .sort
end

#guess_heading(node) ⇒ `String?`

Given a Nokogiri node, guess what heading it represents, if any

Parameters:

node (Nokogiri::Node) —

the Nokogiri node

Returns:

(String, nil) —

the heading tag (e.g., H1), or nil

# File 'lib/word-to-markdown/converter.rb', line 69

# Given a node, guess which heading tag it represents, if any.
#
# Levels are tried from 1 up to (but not including) HEADING_DEPTH; the
# first level whose minimum size (see #h) the node's font size meets wins.
#
# @param node [Nokogiri::Node] a node responding to #font_size
# @return [String, nil] the heading tag (e.g. "h1"), or nil when the node
#   has no font size or is smaller than every level's threshold
def guess_heading(node)
  size = node.font_size
  return if size.nil?

  level = (1...HEADING_DEPTH).find { |candidate| size >= h(candidate) }
  level && "h#{level}"
end

#h(num) ⇒ `Integer`

Minimum font size required for a given heading e.g., H(2) would represent the minimum font size of an implicit h2

Parameters:

num (Integer) —

the heading number, e.g., 1, 2

Returns:

(Integer) —

the minimum font size



83
84
85

# File 'lib/word-to-markdown/converter.rb', line 83

# Minimum font size required for a given heading level, e.g. h(2) is the
# minimum font size of an implicit h2.
#
# The level is mapped onto a percentile of #font_sizes: each level further
# from HEADING_DEPTH - 1 adds one HEADING_STEP.
# NOTE(review): Array#percentile is not core Ruby — presumably supplied by
# a statistics extension loaded elsewhere; confirm.
#
# @param num [Integer] the heading number, e.g. 1, 2
# @return [Integer] the minimum font size
def h(num)
  steps_above_smallest = (HEADING_DEPTH - 1) - num
  font_sizes.percentile(steps_above_smallest * HEADING_STEP)
end

#implicit_headings ⇒ `Array<Nokogiri::Node>`

Return an array of Nokogiri Nodes that are implicit headings

Returns:

(Array<Nokogiri::Node>) —

Return an array of Nokogiri Nodes that are implicit headings

# File 'lib/word-to-markdown/converter.rb', line 44

# Elements with a style attribute whose font size is at least
# MIN_HEADING_SIZE. Elements without a font size are skipped.
# The result is memoized in @implicit_headings.
#
# @return [Array<Nokogiri::Node>] nodes that are implicit headings
def implicit_headings
  @implicit_headings ||= @document.tree.css('[style]').reject do |node|
    size = node.font_size
    size.nil? || size < MIN_HEADING_SIZE
  end
end

#remove_numbering_from_list_items! ⇒ `Object`

Remove prepended numbers from list items

# File 'lib/word-to-markdown/converter.rb', line 117

# Remove prepended numbering (letters or digits followed by a period, at
# the start of a line) from list-item spans.
#
# The span selector depends on the soffice major version: version '5'
# uses one extra level of span nesting.
#
# @return [Object] the collection returned by the tree search
def remove_numbering_from_list_items!
  selector = if WordToMarkdown.soffice.major_version == '5'
               'li span span'
             else
               'li span'
             end

  @document.tree.search(selector).each do |node|
    node.inner_html = node.inner_html.gsub(/^[a-zA-Z0-9]+\./m, '')
  end
end

#remove_paragraphs_from_list_items! ⇒ `Object`

Remove top-level paragraphs from list items



104
105
106

# File 'lib/word-to-markdown/converter.rb', line 104

# Remove top-level paragraphs from list items by renaming every `p` found
# under an `li` to `span`.
#
# @return [Object] the collection returned by the tree search
def remove_paragraphs_from_list_items!
  @document.tree.search('li p').each do |paragraph|
    paragraph.node_name = 'span'
  end
end

#remove_paragraphs_from_tables! ⇒ `Object`

Remove top-level paragraphs from table cells



99
100
101

# File 'lib/word-to-markdown/converter.rb', line 99

# Remove top-level paragraphs from table cells by renaming every `p` found
# under a `td` to `span`.
#
# @return [Object] the collection returned by the tree search
def remove_paragraphs_from_tables!
  @document.tree.search('td p').each do |paragraph|
    paragraph.node_name = 'span'
  end
end

#remove_unicode_bullets_from_list_items! ⇒ `Object`

Remove prepended unicode bullets from list items

# File 'lib/word-to-markdown/converter.rb', line 109

# Remove prepended unicode bullets from list-item spans.
#
# Any run of UNICODE_BULLETS characters at the start of a line is removed.
# The span selector depends on the soffice major version: version '5'
# uses one extra level of span nesting.
#
# @return [Object] the collection returned by the tree search
def remove_unicode_bullets_from_list_items!
  selector = WordToMarkdown.soffice.major_version == '5' ? 'li span span' : 'li span'
  # The pattern is the same for every span, so build it once.
  leading_bullets = /^([#{UNICODE_BULLETS.join("")}]+)/

  @document.tree.search(selector).each do |node|
    node.inner_html = node.inner_html.gsub(leading_bullets, '')
  end
end

#remove_whitespace_from_list_items! ⇒ `Object`

Remove whitespace from list items



125
126
127

# File 'lib/word-to-markdown/converter.rb', line 125

# Remove leading and trailing whitespace from list-item spans.
#
# The stripped markup has to be assigned back: the inner_html getter hands
# out a freshly built String, so calling strip! on it (as this method used
# to) changed only that temporary copy and left the node untouched.
#
# @return [Object] the collection returned by the tree search
def remove_whitespace_from_list_items!
  @document.tree.search('li span').each do |span|
    html = span.inner_html
    trimmed = html.strip
    # Skip the write when nothing changed so untouched spans are not rewritten.
    span.inner_html = trimmed unless trimmed == html
  end
end

#semanticize_font_styles! ⇒ `Object`

Convert span-based font styles to `strong`s and `em`s

# File 'lib/word-to-markdown/converter.rb', line 88

# Convert span-based font styles to `strong`s and `em`s.
#
# Bold takes precedence: a span that is bold becomes `strong` and is not
# checked for italics. Spans that are neither are left alone.
#
# @return [Object] the collection returned by the tree query
def semanticize_font_styles!
  @document.tree.css('span').each do |span|
    tag = if span.bold?
            'strong'
          elsif span.italic?
            'em'
          end
    span.node_name = tag if tag
  end
end

#semanticize_headings! ⇒ `Object`

Try to guess headings where implicit, based on font size

# File 'lib/word-to-markdown/converter.rb', line 135

# Try to guess headings where implicit, based on font size.
#
# Each candidate from #implicit_headings is renamed to the tag returned by
# #guess_heading; candidates with no guess keep their current name.
#
# @return [Object] the collection returned by #implicit_headings
def semanticize_headings!
  implicit_headings.each do |node|
    tag = guess_heading(node)
    next if tag.nil?

    node.node_name = tag
  end
end

#semanticize_table_headers! ⇒ `Object`

Convert table headers to `th`s



130
131
132

# File 'lib/word-to-markdown/converter.rb', line 130

# Convert table headers to `th`s: every `td` in a table's first row is
# renamed to `th`.
#
# @return [Object] the collection returned by the tree search
def semanticize_table_headers!
  @document.tree.search('table tr:first td').each do |cell|
    cell.node_name = 'th'
  end
end

Class: WordToMarkdown::Converter

Constant Summary collapse

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(document) ⇒ Converter

Instance Attribute Details

#document ⇒ Object (readonly)

Instance Method Details

#convert! ⇒ Object

#font_sizes ⇒ Array<Integer>

#guess_heading(node) ⇒ String?

#h(num) ⇒ Integer

#implicit_headings ⇒ Array<Nokogiri::Node>

#remove_numbering_from_list_items! ⇒ Object

#remove_paragraphs_from_list_items! ⇒ Object

#remove_paragraphs_from_tables! ⇒ Object

#remove_unicode_bullets_from_list_items! ⇒ Object

#remove_whitespace_from_list_items! ⇒ Object

#semanticize_font_styles! ⇒ Object

#semanticize_headings! ⇒ Object

#semanticize_table_headers! ⇒ Object

#initialize(document) ⇒ `Converter`

#document ⇒ `Object` (readonly)

#convert! ⇒ `Object`

#font_sizes ⇒ `Array<Integer>`

#guess_heading(node) ⇒ `String?`

#h(num) ⇒ `Integer`

#implicit_headings ⇒ `Array<Nokogiri::Node>`

#remove_numbering_from_list_items! ⇒ `Object`

#remove_paragraphs_from_list_items! ⇒ `Object`

#remove_paragraphs_from_tables! ⇒ `Object`

#remove_unicode_bullets_from_list_items! ⇒ `Object`

#remove_whitespace_from_list_items! ⇒ `Object`

#semanticize_font_styles! ⇒ `Object`

#semanticize_headings! ⇒ `Object`

#semanticize_table_headers! ⇒ `Object`