Class: Onigiri::Document

Inherits:
Nokogiri::HTML::DocumentFragment
  • Object
show all
Defined in:
lib/onigiri.rb,
lib/onigiri/enclose_text.rb,
lib/onigiri/merge_by_tag.rb,
lib/onigiri/fix_backslash.rb,
lib/onigiri/hide_comments.rb,
lib/onigiri/show_body_only.rb,
lib/onigiri/drop_empty_paras.rb,
lib/onigiri/enclose_block_text.rb

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.parse(tags) ⇒ Object



24
25
26
27
28
29
# File 'lib/onigiri.rb', line 24

# Parses +tags+ after stripping whitespace that only exists for formatting.
#
# Line breaks are removed and any run of spaces or tabs sitting directly
# between two tags is dropped, so source indentation does not end up as
# stray text nodes in the resulting tree.
#
# @param tags [String, Object] markup to parse; a value that does not
#   respond to +gsub+ is forwarded to the superclass untouched
# @return [Object] whatever the superclass implementation of +parse+ returns
def parse(tags)
  if tags.respond_to?(:gsub)
    # Indentation may be written with tabs as well as spaces; neither carries data.
    tags = tags.gsub(/[\r\n]/, '').gsub(/>[ \t]+</, '><')
  end
  # Bare +super+ forwards the (possibly rewritten) value of +tags+.
  super
end

Instance Method Details

#drop_empty_paras ⇒ Object



5
6
7
8
9
10
11
# File 'lib/onigiri/drop_empty_paras.rb', line 5

# Builds a copy of the document in which every <p> element that has no
# child nodes has been removed. The receiver is not modified.
#
# @return [Object] the pruned duplicate
def drop_empty_paras
  dup.tap do |copy|
    copy.css('p').each do |para|
      next unless para.children.empty?

      para.remove
    end
  end
end

#enclose_block_text ⇒ Object



5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# File 'lib/onigiri/enclose_block_text.rb', line 5

# Returns a copy of the document in which every text node that is a direct
# child of a <noscript>, <form> or <blockquote> element is replaced by a
# <p> element holding the stripped text.
#
# The receiver is not modified.
#
# @return [Object] the rewritten duplicate
def enclose_block_text
  dupe = dup
  strict_tags = %w[noscript form blockquote]
  dupe.traverse do |elem|
    next unless strict_tags.include?(elem.name)

    elem.children.each do |target|
      next unless target.text?

      # Build the paragraph as a node and assign the text as content rather
      # than interpolating it into a markup string: +content+ is already
      # decoded, so text such as "a <b> c" would otherwise be re-parsed as
      # markup.
      paragraph = target.document.create_element('p')
      paragraph.content = target.content.strip
      target.add_previous_sibling(paragraph)
      target.unlink
    end
  end
  dupe
end

#enclose_text ⇒ Object



5
6
7
8
9
10
11
12
13
14
15
16
17
# File 'lib/onigiri/enclose_text.rb', line 5

# Returns a copy of the document in which top-level text nodes and inline
# elements are moved inside a <p> wrapper.
#
# "Top-level" means the children of <body> when one is present, otherwise
# the children of the document itself. The receiver is not modified.
#
# @return [Object] the rewritten duplicate
def enclose_text
  dupe = dup
  wrapper = Onigiri::Document.parse('<p>').child
  body = dupe.css('body').children
  body = dupe.children if body.empty?
  body.each do |target|
    next unless target.parent

    unless target.text?
      # +description+ can be nil (e.g. comments or tags without an HTML
      # element description); such nodes are simply left where they are.
      description = target.description
      next unless description && description.inline?
    end

    # NOTE(review): the same +wrapper+ object is handed over on every
    # iteration; whether it is copied or moved each time depends on the
    # node library's reparenting rules -- confirm before relying on it.
    wrap = target.add_previous_sibling(wrapper)
    wrap << target.unlink
  end
  dupe
end

#find_merger_elements(tag_name) ⇒ Object



33
34
35
36
37
38
39
40
# File 'lib/onigiri/merge_by_tag.rb', line 33

# Collects the +tag_name+ elements that are the only child of a parent
# carrying the same tag name.
#
# @param tag_name [String] tag name used both as the CSS selector and for
#   the parent-name comparison
# @return [Array] matching elements, in the order +css+ yields them
def find_merger_elements(tag_name)
  css(tag_name).select do |candidate|
    # A single entry in the parent's child list means the node has no siblings.
    candidate.parent.children.size == 1 && candidate.parent.name == tag_name
  end
end

#fix_backslash ⇒ Object



5
6
7
8
9
10
11
12
13
14
# File 'lib/onigiri/fix_backslash.rb', line 5

# Returns a copy of the document in which backslashes inside the +src+,
# +longdesc+, +href+ and +action+ attributes are replaced by forward
# slashes. The receiver is not modified.
#
# @return [Object] the rewritten duplicate
def fix_backslash
  url_attributes = %w[src longdesc href action]
  # One attribute selector per name, i.e. "[src], [longdesc], [href], [action]".
  selector = url_attributes.map { |name| "[#{name}]" }.join(', ')

  copy = dup
  copy.css(selector).each do |node|
    url_attributes.each do |name|
      value = node[name]
      node[name] = value.gsub("\\", "/") if value
    end
  end
  copy
end

#hide_comments ⇒ Object



5
6
7
8
9
# File 'lib/onigiri/hide_comments.rb', line 5

# Returns a copy of the document with every node whose class is exactly
# Nokogiri::XML::Comment removed. The receiver is not modified.
#
# @return [Object] the duplicate without comment nodes
def hide_comments
  copy = dup
  copy.traverse do |node|
    node.remove if node.instance_of?(Nokogiri::XML::Comment)
  end
  copy
end

#merge_by_tag(tag_name) ⇒ Object

This is going to be ugly



16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# File 'lib/onigiri/merge_by_tag.rb', line 16

# Returns a copy of the document in which chains of singly nested
# +tag_name+ elements are collapsed into their outermost element.
#
# For each element reported by +find_merger_elements+, the chain above it
# is walked with +singular_upverse+; the element's children are appended to
# the chain's root, the recorded deletion node is removed, and the
# accumulated +class+ / +style+ values are written onto the root.
# The receiver is not modified.
#
# @param tag_name [String] tag whose nested chains should be merged
# @return [Object] the rewritten duplicate
def merge_by_tag(tag_name)
  copy = dup
  copy.find_merger_elements(tag_name).each do |merger|
    merged = singular_upverse(merger)
    root = merged['root']

    # Re-home the surviving content before the intermediate chain is dropped.
    merger.children.each { |survivor| root << survivor }
    merged['deletion_node'].remove

    %w[class style].each do |attribute|
      root[attribute] = merged[attribute] if merged[attribute]
    end
  end
  copy
end

#merge_divs ⇒ Object



7
8
9
# File 'lib/onigiri/merge_by_tag.rb', line 7

# Shorthand for +merge_by_tag+ applied to <div> elements.
#
# @return [Object] the result of +merge_by_tag('div')+
def merge_divs
  merge_by_tag('div')
end

#merge_spans ⇒ Object



11
12
13
# File 'lib/onigiri/merge_by_tag.rb', line 11

# Shorthand for +merge_by_tag+ applied to <span> elements.
#
# @return [Object] the result of +merge_by_tag('span')+
def merge_spans
  merge_by_tag('span')
end

#show_body_only ⇒ Object



5
6
7
8
9
10
11
# File 'lib/onigiri/show_body_only.rb', line 5

# Returns a document holding only the contents of <body>.
#
# When the document has no <body>, a plain duplicate is returned. Otherwise
# a fresh, empty document is created and the body's children are appended
# to it.
#
# The children are taken from a duplicate of the receiver: appending a node
# with +<<+ reparents it, so taking them from the receiver itself would
# strip its <body> as a side effect.
#
# @return [Object] a duplicate (no <body>) or a new document with the
#   body's children
def show_body_only
  source = dup
  body = source.css('body')
  return source if body.empty?

  result = Onigiri::Document.parse("")
  body.children.each { |child| result << child }
  result
end

#singular_upverse(node) ⇒ Object



42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/onigiri/merge_by_tag.rb', line 42

# Walks upwards from +node+ through a chain of same-named parents in which
# each link is an only child, and gathers what is needed to collapse it.
#
# The returned hash uses string keys:
# * 'root'          - the topmost element of the chain
# * 'deletion_node' - the chain element directly below the root (absent
#                     when +node+ itself is the root)
# * 'class'         - class values from root down to +node+, space separated
# * 'style'         - style values from root down to +node+, joined with "; "
#
# 'class' and 'style' are nil when no element in the chain carries them.
#
# @param node [Object] element to start from; must respond to +parent+,
#   +name+, +next_sibling+, +previous_sibling+ and +[]+
# @return [Hash] the collected data described above
def singular_upverse(node)
  parent = node.parent
  if parent.name == node.name && !(node.next_sibling || node.previous_sibling)
    data = singular_upverse(parent)
    # The first node seen below the root becomes the deletion point; nodes
    # further down must not overwrite it.
    data['deletion_node'] ||= node if data['root']
  else
    data = { 'root' => node }
  end

  inherited_style = data['style']
  own_style = node['style']
  data['style'] =
    if inherited_style && own_style
      # Declarations need a ";" between them; drop any trailing separator
      # first so that exactly one is emitted.
      head = inherited_style.sub(/[;\s]+\z/, '')
      head.empty? ? own_style : "#{head}; #{own_style}"
    else
      inherited_style || own_style
    end

  classes = [data['class'], node['class']].compact
  data['class'] = classes.empty? ? nil : classes.join(' ')

  data
end