Class: Onigiri::Document
- Inherits:
-
Nokogiri::HTML::DocumentFragment
- Object
- Nokogiri::HTML::DocumentFragment
- Onigiri::Document
- Defined in:
- lib/onigiri.rb,
lib/onigiri/enclose_text.rb,
lib/onigiri/merge_by_tag.rb,
lib/onigiri/fix_backslash.rb,
lib/onigiri/hide_comments.rb,
lib/onigiri/show_body_only.rb,
lib/onigiri/drop_empty_paras.rb,
lib/onigiri/enclose_block_text.rb
Class Method Summary collapse
Instance Method Summary collapse
- #drop_empty_paras ⇒ Object
- #enclose_block_text ⇒ Object
- #enclose_text ⇒ Object
- #find_merger_elements(tag_name) ⇒ Object
- #fix_backslash ⇒ Object
- #hide_comments ⇒ Object
-
#merge_by_tag(tag_name) ⇒ Object
This is going to be ugly.
- #merge_divs ⇒ Object
- #merge_spans ⇒ Object
- #show_body_only ⇒ Object
- #singular_upverse(node) ⇒ Object
Class Method Details
.parse(tags) ⇒ Object
24 25 26 27 28 29 |
# File 'lib/onigiri.rb', line 24
# Strips formatting whitespace from +tags+ and hands the result to the
# superclass +parse+.
#
# Line breaks (\r and \n) and runs of spaces between adjacent tags do not
# represent any data but mess up the resulting tree, so they are removed
# first. Input that does not respond to +gsub+ is passed on unchanged.
#
# @param tags [String, Object] markup to parse
# @return [Object] whatever the superclass +parse+ returns
def parse(tags)
  # Remove formatting whitespaces
  # Those do not represent any data while messing up the tree
  tags = tags.gsub(/(\r|\n)/, '').gsub(/> *</, '><') if tags.respond_to? :gsub
  # Bare super forwards the (possibly reassigned) tags argument.
  super
end
Instance Method Details
#drop_empty_paras ⇒ Object
5 6 7 8 9 10 11 |
# File 'lib/onigiri/drop_empty_paras.rb', line 5
# Removes every <p> element that has no children.
#
# Works on a +dup+ of the receiver and returns that copy.
#
# @return [Object] the modified copy
def drop_empty_paras
  dup.tap do |copy|
    childless = copy.css('p').select { |para| para.children.empty? }
    childless.each(&:remove)
  end
end
#enclose_block_text ⇒ Object
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 |
# File 'lib/onigiri/enclose_block_text.rb', line 5
# Wraps text found directly inside <noscript>, <form> and <blockquote>
# elements in a <p> element.
#
# Works on a +dup+ of the receiver and returns that copy. Each text child is
# replaced by a <p> holding the stripped text.
#
# @return [Object] the modified copy
def enclose_block_text
  dupe = dup
  # Names of the elements whose direct text children get wrapped.
  block_parents = {"noscript" => 1, "form" => 1, "blockquote" => 1}
  dupe.traverse do |elem|
    next unless block_parents[elem.name]

    elem.children.each do |target|
      next unless target.text?

      # Build the paragraph as a node and assign the text as content rather
      # than interpolating it into an HTML string: the text is already
      # decoded, so characters such as "<" or "&" must not be re-parsed as
      # markup.
      paragraph = Nokogiri::XML::Node.new('p', target.document)
      text = target.content.strip
      paragraph.content = text unless text.empty?
      target.add_previous_sibling(paragraph)
      target.unlink
    end
  end
  dupe
end
#enclose_text ⇒ Object
5 6 7 8 9 10 11 12 13 14 15 16 17 |
# File 'lib/onigiri/enclose_text.rb', line 5
# Moves top-level text nodes and inline elements into a <p> wrapper.
#
# Works on a +dup+ of the receiver and returns that copy. The nodes
# considered are the children of <body> when one is present, otherwise the
# children of the copy itself.
#
# @return [Object] the modified copy
def enclose_text
  dupe = dup
  wrapper = Onigiri::Document.parse('<p>').child
  body = dupe.css('body').children
  body = dupe.children if body.empty?
  body.each do |target|
    next unless target.parent

    unless target.text?
      # description may be nil (e.g. for comments or unknown tags); such
      # nodes are simply left alone instead of raising NoMethodError.
      description = target.description
      next unless description && description.inline?
    end

    # NOTE(review): the same wrapper node is passed on every iteration, so
    # whether separate runs of inline content end up in one <p> or several
    # depends on how Nokogiri reparents it - confirm this is intended.
    wrap = target.add_previous_sibling(wrapper)
    wrap << target.unlink
  end
  dupe
end
#find_merger_elements(tag_name) ⇒ Object
33 34 35 36 37 38 39 40 |
# File 'lib/onigiri/merge_by_tag.rb', line 33
# Collects the +tag_name+ elements that are the only child of a parent
# with the same tag name.
#
# @param tag_name [String] element name, also used as the CSS selector
# @return [Array] matching elements, in the order +css+ yields them
def find_merger_elements(tag_name)
  css(tag_name).select do |elem|
    parent = elem.parent
    # "Only child" is checked via the parent's child count, which is
    # equivalent to the element having no previous or next sibling.
    parent.name == tag_name && parent.children.size == 1
  end
end
#fix_backslash ⇒ Object
5 6 7 8 9 10 11 12 13 14 |
# File 'lib/onigiri/fix_backslash.rb', line 5
# Replaces backslashes with forward slashes in the src, longdesc, href and
# action attributes.
#
# Works on a +dup+ of the receiver and returns that copy.
#
# @return [Object] the modified copy
def fix_backslash
  attrset = ['src', 'longdesc', 'href', 'action']
  # Selects every element carrying at least one of the attributes.
  selector = attrset.map { |name| "[#{name}]" }.join(', ')

  copy = dup
  copy.css(selector).each do |element|
    attrset.each do |name|
      value = element[name]
      element[name] = value.gsub("\\", "/") if value
    end
  end
  copy
end
#hide_comments ⇒ Object
5 6 7 8 9 |
# File 'lib/onigiri/hide_comments.rb', line 5
# Removes every node whose class is exactly Nokogiri::XML::Comment.
#
# Works on a +dup+ of the receiver and returns that copy.
#
# @return [Object] the modified copy
def hide_comments
  copy = dup
  copy.traverse do |node|
    next unless node.instance_of?(Nokogiri::XML::Comment)

    node.remove
  end
  copy
end
#merge_by_tag(tag_name) ⇒ Object
This is going to be ugly
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 |
# File 'lib/onigiri/merge_by_tag.rb', line 16
# Collapses chains of singly-nested +tag_name+ elements into the outermost
# element of each chain.
#
# Works on a +dup+ of the receiver and returns that copy.
#
# @param tag_name [String] name of the elements to merge
# @return [Object] the modified copy
def merge_by_tag(tag_name)
  copy = dup
  # First pass: find the nested elements that have to be merged upwards.
  copy.find_merger_elements(tag_name).each do |merger|
    # Second pass: walk up from the merger, gathering attributes on the way.
    merged = singular_upverse(merger)
    root = merged['root']

    # Rescue the merger's children into the root before the chain is dropped.
    merger.children.each { |survivor| root << survivor }
    merged['deletion_node'].remove

    %w[class style].each do |attr|
      root[attr] = merged[attr] if merged[attr]
    end
  end
  copy
end
#merge_divs ⇒ Object
7 8 9 |
# File 'lib/onigiri/merge_by_tag.rb', line 7
# Shorthand for merge_by_tag with the 'div' tag.
#
# @return [Object] the result of merge_by_tag('div')
def merge_divs
  merge_by_tag('div')
end
#merge_spans ⇒ Object
11 12 13 |
# File 'lib/onigiri/merge_by_tag.rb', line 11
# Shorthand for merge_by_tag with the 'span' tag.
#
# @return [Object] the result of merge_by_tag('span')
def merge_spans
  merge_by_tag('span')
end
#show_body_only ⇒ Object
5 6 7 8 9 10 11 |
# File 'lib/onigiri/show_body_only.rb', line 5
# Returns a document holding only the children of <body>.
#
# When there is no <body>, a plain +dup+ of the receiver is returned.
#
# @return [Object] a new document with the body's children, or a copy of the
#   receiver when it has no <body>
def show_body_only
  source = dup
  body = source.css('body')
  return source if body.empty?

  result = Onigiri::Document.parse("")
  # << reparents each node into result, so the children are taken from the
  # copy rather than from the receiver, leaving the receiver intact.
  body.children.each do |child|
    result << child
  end
  result
end
#singular_upverse(node) ⇒ Object
42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
# File 'lib/onigiri/merge_by_tag.rb', line 42
# Walks up from +node+ through parents that share its name, for as long as
# each node on the way has no siblings, and gathers merge data.
#
# @param node [#parent, #name, #next_sibling, #previous_sibling, #[]]
#   element to start from
# @return [Hash] with string keys:
#   'root'          - topmost element of the chain
#   'deletion_node' - element directly below the root (absent when +node+
#                     itself is the root)
#   'style'/'class' - attribute values gathered from the root down to
#                     +node+, or nil when no element in the chain had one
def singular_upverse(node)
  parent = node.parent
  if parent && parent.name == node.name && !(node.next_sibling || node.previous_sibling)
    data = singular_upverse(parent)
    # The recursion unwinds from the root downwards, so the first node that
    # gets here is the one right below the root; later ones must not
    # overwrite it.
    data['deletion_node'] ||= node
  else
    data = { 'root' => node }
  end

  %w[style class].each do |attr|
    inherited = data[attr]
    own = node[attr]
    if inherited && own
      # Style declarations need a ';' between them. Add one only when the
      # value gathered so far does not already end with it.
      needs_semicolon = attr == 'style' && (inherited =~ /[^;\s]\s*\z/)
      data[attr] = "#{inherited}#{needs_semicolon ? '; ' : ' '}#{own}"
    else
      data[attr] = inherited || own
    end
  end
  data
end