Class: FeedMe::NokogiriUtil

Inherits:

Object

Object
FeedMe::NokogiriUtil

show all

Defined in: lib/nokogiri-util.rb

Instance Method Summary collapse

#clean_html(html) ⇒ Object

Sanitize HTML. TODO: dup code to fix bugs.
#strip_html(html) ⇒ Object

strip all tags from HTML.
#strip_truncate_html(html, words = 15, truncate_string = '...') ⇒ Object

strip tags from HTML and truncate to a certain number of words.
#truncate_html(text, num_words = 15, truncate_string = "...") ⇒ Object

Truncate HTML while preserving tags.

Instance Method Details

#clean_html(html) ⇒ `Object`

Sanitize HTML. TODO: dup code to fix bugs.



107
108
109

# File 'lib/nokogiri-util.rb', line 107

# Sanitize an HTML string by handing it to the Sanitize library.
#
# TODO: dup code to fix bugs (note carried over from the original summary).
#
# @param html [String] markup passed unchanged to +Sanitize.clean+
# @return [Object] whatever +Sanitize.clean+ returns for +html+
def clean_html(html)
  sanitized = Sanitize.clean(html)
  sanitized
end

#strip_html(html) ⇒ `Object`

strip all tags from HTML



96
97
98

# File 'lib/nokogiri-util.rb', line 96

# Strip all tags from an HTML string, leaving only its text.
#
# @param html [String] markup to parse with Nokogiri
# @return [Object] the +inner_text+ of the parsed document
def strip_html(html)
  document = Nokogiri::HTML(html)
  document.inner_text
end

#strip_truncate_html(html, words = 15, truncate_string = '...') ⇒ `Object`

strip tags from HTML and truncate to a certain number of words



101
102
103

# File 'lib/nokogiri-util.rb', line 101

# Strip tags from HTML and truncate the remaining text to a number of words.
#
# The text is split on whitespace and re-joined with single spaces, so runs
# of whitespace in the source collapse to one space in the result.
#
# @param html [String] markup handed to +strip_html+
# @param words [Integer] maximum number of words to keep (negative values
#   are treated as zero)
# @param truncate_string [String] suffix appended only when words were
#   actually dropped
# @return [String] the plain text, shortened to at most +words+ words
def strip_truncate_html(html, words=15, truncate_string='...')
  all_words = strip_html(html).split
  limit = [words, 0].max

  # Nothing to cut: return the text as-is, without a misleading suffix.
  return all_words.join(' ') if all_words.length <= limit

  # first(limit) keeps exactly +limit+ words; the old inclusive range
  # [0..words] kept one word too many.
  all_words.first(limit).join(' ') + truncate_string
end

#truncate_html(text, num_words = 15, truncate_string = "...") ⇒ `Object`

Truncate HTML while preserving tags

# File 'lib/nokogiri-util.rb', line 11

# Truncate HTML to a number of words while preserving the surrounding tags.
#
# The parsed document is walked depth-first in document order, counting the
# whitespace-separated words of every text node. When a text node pushes the
# running total past +num_words+, that node is shortened (or, if none of its
# words fit, the previous text node gets the suffix instead) and every node
# that follows it in the document is removed.
#
# Example: with 6 words requested,
#   <p>Testing this HTML truncater.</p><p>To see if its working.</p>
# becomes
#   <p>Testing this HTML truncater.</p><p>To see...</p>
#
# @param text [String] HTML markup to truncate
# @param num_words [Integer] maximum number of words to keep
# @param truncate_string [String] suffix added where the text was cut
# @return [String] inner HTML of the first child of the document root
#   (the wrapper element Nokogiri adds), or '' when there is nothing to render
def truncate_html(text, num_words=15, truncate_string="...")
  # Parse the +text+ argument; the previous code referenced an undefined
  # local named +html+ here and raised NameError on every call.
  doc = Nokogiri::HTML(text)
  return '' if doc.root.nil?

  current = doc.children.first
  # Declared before the loop so it stays visible after the block ends.
  previous = nil
  count = 0

  loop do
    # We found a text node: add its words to the running total.
    if current.is_a?(Nokogiri::XML::Text)
      count += current.text.split.length
      # We went past the limit; +current+ is the node that must be cut.
      break if count > num_words
      previous = current
    end

    if !current.children.empty?
      # This node has children, so descend and look for text nodes.
      current = current.children.first
    elsif !current.next.nil?
      # No children, but there is a sibling to continue with.
      current = current.next
    else
      # Last child: climb until an ancestor has a following sibling, or
      # until the document itself is reached.
      n = current
      while !n.is_a?(Nokogiri::HTML::Document) && n.parent.next.nil?
        n = n.parent
      end

      # Reached the top without finding more nodes: traversal is done.
      break if n.is_a?(Nokogiri::HTML::Document)

      current = n.parent.next
    end
  end

  if count >= num_words
    unless count == num_words
      node_words = current.text.split

      # Words counted before this node = count - node_words.length, so this
      # is how many words of the current node still fit under the limit.
      keep = num_words - (count - node_words.length)

      if keep > 0
        current.content = node_words.first(keep).join(' ') + truncate_string
      elsif previous
        # Nothing from this node fits: end the text at the previous text node.
        current = previous
        current.content = current.content + truncate_string
      else
        # No earlier text node exists (e.g. num_words is 0), so the first
        # text node is reduced to just the suffix.
        current.content = truncate_string
      end
    end

    # Remove everything that follows the cut point, at every ancestor level.
    until current.is_a?(Nokogiri::HTML::Document)
      current.next.remove until current.next.nil?
      current = current.parent
    end
  end

  # Return the inner HTML of the root's first child rather than the whole
  # document, because Nokogiri adds html and body tags which we don't want.
  wrapper = doc.root.children.first
  wrapper ? wrapper.inner_html : ''
end

Class: FeedMe::NokogiriUtil

Instance Method Summary collapse

Instance Method Details

#clean_html(html) ⇒ Object

#strip_html(html) ⇒ Object

#strip_truncate_html(html, words = 15, truncate_string = '...') ⇒ Object

#truncate_html(text, num_words = 15, truncate_string = "...") ⇒ Object

#clean_html(html) ⇒ `Object`

#strip_html(html) ⇒ `Object`

#strip_truncate_html(html, words = 15, truncate_string = '...') ⇒ `Object`

#truncate_html(text, num_words = 15, truncate_string = "...") ⇒ `Object`