Class: FeedMe::NokogiriUtil

Inherits:
Object
  • Object
show all
Defined in:
lib/nokogiri-util.rb

Instance Method Summary collapse

Instance Method Details

#clean_html(html) ⇒ Object

Sanitize HTML. TODO: dup code to fix bugs.



107
108
109
# File 'lib/nokogiri-util.rb', line 107

# Sanitizes an HTML string.
#
# Hands +html+ to Sanitize.clean without passing any explicit configuration
# and returns whatever that call produces.
#
# @param html [String] markup to sanitize
# @return [Object] the result of Sanitize.clean
def clean_html(html)
  sanitized = Sanitize.clean(html)
  sanitized
end

#strip_html(html) ⇒ Object

Strip all tags from HTML, returning only the text content.



96
97
98
# File 'lib/nokogiri-util.rb', line 96

# Removes all markup from an HTML string.
#
# Parses +html+ with Nokogiri::HTML and returns the parsed document's
# inner text.
#
# @param html [String] markup to strip
# @return [String] the text content of the parsed document
def strip_html(html)
  document = Nokogiri::HTML(html)
  document.inner_text
end

#strip_truncate_html(html, words = 15, truncate_string = '...') ⇒ Object

Strip tags from HTML and truncate the resulting text to a given number of words.



101
102
103
# File 'lib/nokogiri-util.rb', line 101

# Strips all tags from +html+ and truncates the remaining text to at most
# +words+ whitespace-separated words.
#
# The text is obtained from strip_html, split on whitespace, and re-joined
# with single spaces. +truncate_string+ is appended only when words were
# actually dropped, so text that already fits is returned without a marker.
#
# @param html [String] markup to strip and truncate
# @param words [Integer] maximum number of words to keep (values below 0 are treated as 0)
# @param truncate_string [String] marker appended when the text was shortened
# @return [String] the stripped, possibly truncated text
def strip_truncate_html(html, words=15, truncate_string='...')
  tokens = strip_html(html).split
  limit = [words, 0].max

  # Already short enough: return the text unchanged, with no marker.
  return tokens.join(' ') if tokens.length <= limit

  # first(limit) keeps exactly +limit+ words; an inclusive 0..words range
  # would keep one word too many.
  tokens.first(limit).join(' ') + truncate_string
end

#truncate_html(text, num_words = 15, truncate_string = "...") ⇒ Object

Truncate HTML while preserving tags



11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# File 'lib/nokogiri-util.rb', line 11

# Truncates HTML to at most +num_words+ words while preserving its tags.
#
# +text+ is parsed with Nokogiri::HTML and the node tree is walked depth-first
# in document order, counting whitespace-separated words in every text node.
# When a text node pushes the running count past +num_words+, that node is cut
# down to the words still allowed, +truncate_string+ is appended, and every
# node that follows it in the document is removed. If the document never
# exceeds the limit it is returned unmodified (no marker is appended).
#
# Note: words inside the text node that gets cut are re-joined with single
# spaces, so leading/trailing whitespace in that node is not kept.
#
# @param text [String] HTML to truncate
# @param num_words [Integer] maximum number of words to keep
# @param truncate_string [String] marker appended at the point of truncation
# @return [String] inner HTML of the first child of the parsed document's root
#   (Nokogiri wraps the input in html/body elements, which are not wanted),
#   or an empty string when the parsed document has no such node
def truncate_html(text, num_words=15, truncate_string="...")
  # Parse the +text+ argument (the method has no +html+ local).
  doc = Nokogiri::HTML(text)
  current = doc.children.first
  # Guard in case the parsed document contains no nodes at all.
  return '' if current.nil?

  previous = nil # last text node seen that contained at least one word
  count = 0      # running total of words seen so far

  loop do
    # we found a text node
    if current.is_a?(Nokogiri::XML::Text)
      words_here = current.text.split.length
      count += words_here
      # we reached our limit, let's get outta here!
      break if count > num_words
      # Whitespace-only nodes are skipped so that a later fallback to
      # +previous+ lands on real text rather than inter-element whitespace.
      previous = current if words_here > 0
    end

    if current.children.length > 0
      # this node has children, can't be a text node,
      # let's descend and look for text nodes
      current = current.children.first
    elsif !current.next.nil?
      # this has no children, but has a sibling, let's check it out
      current = current.next
    else
      # we are the last child, we need to ascend until we are
      # either done or find a sibling to continue on to
      n = current
      while !n.is_a?(Nokogiri::HTML::Document) && n.parent.next.nil?
        n = n.parent
      end

      # we've reached the top and found no more text nodes, stop walking
      break if n.is_a?(Nokogiri::HTML::Document)
      current = n.parent.next
    end
  end

  if count > num_words
    new_content = current.text.split

    # The last text node we counted pushed us past the limit. Without it we
    # had (count - new_content.length) words, so we may still take
    # num_words - (count - new_content.length) words from it; subtract one
    # more to turn that quantity into a zero-based end index.
    #
    # Example: <p>Testing this HTML truncater.</p><p>To see if its working.</p>
    # with num_words = 6. The first paragraph has 4 words, the second brings
    # the count to 9. 6 - (9 - 5) - 1 = 1, so words 0..1 ("To see") are kept:
    # <p>Testing this HTML truncater.</p><p>To see...</p>
    index = num_words - (count - new_content.length) - 1
    if index >= 0
      current.content = new_content[0..index].join(' ') + truncate_string
    elsif previous
      # Nothing may be taken from this node: the limit was met exactly by the
      # earlier text, so mark the truncation on the previous text node.
      current = previous
      current.content = current.content + truncate_string
    else
      # Only reachable when num_words <= 0 (no earlier text exists):
      # keep just the marker in place of this node's text.
      current.content = truncate_string
    end
  end

  if count >= num_words
    # remove everything that follows +current+ in document order: its later
    # siblings, then the later siblings of each ancestor up to the document
    until current.is_a?(Nokogiri::HTML::Document)
      current.next.remove until current.next.nil?
      current = current.parent
    end
  end

  # now we grab the html and not the text.
  # we take the first child of the root because nokogiri adds html and body
  # tags which we don't want
  root = doc.root
  container = root && root.children.first
  container ? container.inner_html : ''
end