Class: Webpage
- Inherits: Object (hierarchy: Object → Webpage)
- Defined in:
- lib/webpage.rb
Instance Attribute Summary
- #nokogiri ⇒ Object (readonly): Returns the value of attribute nokogiri.
Instance Method Summary
- #[](tag) ⇒ Object
- #canonical ⇒ Object
- #comments ⇒ Object
- #description ⇒ Object
- #h1 ⇒ Object
- #initialize(body, options = {}) ⇒ Webpage (constructor): A new instance of Webpage.
- #keywords ⇒ Object
- #link_to?(target_uri) ⇒ Boolean
- #link_to_host?(host) ⇒ Boolean
- #links ⇒ Object
- #links_to_different_domain ⇒ Object
- #links_to_different_host ⇒ Object
- #nodes_with(key) ⇒ Object
- #text ⇒ Object
- #title ⇒ Object
Constructor Details
#initialize(body, options = {}) ⇒ Webpage
Returns a new instance of Webpage.
5 6 7 8 9 10 11 12 13 14 15 16 |
# File 'lib/webpage.rb', line 5
#
# Builds a Webpage by handing +body+ to Nokogiri::HTML and recording the
# optional page address and domain.
#
# NOTE(review): the extracted listing had lost tokens (the +options+ parameter
# name and whatever followed `unless body`). The parameter name is restored
# from the documented signature `initialize(body, options = {})`; confirm the
# rest against lib/webpage.rb.
#
# @param body [Object] markup passed to Nokogiri::HTML; must not be nil/false
# @param options [Hash] optional settings read here:
#   :uri    - page address, run through fuzzy_uri; must be absolute
#   :domain - stored as-is in @domain
# @raise [ArgumentError] when body is nil or false
# @raise [RuntimeError] when :uri is given but the resulting URI is not absolute
def initialize(body, options = {})
  # The comma matters: `raise ArgumentError 'msg'` is parsed as a call to a
  # method named ArgumentError and fails with NoMethodError instead.
  raise ArgumentError, 'body cannot be empty' unless body

  #@body = @body.force_encoding(@options[:encoding]).encode("UTF-8", :invalid => :replace, :undef => :replace, :replace => "") if @options.has_key?:encoding
  @nokogiri = Nokogiri::HTML(body)

  if options.has_key?(:uri)
    @uri = fuzzy_uri(options[:uri])
    raise '@uri should be absolute' unless @uri.absolute?
    @host = @uri.host
  end

  @domain = options[:domain]
end
Instance Attribute Details
#nokogiri ⇒ Object (readonly)
Returns the value of attribute nokogiri.
4 5 6 |
# File 'lib/webpage.rb', line 4
#
# Reader for the @nokogiri instance variable.
#
# @return [Object] whatever is currently stored in @nokogiri
def nokogiri
  @nokogiri
end
Instance Method Details
#[](tag) ⇒ Object
33 34 35 36 37 38 |
# File 'lib/webpage.rb', line 33
#
# Runs an XPath query against @nokogiri chosen from a short tag name.
# Three names are aliases for specific head elements; any other value is
# interpolated directly into "//<tag>".
#
# @param tag [String] 'canonical', 'keywords', 'description', or an element name
# @return [Object] the result of @nokogiri.xpath for the chosen query
def [](tag)
  query =
    case tag
    when 'canonical'   then "//link[@rel='canonical']"
    when 'keywords'    then "//meta[@name='keywords']"
    when 'description' then "//meta[@name='description']"
    else "//#{tag}"
    end

  @nokogiri.xpath(query)
end
#canonical ⇒ Object
29 30 31 |
# File 'lib/webpage.rb', line 29
#
# Returns the href of the first canonical link found via self['canonical'].
#
# @return [String, nil] the 'href' value of the first match, or nil when the
#   lookup yields no nodes (previously this raised NoMethodError on nil)
def canonical
  first_match = self['canonical'].first
  first_match && first_match['href']
end
#comments ⇒ Object
87 88 89 |
# File 'lib/webpage.rb', line 87
#
# Collects every node matched by "//comment()" as a string (via to_s) and
# drops those whose lower-cased text starts with '[if ie' or contains
# 'google' or 'baidu'.
#
# NOTE(review): if to_s yields the serialized "<!-- ... -->" form, the
# '[if ie' prefix test can never match — confirm against Nokogiri's behavior.
#
# @return [Array<String>] the remaining comment strings
def comments
  @nokogiri.xpath("//comment()").map(&:to_s).reject do |markup|
    lowered = markup.downcase
    lowered.start_with?('[if ie') || lowered.include?('google') || lowered.include?('baidu')
  end
end
#description ⇒ Object
48 49 50 |
# File 'lib/webpage.rb', line 48
#
# Concatenates the 'content' attribute of every node matched by
# "//meta[@name='description']". The result is memoized in @description.
#
# The extracted listing had lost the block variable (`map{||['content']}`),
# which would have produced the literal word "content" once per node; the
# block now reads each node's 'content' attribute.
#
# @return [String] joined content values; "" when nothing matches
def description
  @description ||= @nokogiri.xpath("//meta[@name='description']").map { |meta| meta['content'] }.join
end
#h1 ⇒ Object
17 18 19 |
# File 'lib/webpage.rb', line 17
#
# Text of the nodes matched by "//h1".
#
# @return [Object] the value of #text on the "//h1" query result
def h1
  headings = @nokogiri.xpath("//h1")
  headings.text
end
#keywords ⇒ Object
44 45 46 |
# File 'lib/webpage.rb', line 44
#
# Reads the 'content' attribute of every node matched by
# "//meta[@name='keywords']", joins the values and splits the result on ','.
# The list is memoized in @keywords.
#
# The extracted listing had lost the block variable (`map{||['content']}`),
# which would have yielded the literal word "content"; the block now reads
# each node's 'content' attribute.
#
# Entries are not stripped, so "a, b" yields ["a", " b"].
#
# @return [Array<String>] keyword strings; [] when nothing matches
def keywords
  @keywords ||= @nokogiri.xpath("//meta[@name='keywords']").map { |meta| meta['content'] }.join.split(',')
end
#link_to?(target_uri) ⇒ Boolean
58 59 60 61 62 63 64 65 |
# File 'lib/webpage.rb', line 58
#
# Tells whether any entry of #links has an 'href' that, after fuzzy_uri,
# equals fuzzy_uri(target_uri).
#
# Uses value equality (==). The previous `equal?` compared object identity,
# so two separately built URIs never matched and the result was always false.
#
# hrefs are compared as written; they are not made absolute first (a
# make_href_absolute variant was commented out in the original listing).
#
# @param target_uri [Object] address to look for; passed to fuzzy_uri
# @return [Boolean]
def link_to?(target_uri)
  wanted = fuzzy_uri(target_uri)
  links.any? { |link| fuzzy_uri(link['href'].to_s) == wanted }
end
#link_to_host?(host) ⇒ Boolean
67 68 69 |
# File 'lib/webpage.rb', line 67
#
# Tells whether any entry of #links has an 'href' whose host (per fuzzy_uri)
# equals +host+.
#
# Reads the 'href' attribute like every other link helper here. The previous
# code read a 'uri' attribute, so the parsed host was always empty.
#
# @param host [String] host name to compare against
# @return [Boolean]
def link_to_host?(host)
  links.any? { |link| fuzzy_uri(link['href'].to_s).host == host }
end
#links ⇒ Object
52 53 54 55 56 |
# File 'lib/webpage.rb', line 52
#
# Gathers the nodes matched by "//a" followed by those matched by "//area"
# into one flat array, memoized in @links.
#
# @return [Array] matched nodes, all "a" matches before all "area" matches
def links
  @links ||= %w(a area).flat_map { |tag| @nokogiri.xpath("//#{tag}").to_a }
end
#links_to_different_domain ⇒ Object
80 81 82 83 84 85 86 |
# File 'lib/webpage.rb', line 80
#
# Selects the entries of #links whose 'href' host does not end with @domain.
# The result is memoized in @links_to_different_domain.
#
# Changes from the original listing:
# * Filters with `reject` instead of `delete_if`. `delete_if` edited the
#   memoized #links array in place, so later calls to #links (and the other
#   link helpers) silently lost entries.
# * Links without a host (relative hrefs) are dropped, matching
#   #links_to_different_host; previously they were reported as pointing to a
#   different domain.
#
# NOTE(review): the suffix test also treats "notexample.com" as part of
# "example.com" — confirm whether a label boundary should be required.
#
# @return [Array] link nodes that have a host outside @domain
# @raise [RuntimeError] when @domain is nil or false
def links_to_different_domain
  raise '@domain cannot be empty' unless @domain

  @links_to_different_domain ||= links.reject do |link|
    link_host = fuzzy_uri(link['href'].to_s).host.to_s
    link_host.empty? || link_host.end_with?(@domain)
  end
end
#links_to_different_host ⇒ Object
71 72 73 74 75 76 77 78 |
# File 'lib/webpage.rb', line 71
#
# Selects the entries of #links whose 'href' has a non-empty host different
# from @host. The result is memoized in @links_to_different_host.
#
# Filters with `reject` instead of `delete_if`. `delete_if` edited the
# memoized #links array in place, so later calls to #links (and the other
# link helpers) silently lost entries.
#
# @return [Array] link nodes pointing at another host
# @raise [RuntimeError] when @host is nil or false
def links_to_different_host
  raise '@host cannot be empty' unless @host

  @links_to_different_host ||= links.reject do |link|
    link_host = fuzzy_uri(link['href'].to_s).host
    link_host.to_s.empty? || link_host == @host
  end
end
#nodes_with(key) ⇒ Object
40 41 42 |
# File 'lib/webpage.rb', line 40
#
# Runs the XPath attribute query "//@<key>" against @nokogiri.
#
# @param key [Object] attribute name, interpolated into the query as-is
# @return [Object] the result of @nokogiri.xpath for that query
def nodes_with(key)
  attribute_query = "//@#{key}"
  @nokogiri.xpath(attribute_query)
end
#text ⇒ Object
24 25 26 27 |
# File 'lib/webpage.rb', line 24
#
# Text of the nodes matched by "//text()".
#
# @return [Object] the value of #text on the "//text()" query result
def text
  text_nodes = @nokogiri.xpath("//text()")
  text_nodes.text
  #return body.gsub(/<\/?[^>]*>/, "")
end
#title ⇒ Object
20 21 22 |
# File 'lib/webpage.rb', line 20
#
# Text of the nodes matched by "//title".
#
# @return [Object] the value of #text on the "//title" query result
def title
  title_nodes = @nokogiri.xpath("//title")
  title_nodes.text
end