Class: Webpage

Inherits:
Object
  • Object
show all
Defined in:
lib/webpage.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(body, options = {}) ⇒ Webpage

Returns a new instance of Webpage.



5
6
7
8
9
10
11
12
13
14
15
16
# File 'lib/webpage.rb', line 5

# Builds a Webpage around an HTML document.
#
# @param body [String] HTML source handed to Nokogiri::HTML
# @param options [Hash] optional settings
# @option options [Object] :uri address of the page; it is passed through
#   fuzzy_uri and the result must be absolute. Its host is kept in @host.
# @option options [String] :domain stored in @domain
# @raise [ArgumentError] if body is nil or false
# @raise [RuntimeError] if the :uri option does not yield an absolute URI
def initialize(body, options = {})
    # The comma is required: `raise ArgumentError 'msg'` is parsed as a call to
    # a method named ArgumentError and ends in NoMethodError instead.
    raise ArgumentError, 'body cannot be empty' unless body

    @options = options
    #@body = @body.force_encoding(@options[:encoding]).encode("UTF-8", :invalid => :replace, :undef => :replace, :replace => "") if @options.has_key?:encoding
    @nokogiri = Nokogiri::HTML(body)
    if options.key?(:uri)
        @uri = fuzzy_uri(@options[:uri])
        raise '@uri should be absolute' unless @uri.absolute?
        @host = @uri.host
    end
    @domain = options[:domain]
end

Instance Attribute Details

#nokogiri ⇒ Object (readonly)

Returns the value of attribute nokogiri.



4
5
6
# File 'lib/webpage.rb', line 4

# Reader for the @nokogiri instance variable, which the constructor assigns
# from Nokogiri::HTML(body).
#
# @return [Object] the current value of @nokogiri
def nokogiri
  @nokogiri
end

Instance Method Details

#[](tag) ⇒ Object



33
34
35
36
37
38
# File 'lib/webpage.rb', line 33

# Runs an XPath query chosen from a shorthand name or a raw tag name.
#
# The names 'canonical', 'keywords' and 'description' map to the matching
# <link>/<meta> query; anything else is interpolated as "//<tag>".
#
# @param tag [String] shorthand name or element name
# @return [Object] whatever @nokogiri.xpath returns for the chosen query
def [](tag)
    query = case tag
            when 'canonical' then "//link[@rel='canonical']"
            when 'keywords' then "//meta[@name='keywords']"
            when 'description' then "//meta[@name='description']"
            else "//#{tag}"
            end
    @nokogiri.xpath(query)
end

#canonical ⇒ Object



29
30
31
# File 'lib/webpage.rb', line 29

# Reads the href of the first node returned by self['canonical'].
#
# @return [Object, nil] the value of that node's 'href' entry, or nil when the
#   lookup returns no nodes
def canonical
    link = self['canonical'].first
    # Pages without a canonical link yield no node; answer nil instead of
    # calling [] on nil.
    link && link['href']
end

#comments ⇒ Object



87
88
89
# File 'lib/webpage.rb', line 87

# Collects the page's comment nodes as strings, leaving out IE conditional
# comments and any comment mentioning "google" or "baidu" (case-insensitive).
#
# @return [Array<String>] the remaining comments, each as returned by to_s
def comments
    rendered = @nokogiri.xpath("//comment()").map { |comment| comment.to_s }
    rendered.reject do |comment|
        lowered = comment.downcase
        # Nokogiri's Node#to_s serialises a comment with its "<!--" delimiter,
        # so the bare "[if ie" prefix alone would never match; accept both.
        lowered.start_with?('[if ie', '<!--[if ie') ||
            lowered.include?('google') ||
            lowered.include?('baidu')
    end
end

#description ⇒ Object



48
49
50
# File 'lib/webpage.rb', line 48

# Concatenates the 'content' value of every <meta name="description"> node.
# The result is memoized in @description.
#
# @return [String] the joined content values (empty string when none exist)
def description
    return @description if @description

    contents = @nokogiri.xpath("//meta[@name='description']").map { |meta| meta['content'] }
    @description = contents.flatten.join
end

#h1 ⇒ Object



17
18
19
# File 'lib/webpage.rb', line 17

# Text of the nodes matched by the "//h1" XPath query.
#
# @return [Object] the result of calling text on the query result
def h1
    headings = @nokogiri.xpath('//h1')
    headings.text
end

#keywords ⇒ Object



44
45
46
# File 'lib/webpage.rb', line 44

# Keywords declared in the page's <meta name="keywords"> nodes.
# The result is memoized in @keywords.
#
# Each node's 'content' value is split on commas separately, so keywords from
# different nodes are never fused together. Surrounding whitespace is stripped
# and empty entries are dropped.
#
# @return [Array<String>] individual keywords in the order they appear
def keywords
    @keywords ||= @nokogiri.xpath("//meta[@name='keywords']").
        flat_map { |meta| meta['content'].to_s.split(',') }.
        map(&:strip).
        reject(&:empty?)
end

Returns:

  • (Boolean)


58
59
60
61
62
63
64
65
# File 'lib/webpage.rb', line 58

# Tells whether any link on the page points at target_uri.
#
# Both the link's href and target_uri are passed through fuzzy_uri and the
# results are compared with ==. The previous equal? was an object-identity
# test, which cannot succeed for two separately built values.
#
# @param target_uri [Object] address to look for; accepted forms are whatever
#   fuzzy_uri accepts
# @return [Boolean] true when at least one link matches
def link_to?(target_uri)
    links.any? do |link|
        fuzzy_uri(link['href'].to_s) == fuzzy_uri(target_uri)
    end
end

Returns:

  • (Boolean)


67
68
69
# File 'lib/webpage.rb', line 67

# Tells whether any link on the page points at the given host.
#
# The link target is read from the 'href' attribute, like every other link
# helper in this class; the previous 'uri' key is not an attribute of a/area
# elements, so the check could never match.
#
# @param host [String] host name to compare against each link's host
# @return [Boolean] true when at least one link's host equals host
def link_to_host?(host)
    links.any? { |link| fuzzy_uri(link['href'].to_s).host == host }
end


52
53
54
55
56
# File 'lib/webpage.rb', line 52

# All <a> nodes followed by all <area> nodes of the document, as one flat
# array. The result is memoized in @links.
#
# @return [Array] matched nodes, grouped by tag in the order a, area
def links
    return @links if @links

    node_sets = %w(a area).map { |tag| @nokogiri.xpath("//#{tag}") }
    @links = node_sets.flatten
end


80
81
82
83
84
85
86
# File 'lib/webpage.rb', line 80

# Links whose host does not end with @domain. The result is memoized in
# @links_to_different_domain.
#
# Uses reject rather than delete_if so the array memoized by #links is left
# untouched; delete_if removed the same-domain links from it in place.
#
# NOTE(review): links without a host (e.g. relative hrefs) are kept in the
# result, and the match is a plain suffix test on the host string — confirm
# both are intended.
#
# @return [Array] links considered to point outside @domain
# @raise [RuntimeError] if @domain is nil or false
def links_to_different_domain
    raise '@domain cannot be empty' unless @domain
    @links_to_different_domain ||= links.reject do |link|
        host = fuzzy_uri(link['href'].to_s).host
        host && host.end_with?(@domain)
    end
end


71
72
73
74
75
76
77
78
# File 'lib/webpage.rb', line 71

# Links whose host is present and differs from @host. The result is memoized
# in @links_to_different_host.
#
# Uses reject rather than delete_if so the array memoized by #links is left
# untouched; delete_if removed the same-host and host-less links from it in
# place.
#
# @return [Array] links pointing at a host other than @host
# @raise [RuntimeError] if @host is nil or false
def links_to_different_host
    raise '@host cannot be empty' unless @host
    @links_to_different_host ||= links.reject do |link|
        host = fuzzy_uri(link['href'].to_s).host.to_s
        # Host-less links (e.g. relative hrefs) stay on the current host.
        host.empty? || host == @host
    end
end

#nodes_with(key) ⇒ Object



40
41
42
# File 'lib/webpage.rb', line 40

# Runs the XPath query "//@<key>" against the document, i.e. selects
# attributes named key anywhere in the tree.
#
# @param key [#to_s] attribute name interpolated into the query
# @return [Object] whatever @nokogiri.xpath returns for that query
def nodes_with(key)
    attribute_query = "//@#{key}"
    @nokogiri.xpath(attribute_query)
end

#text ⇒ Object



24
25
26
27
# File 'lib/webpage.rb', line 24

# Text of every text node in the document, taken from the "//text()" XPath
# query.
#
# @return [Object] the result of calling text on the query result
def text
    text_nodes = @nokogiri.xpath('//text()')
    text_nodes.text
end

#title ⇒ Object



20
21
22
# File 'lib/webpage.rb', line 20

# Text of the nodes matched by the "//title" XPath query.
#
# @return [Object] the result of calling text on the query result
def title
    title_nodes = @nokogiri.xpath('//title')
    title_nodes.text
end