Class: Tspider::Page
- Inherits:
-
Object
- Object
- Tspider::Page
- Defined in:
- lib/tspider/page.rb
Instance Attribute Summary collapse
-
#doc ⇒ Object
readonly
Returns the value of attribute doc.
-
#headers ⇒ Object
readonly
Returns the value of attribute headers.
-
#html ⇒ Object
readonly
Returns the value of attribute html.
-
#location ⇒ Object
readonly
Returns the value of attribute location.
-
#response ⇒ Object
readonly
Returns the value of attribute response.
-
#response_time ⇒ Object
readonly
Returns the value of attribute response_time.
-
#status ⇒ Object
readonly
Returns the value of attribute status.
-
#url ⇒ Object
readonly
Returns the value of attribute url.
Instance Method Summary collapse
- #canonical ⇒ Object
-
#initialize(url, attrs) ⇒ Page
constructor
A new instance of Page.
- #links ⇒ Object
- #meta(name) ⇒ Object
- #meta_description ⇒ Object
- #meta_keywords ⇒ Object
- #meta_robots ⇒ Object
- #opf ⇒ Object
- #response_header(key) ⇒ Object
- #safe_search(search_value, select_path = [], return_content = false) ⇒ Object
- #title ⇒ Object
Constructor Details
#initialize(url, attrs) ⇒ Page
Returns a new instance of Page.
10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 |
# File 'lib/tspider/page.rb', line 10
#
# Fetches +url+ through Client.get and stores the response, its timing, its
# status code, its headers and the parsed HTML on the instance.
#
# @param url [String] address of the page to fetch
# @param attrs [Hash] options forwarded to Client.get; :user_agent overrides
#   ::Tspider::UA::DEFAULT and :headers, when present, are sent with the request
# @return [Page]
def initialize(url, attrs)
  @url = url
  @uri = URI(@url)
  @user_agent = attrs[:user_agent] || ::Tspider::UA::DEFAULT
  @webrobots = WebRobots.new(@user_agent)
  @debug = false

  # Keep any caller-supplied headers; the User-Agent entry is always ours.
  request_headers = (attrs[:headers] || {}).merge('User-Agent' => @user_agent)

  # Monotonic clock: the measured duration cannot be skewed by wall-clock changes.
  started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  r = Client.get(@url, attrs.merge(:headers => request_headers))
  @response_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started

  @response = r
  @status = r.response.code.to_i
  # to_s tolerates a nil body; the non-bang encode leaves the response's own
  # body string untouched while still replacing invalid UTF-8 byte sequences.
  @html = r.body.to_s.encode('UTF-8', 'UTF-8', :invalid => :replace)
  @doc = Nokogiri::HTML(@html)
  @location = r.headers['location']
  @headers = r.headers.to_hash
end
Instance Attribute Details
#doc ⇒ Object (readonly)
Returns the value of attribute doc.
8 9 10 |
# File 'lib/tspider/page.rb', line 8
#
# Read-only accessor.
#
# @return [Object] the value currently held in @doc
def doc
  @doc
end
#headers ⇒ Object (readonly)
Returns the value of attribute headers.
8 9 10 |
# File 'lib/tspider/page.rb', line 8
#
# Read-only accessor.
#
# @return [Object] the value currently held in @headers
def headers
  @headers
end
#html ⇒ Object (readonly)
Returns the value of attribute html.
8 9 10 |
# File 'lib/tspider/page.rb', line 8
#
# Read-only accessor.
#
# @return [Object] the value currently held in @html
def html
  @html
end
#location ⇒ Object (readonly)
Returns the value of attribute location.
8 9 10 |
# File 'lib/tspider/page.rb', line 8
#
# Read-only accessor.
#
# @return [Object] the value currently held in @location
def location
  @location
end
#response ⇒ Object (readonly)
Returns the value of attribute response.
8 9 10 |
# File 'lib/tspider/page.rb', line 8
#
# Read-only accessor.
#
# @return [Object] the value currently held in @response
def response
  @response
end
#response_time ⇒ Object (readonly)
Returns the value of attribute response_time.
8 9 10 |
# File 'lib/tspider/page.rb', line 8
#
# Read-only accessor.
#
# @return [Object] the value currently held in @response_time
def response_time
  @response_time
end
#status ⇒ Object (readonly)
Returns the value of attribute status.
8 9 10 |
# File 'lib/tspider/page.rb', line 8
#
# Read-only accessor.
#
# @return [Object] the value currently held in @status
def status
  @status
end
#url ⇒ Object (readonly)
Returns the value of attribute url.
8 9 10 |
# File 'lib/tspider/page.rb', line 8
#
# Read-only accessor.
#
# @return [Object] the value currently held in @url
def url
  @url
end
Instance Method Details
#canonical ⇒ Object
91 92 93 |
# File 'lib/tspider/page.rb', line 91
#
# Looks up the href attribute of the first element matching
# link[@rel="canonical"].
#
# @return [Object, nil] whatever safe_search yields for that path
def canonical
  safe_search('link[@rel="canonical"]', [0, 'href'])
end
#links ⇒ Object
72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
# File 'lib/tspider/page.rb', line 72
#
# Describes every element matched by the 'a' selector.
#
# Each entry carries the raw href, the element content, the rel attribute and
# a :disallow flag. The flag is the result of @webrobots.disallowed? for links
# on the same host as the page, and nil for links on other hosts or for hrefs
# that cannot be parsed as a URI.
#
# @return [Array<Hash>] one hash per anchor with keys :href, :text, :rel, :disallow
def links
  # URI.escape was removed in Ruby 3.0; this parser provides the same escaping.
  escaper = URI::RFC2396_Parser.new

  safe_search('a').map do |a|
    href = a['href']
    text = a.content
    rel = a['rel']

    disallow =
      begin
        target = @uri.merge(escaper.escape(href.to_s))
        target.host == @uri.host ? @webrobots.disallowed?(target.to_s) : nil
      rescue URI::Error
        # A single malformed href must not abort the whole collection.
        nil
      end

    {href: href, text: text, rel: rel, disallow: disallow}
  end
end
#meta(name) ⇒ Object
64 65 66 |
# File 'lib/tspider/page.rb', line 64
#
# Looks up the content attribute of the first <meta> element whose name
# attribute equals +name+.
#
# @param name [String] value of the meta element's name attribute
# @return [Object, nil] whatever safe_search yields for that path
def meta(name)
  # The value is quoted, as in the other meta selectors, so that names
  # containing characters such as ':' still form a valid selector.
  safe_search("meta[@name=\"#{name}\"]", [0, 'content'])
end
#meta_description ⇒ Object
52 53 54 |
# File 'lib/tspider/page.rb', line 52
#
# Looks up the content attribute of the first element matching
# meta[@name="description"].
#
# @return [Object, nil] whatever safe_search yields for that path
def meta_description
  safe_search('meta[@name="description"]', [0, 'content'])
end
#meta_keywords ⇒ Object
56 57 58 |
# File 'lib/tspider/page.rb', line 56
#
# Looks up the content attribute of the first element matching
# meta[@name="keywords"].
#
# @return [Object, nil] whatever safe_search yields for that path
def meta_keywords
  safe_search('meta[@name="keywords"]', [0, 'content'])
end
#meta_robots ⇒ Object
60 61 62 |
# File 'lib/tspider/page.rb', line 60
#
# Looks up the content attribute of the first element matching
# meta[@name="robots"].
#
# @return [Object, nil] whatever safe_search yields for that path
def meta_robots
  safe_search('meta[@name="robots"]', [0, 'content'])
end
#opf ⇒ Object
30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 |
# File 'lib/tspider/page.rb', line 30
#
# Bundles the page's stored attributes and derived values into a single hash.
#
# NOTE(review): h1, h2 and h3 are called here but are not defined in the
# visible part of the class; presumably they live elsewhere in it. Confirm.
#
# @return [Hash] keys :url, :status, :location, :response_time, :canonical,
#   :title, :meta_keywords, :meta_description, :meta_robots, :h1, :h2, :h3,
#   :links and :headers
def opf
  {
    :url => @url,
    :status => @status,
    # Reads @location (lowercase): @Location is never assigned, so it was always nil.
    :location => @location,
    :response_time => @response_time,
    :canonical => canonical,
    :title => title,
    :meta_keywords => meta_keywords,
    :meta_description => meta_description,
    :meta_robots => meta_robots,
    :h1 => h1,
    :h2 => h2,
    :h3 => h3,
    :links => links,
    :headers => @headers
  }
end
#response_header(key) ⇒ Object
68 69 70 |
# File 'lib/tspider/page.rb', line 68
#
# Returns the values stored under +key+ in @headers, joined with '|'.
#
# @param key [Object] key to look up in @headers
# @return [String, nil] the joined values, or nil when @headers has no such key
def response_header(key)
  values = @headers[key]
  return nil if values.nil?

  # Array() lets a single non-array value pass through join unchanged.
  Array(values).join('|')
end
#safe_search(search_value, select_path = [], return_content = false) ⇒ Object
105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 |
# File 'lib/tspider/page.rb', line 105
#
# Runs @doc.search(search_value) and then walks +select_path+, indexing into
# the current value with each key in turn.
#
# @param search_value [String] selector handed to @doc.search
# @param select_path [Array] keys applied one after another with []
# @param return_content [Boolean] when true, return the final value's content
#   instead of the value itself
# @return [Object, nil] the selected value (or its content); nil when any step
#   of the path cannot be followed or the final value is nil
def safe_search(search_value, select_path=[], return_content=false)
  value = @doc.search(search_value).dup

  select_path.each do |key|
    # An explicit capability check replaces rescuing NoMethodError, so real
    # errors raised inside a [] implementation are no longer hidden.
    return nil unless value.respond_to?(:[])

    value = value[key]
  end

  return nil if value.nil?

  return_content ? value.content : value
end
#title ⇒ Object
48 49 50 |
# File 'lib/tspider/page.rb', line 48
#
# Looks up the content of the first element matching 'title'.
#
# @return [Object, nil] whatever safe_search yields for that path
def title
  safe_search('title', [0], true)
end