Class: Tspider::Page
- Inherits:
-
Object
- Object
- Tspider::Page
- Defined in:
- lib/tspider/page.rb
Instance Attribute Summary collapse
-
#doc ⇒ Object
readonly
Returns the value of attribute doc.
-
#html ⇒ Object
readonly
Returns the value of attribute html.
-
#location ⇒ Object
readonly
Returns the value of attribute location.
-
#response ⇒ Object
readonly
Returns the value of attribute response.
-
#response_time ⇒ Object
readonly
Returns the value of attribute response_time.
-
#status ⇒ Object
readonly
Returns the value of attribute status.
-
#url ⇒ Object
readonly
Returns the value of attribute url.
Instance Method Summary collapse
- #canonical ⇒ Object
-
#initialize(attrs) ⇒ Page
constructor
A new instance of Page.
- #links ⇒ Object
- #meta_description ⇒ Object
- #meta_keywords ⇒ Object
- #meta_robots ⇒ Object
- #opf ⇒ Object
- #title ⇒ Object
Constructor Details
#initialize(attrs) ⇒ Page
Returns a new instance of Page.
10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 |
# File 'lib/tspider/page.rb', line 10
# Fetches +attrs[:url]+ once and stores the response, timing, status, body,
# parsed document, redirect target and headers on the new instance.
#
# @param attrs [Hash] reads +:url+ (the address to fetch) and +:user_agent+
#   (sent as the User-Agent header and handed to WebRobots)
# @raise [URI::InvalidURIError] when +attrs[:url]+ cannot be parsed by Kernel#URI
def initialize(attrs)
  @url = attrs[:url]
  @uri = URI(@url)
  @user_agent = attrs[:user_agent]
  @webrobots = WebRobots.new(@user_agent)
  @debug = false

  # Measure with the monotonic clock: wall-clock time (Time.now) can be
  # adjusted mid-request and yield a negative or inflated duration.
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  r = Client.get(@url, :headers => {"User-Agent" => @user_agent})
  @response_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at

  @response = r
  @status = r.response.code.to_i

  # NOTE(review): Client is not shown here; if it can return a nil body
  # (e.g. for body-less responses) the in-place encode! below would raise,
  # so fall back to an empty string.
  body = r.body || String.new
  # In-place re-encode so invalid UTF-8 byte sequences are replaced.
  @html = body.encode!('UTF-8', 'UTF-8', :invalid => :replace)
  @doc = Nokogiri::HTML(@html)
  @location = r.headers['location']
  @headers = r.headers.to_hash
end
Instance Attribute Details
#doc ⇒ Object (readonly)
Returns the value of attribute doc.
8 9 10 |
# File 'lib/tspider/page.rb', line 8
# Reader for @doc, which the constructor sets to Nokogiri::HTML(@html).
#
# @return [Object] the current value of @doc
def doc
  @doc
end
#html ⇒ Object (readonly)
Returns the value of attribute html.
8 9 10 |
# File 'lib/tspider/page.rb', line 8
# Reader for @html, which the constructor sets to the response body
# re-encoded as UTF-8 with invalid sequences replaced.
#
# @return [Object] the current value of @html
def html
  @html
end
#location ⇒ Object (readonly)
Returns the value of attribute location.
8 9 10 |
# File 'lib/tspider/page.rb', line 8
# Reader for @location, which the constructor sets from the response's
# 'location' header.
#
# @return [Object] the current value of @location
def location
  @location
end
#response ⇒ Object (readonly)
Returns the value of attribute response.
8 9 10 |
# File 'lib/tspider/page.rb', line 8
# Reader for @response, which the constructor sets to the object returned
# by Client.get.
#
# @return [Object] the current value of @response
def response
  @response
end
#response_time ⇒ Object (readonly)
Returns the value of attribute response_time.
8 9 10 |
# File 'lib/tspider/page.rb', line 8
# Reader for @response_time, which the constructor sets to the time that
# elapsed around the Client.get call.
#
# @return [Object] the current value of @response_time
def response_time
  @response_time
end
#status ⇒ Object (readonly)
Returns the value of attribute status.
8 9 10 |
# File 'lib/tspider/page.rb', line 8
# Reader for @status, which the constructor sets to the response code
# converted with to_i.
#
# @return [Object] the current value of @status
def status
  @status
end
#url ⇒ Object (readonly)
Returns the value of attribute url.
8 9 10 |
# File 'lib/tspider/page.rb', line 8
# Reader for @url, which the constructor sets from attrs[:url].
#
# @return [Object] the current value of @url
def url
  @url
end
Instance Method Details
#canonical ⇒ Object
83 84 85 |
# File 'lib/tspider/page.rb', line 83
# Queries the document for the <link rel="canonical"> element through
# safe_search and returns whatever that helper yields.
#
# NOTE(review): safe_search is not visible here; the [0, 'href'] argument
# presumably selects the first match's href attribute — confirm.
#
# @return [Object] the result of safe_search
def canonical
  selector = 'link[@rel="canonical"]'
  lookup = [0, 'href']
  safe_search(selector, lookup)
end
#links ⇒ Object
64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
# File 'lib/tspider/page.rb', line 64
# Collects every <a> element found by safe_search('a') as a hash with the
# keys :href, :text, :rel and :disallow.
#
# :disallow is the result of @webrobots.disallowed? when the resolved link
# points at the same host as the page, and nil otherwise (different host,
# or an href that cannot be resolved against the page URL).
#
# @return [Array<Hash>] one entry per anchor, in the order safe_search yields them
def links
  found = []
  safe_search('a').each do |a|
    href = a['href']
    disallow =
      begin
        # URI.escape was removed in Ruby 3.0; the parser's #escape is the
        # equivalent call and also exists on older Rubies.
        target = @uri.merge(URI::DEFAULT_PARSER.escape(href.to_s))
        target.host == @uri.host ? @webrobots.disallowed?(target.to_s) : nil
      rescue URI::InvalidURIError
        # One malformed href must not abort the scan of the whole page.
        nil
      end
    found << {href: href, text: a.content.strip, rel: a['rel'], disallow: disallow}
  end
  found
end
#meta_description ⇒ Object
52 53 54 |
# File 'lib/tspider/page.rb', line 52
# Queries the document for <meta name="description"> through safe_search
# and returns whatever that helper yields.
#
# NOTE(review): safe_search is not visible here; the [0, 'content'] argument
# presumably selects the first match's content attribute — confirm.
#
# @return [Object] the result of safe_search
def meta_description
  safe_search('meta[@name="description"]',[0,'content'])
end
#meta_keywords ⇒ Object
56 57 58 |
# File 'lib/tspider/page.rb', line 56
# Queries the document for <meta name="keywords"> through safe_search
# and returns whatever that helper yields.
#
# NOTE(review): safe_search is not visible here; the [0, 'content'] argument
# presumably selects the first match's content attribute — confirm.
#
# @return [Object] the result of safe_search
def meta_keywords
  safe_search('meta[@name="keywords"]',[0,'content'])
end
#meta_robots ⇒ Object
60 61 62 |
# File 'lib/tspider/page.rb', line 60
# Queries the document for <meta name="robots"> through safe_search
# and returns whatever that helper yields.
#
# NOTE(review): safe_search is not visible here; the [0, 'content'] argument
# presumably selects the first match's content attribute — confirm.
#
# @return [Object] the result of safe_search
def meta_robots
  safe_search('meta[@name="robots"]',[0,'content'])
end
#opf ⇒ Object
30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 |
# File 'lib/tspider/page.rb', line 30
# Builds a single hash summarising the page: the fetch details stored by the
# constructor plus the values of the extraction methods (canonical, title,
# meta_*, h1..h3, links).
#
# NOTE(review): h1, h2 and h3 are called here but are not listed on this
# page — presumably private helpers; confirm they exist.
#
# @return [Hash] symbol-keyed summary of the page
def opf
  {:url => @url,
   :status => @status,
   # The constructor assigns @location; @Location was never set, so this
   # entry used to be nil for every page.
   :location => @location,
   :response_time => @response_time,
   :canonical => canonical,
   :title => title,
   :meta_keywords => meta_keywords,
   :meta_description => meta_description,
   :meta_robots => meta_robots,
   :h1 => h1,
   :h2 => h2,
   :h3 => h3,
   :links => links,
   :headers => @headers
  }
end
#title ⇒ Object
48 49 50 |
# File 'lib/tspider/page.rb', line 48
# Queries the document for the <title> element through safe_search and
# returns whatever that helper yields.
#
# NOTE(review): safe_search is not visible here. If it treats 'content' as
# an attribute name (as 'href' is used for canonical), a <title> element has
# no such attribute — verify this returns the title text.
#
# @return [Object] the result of safe_search
def title
  selector = 'title'
  lookup = [0, 'content']
  safe_search(selector, lookup)
end