Class: Tspider::Page

Inherits:
Object
  • Object
show all
Defined in:
lib/tspider/page.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, attrs) ⇒ Page

Returns a new instance of Page.



10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# File 'lib/tspider/page.rb', line 10

# Fetches +url+ and stores the response details that this page exposes.
#
# @param url [String] address to request; it is also parsed with URI()
# @param attrs [Hash] request options passed on to Client.get. :user_agent
#   overrides the default agent string, and :headers may carry additional
#   request headers.
def initialize(url, attrs)
  @url = url
  @uri = URI(@url)
  @user_agent = attrs[:user_agent] || ::Tspider::UA::DEFAULT
  @webrobots = WebRobots.new(@user_agent)
  @debug = false

  # Add the User-Agent to any caller-supplied headers rather than replacing
  # the whole :headers hash, which used to drop them silently.
  request_headers = (attrs[:headers] || {}).merge("User-Agent" => @user_agent)

  # A monotonic clock keeps the measurement unaffected by wall-clock changes.
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  r = Client.get(@url, attrs.merge(:headers => request_headers))
  @response_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
  @response = r
  @status = r.response.code.to_i

  # Clean a copy so the response's own body string is left untouched, and
  # treat a missing body as an empty document. Invalid UTF-8 bytes are replaced.
  @html = r.body.to_s.encode('UTF-8', 'UTF-8', :invalid => :replace)
  @doc = Nokogiri::HTML(@html)
  @location = r.headers['location']
  @headers = r.headers.to_hash
end

Instance Attribute Details

#doc ⇒ Object (readonly)

Returns the value of attribute doc.



8
9
10
# File 'lib/tspider/page.rb', line 8

# Parsed document that the constructor builds from the response body with
# Nokogiri::HTML.
def doc
  @doc
end

#headers ⇒ Object (readonly)

Returns the value of attribute headers.



8
9
10
# File 'lib/tspider/page.rb', line 8

# Response headers, stored by the constructor as the result of calling
# to_hash on the response's headers.
def headers
  @headers
end

#html ⇒ Object (readonly)

Returns the value of attribute html.



8
9
10
# File 'lib/tspider/page.rb', line 8

# Response body as UTF-8 text; the constructor replaces invalid byte
# sequences before storing it.
def html
  @html
end

#location ⇒ Object (readonly)

Returns the value of attribute location.



8
9
10
# File 'lib/tspider/page.rb', line 8

# Value the constructor read from the response's 'location' header
# (presumably nil when the response carries no such header).
def location
  @location
end

#response ⇒ Object (readonly)

Returns the value of attribute response.



8
9
10
# File 'lib/tspider/page.rb', line 8

# The object returned by Client.get for this page's URL, kept as received.
def response
  @response
end

#response_time ⇒ Object (readonly)

Returns the value of attribute response_time.



8
9
10
# File 'lib/tspider/page.rb', line 8

# Elapsed time of the Client.get call made by the constructor, in seconds.
def response_time
  @response_time
end

#status ⇒ Object (readonly)

Returns the value of attribute status.



8
9
10
# File 'lib/tspider/page.rb', line 8

# Response status code, converted to an Integer by the constructor.
def status
  @status
end

#url ⇒ Object (readonly)

Returns the value of attribute url.



8
9
10
# File 'lib/tspider/page.rb', line 8

# The URL this page was constructed with, exactly as passed in.
def url
  @url
end

Instance Method Details

#canonical ⇒ Object



91
92
93
# File 'lib/tspider/page.rb', line 91

# Looks up the canonical link of the page.
#
# @return [Object, nil] the 'href' of the first link element whose rel is
#   "canonical", or nil when safe_search finds none
def canonical
  selector = 'link[@rel="canonical"]'
  first_href = [0, 'href']
  safe_search(selector, first_href)
end


#links ⇒ Object



72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# File 'lib/tspider/page.rb', line 72

# Collects every anchor on the page together with its robots status.
#
# @return [Array<Hash>] one hash per <a> element with the keys :href, :text,
#   :rel and :disallow. :disallow holds the answer of @webrobots.disallowed?
#   for links on this page's host, and nil for links to other hosts or for
#   hrefs that cannot be parsed as a URI.
def links
  safe_search('a').map do |anchor|
    href = anchor['href']

    disallow =
      begin
        # URI.escape no longer exists on Ruby 3.0+; the default parser offers
        # the same escaping on old and new interpreters alike.
        target = @uri.merge(URI::DEFAULT_PARSER.escape(href.to_s))
        @webrobots.disallowed?(target.to_s) if target.host == @uri.host
      rescue URI::Error
        # A malformed href must not abort extraction of the remaining links.
        nil
      end

    {href: href, text: anchor.content, rel: anchor['rel'], disallow: disallow}
  end
end

#meta(name) ⇒ Object



64
65
66
# File 'lib/tspider/page.rb', line 64

# Reads the content of a named <meta> element.
#
# The attribute value is quoted, as in the other meta_* lookups, so names
# containing characters such as ':' (e.g. "og:title") form a valid selector.
#
# @param name [#to_s] value of the meta element's name attribute
# @return [Object, nil] the 'content' of the first matching element, or nil
#   when safe_search finds none
def meta(name)
  safe_search(%(meta[@name="#{name}"]), [0, 'content'])
end

#meta_description ⇒ Object



52
53
54
# File 'lib/tspider/page.rb', line 52

# Looks up the page's meta description.
#
# @return [Object, nil] the 'content' of the first meta element named
#   "description", or nil when safe_search finds none
def meta_description
  selector = 'meta[@name="description"]'
  first_content = [0, 'content']
  safe_search(selector, first_content)
end

#meta_keywords ⇒ Object



56
57
58
# File 'lib/tspider/page.rb', line 56

# Looks up the page's meta keywords.
#
# @return [Object, nil] the 'content' of the first meta element named
#   "keywords", or nil when safe_search finds none
def meta_keywords
  selector = 'meta[@name="keywords"]'
  first_content = [0, 'content']
  safe_search(selector, first_content)
end

#meta_robots ⇒ Object



60
61
62
# File 'lib/tspider/page.rb', line 60

# Looks up the page's robots meta directive.
#
# @return [Object, nil] the 'content' of the first meta element named
#   "robots", or nil when safe_search finds none
def meta_robots
  selector = 'meta[@name="robots"]'
  first_content = [0, 'content']
  safe_search(selector, first_content)
end

#opf ⇒ Object



30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# File 'lib/tspider/page.rb', line 30

# Gathers the page's request results and extracted elements into one hash.
#
# @return [Hash] symbol-keyed summary containing the URL, status, redirect
#   location, response time, canonical link, title, meta fields, headings,
#   links and response headers
def opf
  {:url => @url,
   :status => @status,
   # Instance variable names are case-sensitive: the constructor assigns
   # @location, so reading @Location always produced nil.
   :location => @location,
   :response_time => @response_time,
   :canonical => canonical,
   :title => title,
   :meta_keywords => meta_keywords,
   :meta_description => meta_description,
   :meta_robots => meta_robots,
   :h1 => h1,
   :h2 => h2,
   :h3 => h3,
   :links => links,
   :headers => @headers
  }
end

#response_header(key) ⇒ Object



68
69
70
# File 'lib/tspider/page.rb', line 68

# Returns one response header as a single string.
#
# @param key [Object] key to look up in the stored header hash
# @return [String, nil] the header's values joined with '|', or nil when the
#   header is absent (previously this raised NoMethodError)
def response_header(key)
  values = @headers[key]
  return nil if values.nil?

  # Array() also accepts a header stored as a bare string.
  Array(values).join('|')
end

#safe_search(search_value, select_path = [], return_content = false) ⇒ Object



105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# File 'lib/tspider/page.rb', line 105

# Searches the parsed document and optionally drills into the result.
#
# @param search_value [String] selector handed to @doc.search
# @param select_path [Array] keys applied one after another with #[] to the
#   search result, e.g. [0, 'href'] for the href of the first match
# @param return_content [Boolean] when true, return the final value's
#   #content instead of the value itself
# @return [Object, nil] the selected value, or nil as soon as any step of
#   the path yields nothing to index into
def safe_search(search_value, select_path=[], return_content=false)
  # search already returns a fresh result set, so no defensive copy is needed.
  value = @doc.search(search_value)

  select_path.each do |key|
    # Check explicitly rather than rescuing NoMethodError, which would also
    # hide unrelated errors raised inside #[].
    return nil unless value.respond_to?(:[])

    value = value[key]
  end

  return nil if value.nil?

  return_content ? value.content : value
end

#title ⇒ Object



48
49
50
# File 'lib/tspider/page.rb', line 48

# Looks up the page title.
#
# @return [Object, nil] the content of the first title element, or nil when
#   safe_search finds none
def title
  first_match = [0]
  want_content = true
  safe_search('title', first_match, want_content)
end