Class: RDig::HttpDocument

Inherits:
Document show all
Defined in:
lib/rdig/documents.rb

Overview

Remote Document to be retrieved by HTTP

Instance Attribute Summary collapse

Attributes inherited from Document

#content, #content_type, #uri

Instance Method Summary collapse

Methods inherited from Document

#body, create, #has_content?, #links, #needs_indexing?, #title, #to_s

Constructor Details

#initialize(args = {}) ⇒ HttpDocument

url: url of this document, may be relative to the referring doc or host. referrer: uri of the document we retrieved this link from



112
113
114
115
116
# File 'lib/rdig/documents.rb', line 112

# Sets up a remote document from an options hash.
#
# The hash is passed on unchanged to the superclass constructor. Two keys are
# read here:
# :referrer:: uri of the document this link was found in
# :depth::    distance from a start url; falls back to 0 when missing or nil
def initialize(args={})
  super(args)
  referrer, depth = args.values_at(:referrer, :depth)
  @referring_uri = referrer
  @depth = depth || 0
end

Instance Attribute Details

#depth ⇒ Object (readonly)

counts how far this document is away from one of the start urls. Used to limit crawling by depth.



101
102
103
# File 'lib/rdig/documents.rb', line 101

# How far this document is from one of the start urls; used to limit
# crawling by depth.
def depth; @depth; end

#etag ⇒ Object (readonly)

Returns the value of attribute etag.



104
105
106
# File 'lib/rdig/documents.rb', line 104

# The 'etag' response header recorded by #fetch on a 200 response
# (nil until then).
def etag; @etag; end

#referring_uri ⇒ Object (readonly)

Returns the value of attribute referring_uri.



102
103
104
# File 'lib/rdig/documents.rb', line 102

# The :referrer value given to the constructor, i.e. the uri of the document
# this link was retrieved from.
def referring_uri; @referring_uri; end

#status ⇒ Object (readonly)

Returns the value of attribute status.



103
104
105
# File 'lib/rdig/documents.rb', line 103

# Outcome recorded by #fetch: :redirect or :success; stays nil when the
# fetch did not reach either result.
def status; @status; end

Instance Method Details

#create_child(uri) ⇒ Object



106
107
108
# File 'lib/rdig/documents.rb', line 106

# Builds the HttpDocument for a link found in this document.
#
# The child gets this document's uri as its referrer and a depth one greater
# than this document's. Links matching the file:// pattern are skipped and
# nil is returned instead.
def create_child(uri)
  return nil if uri =~ /^file:\/\//i

  HttpDocument.new(:uri => uri, :referrer => self.uri, :depth => depth + 1)
end

#fetch ⇒ Object



118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# File 'lib/rdig/documents.rb', line 118

# Retrieves this document over HTTP and records the outcome on the instance.
#
# * If the URI finally served differs from @uri, @status becomes :redirect
#   and @content is set to that final URI.
# * On a 200 response, @etag is taken from the response's 'etag' header,
#   @content is whatever ContentExtractors.process returns for the body, and
#   @status becomes :success.
# * Any other response code is only logged.
#
# Errors raised while fetching are logged at warn level and not re-raised.
# Afterwards @content is never nil (it defaults to an empty Hash).
def fetch
  RDig.logger.debug "fetching #{@uri}"
  # Prefer URI.open where open-uri provides it: bare Kernel#open treats a
  # string starting with "|" as a command to spawn, and stopped accepting
  # URLs in Ruby 3.0. Rubies without URI.open keep the previous call.
  # Assumes open-uri is already loaded by this file, as the original call did.
  opener = URI.respond_to?(:open) ? URI.method(:open) : method(:open)
  opener.call(@uri.to_s, RDig::open_uri_http_options) do |doc|
    if @uri.to_s != doc.base_uri.to_s
      # The server sent us somewhere else; hand the new location back as content.
      @status = :redirect
      @content = doc.base_uri
    else
      # NOTE(review): open-uri normally raises for non-2xx responses, so the
      # 404/else branches may rarely run — confirm.
      case doc.status.first.to_i
      when 200
        @etag = doc.meta['etag']
        @content = ContentExtractors.process(doc.read, doc.content_type)
        @status = :success
      when 404
        RDig.logger.info "got 404 for #{@uri}"
      else
        RDig.logger.info "don't know what to do with response: #{doc.status.join(' : ')}"
      end
    end
  end
rescue => e
  RDig.logger.warn "error fetching #{@uri}: #{e}"
ensure
  @content ||= {}
end