Class: Rwspider::Document

Inherits:
Object
  • Object
show all
Includes:
URI
Defined in:
lib/rwspider/document.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url) ⇒ Document

doc = Rwspider::Document.new('www.rwspider.com')



49
50
51
52
53
54
55
56
57
58
# File 'lib/rwspider/document.rb', line 49

# Creates a document for the given address.
#
# url - the address to parse; the result is stored in @url by #parse.
#
# Also prepares the table of [tag, attribute] pairs used by the get_*
# methods (index 0: anchors, 1: images, 2: link elements), the empty
# collections of inbound links and child documents, and marks the document
# as not yet visited.
def initialize(url)
  parse(url)
  @tag_type = [%w[a href], %w[img src], %w[link href]]
  @inbound_links = []
  @documents = []
  @as_visited = false
end

Instance Attribute Details

#as_downloadedObject

Returns true if the Rwspider::Document::url was downloaded correctly



39
40
41
# File 'lib/rwspider/document.rb', line 39

# Reader for the @as_downloaded flag, documented as true when the
# document's url was downloaded correctly.
# NOTE(review): the visible constructor does not assign this variable, so it
# is presumably nil until a download sets it elsewhere — confirm.
def as_downloaded
  @as_downloaded
end

#as_visitedObject

Returns true if the Rwspider::Document::url was requested



27
28
29
# File 'lib/rwspider/document.rb', line 27

# Reader for the @as_visited flag, documented as true once the document's
# url has been requested. The constructor initialises it to false.
def as_visited
  @as_visited
end

#documentsObject

An Array of Rwspider::Document found in the HTML code of the current Rwspider::Document



33
34
35
# File 'lib/rwspider/document.rb', line 33

# Reader for @documents: the Array of Rwspider::Document objects found in
# the HTML of this document. The constructor initialises it to an empty
# Array.
def documents
  @documents
end

#download_timeObject

The time spent to download the Rwspider::Document



36
37
38
# File 'lib/rwspider/document.rb', line 36

# Reader for @download_time: the time spent downloading this document.
# NOTE(review): units and type are not visible here — confirm against the
# code that assigns it.
def download_time
  @download_time
end

#http_responseObject

An instance of Net::HTTPResponse that contains the response returned from the web server



30
31
32
# File 'lib/rwspider/document.rb', line 30

# Reader for @http_response: documented as the Net::HTTPResponse returned by
# the web server for this document.
# NOTE(review): not assigned in the visible constructor, so presumably nil
# until the document has been fetched — confirm.
def http_response
  @http_response
end

An Array of Strings that contains the URLs of the documents in which a link to the current Rwspider::Document was found



42
43
44
# File 'lib/rwspider/document.rb', line 42

# Reader for @inbound_links: the Array of URL strings of the documents that
# link to this one. The constructor initialises it to an empty Array.
def inbound_links
  @inbound_links
end

#urlObject

instance of URI



24
25
26
# File 'lib/rwspider/document.rb', line 24

# Reader for @url: the URI instance produced by #parse for this document's
# address. Stays nil when #parse could not parse the address.
def url
  @url
end

Instance Method Details

#get_imagesObject

was different from ‘text/html’.



101
102
103
# File 'lib/rwspider/document.rb', line 101

# Looks up the documents referenced by image tags.
#
# Passes the second entry of @tag_type (the ['img', 'src'] pair set up by the
# constructor) to get_document and returns whatever get_document returns.
def get_images
  image_tag = @tag_type[1]
  get_document(image_tag)
end

was different from ‘text/html’.



85
86
87
# File 'lib/rwspider/document.rb', line 85

# Looks up the documents referenced by anchor tags.
#
# Passes the first entry of @tag_type (the ['a', 'href'] pair set up by the
# constructor) to get_document and returns whatever get_document returns.
def get_links
  anchor_tag = @tag_type[0]
  get_document(anchor_tag)
end

#get_other_filesObject

was different from ‘text/html’.



118
119
120
# File 'lib/rwspider/document.rb', line 118

# Looks up the documents referenced by link elements (stylesheets and the
# like).
#
# Passes the third entry of @tag_type (the ['link', 'href'] pair set up by
# the constructor) to get_document and returns whatever get_document returns.
def get_other_files
  link_tag = @tag_type[2]
  get_document(link_tag)
end

#normalize_url(var) ⇒ Object

doc.normalize_url(URI.parse('/sitemap.html'))



127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# File 'lib/rwspider/document.rb', line 127

# Converts a link found in this document into an absolute URL string.
#
# var - a URI object (as returned by URI.parse); may be relative or absolute.
#
# Relative links are resolved against this document's url: a link whose path
# starts with '/' is attached to the site root, any other link is attached to
# the directory of the current document's path. Absolute links are rebuilt
# from their scheme, host, path and query string.
#
# Returns the absolute URL as a String, or nil for mailto: and javascript:
# links.
def normalize_url(var)
  # Compare case-insensitively so both excluded schemes are treated alike.
  scheme = var.scheme.to_s.downcase
  return nil if scheme == 'mailto' || scheme == 'javascript'

  querystring = var.query.nil? ? '' : '?' + var.query

  unless var.relative?
    return var.scheme + '://' + var.host + var.path + querystring
  end

  site_root = url.scheme + '://' + url.host
  link_path = var.path
  return site_root + link_path + querystring if link_path.start_with?('/')

  # Directory of the current document: everything up to and including the
  # last '/' of its path. Falls back to the site root when the document has
  # no path (or a path without any '/').
  document_path = url.path.to_s
  last_slash = document_path.rindex('/')
  main_path = last_slash ? document_path[0..last_slash] : '/'

  site_root + main_path + link_path + querystring
end

#parse(url) ⇒ Object



65
66
67
68
69
70
71
# File 'lib/rwspider/document.rb', line 65

# Parses the given address and stores the result in @url.
#
# url - the address as a String; backslashes are converted to forward
#       slashes before parsing.
#
# Returns the parsed URI, or nil when the address cannot be parsed (invalid
# URI, or an argument that is not a String). In that case @url keeps its
# previous value.
def parse(url)
  @url = URI.parse(url.gsub(/\\/, '/'))
rescue StandardError
  # Best effort: an unparsable address leaves the document without a new
  # url. Only StandardError is rescued so that Interrupt, SystemExit and
  # similar are no longer swallowed.
  nil
end