Class: Rwspider::Document

Inherits:

Object

Object
Rwspider::Document

show all

Includes:: URI

Defined in:: lib/rwspider/document.rb

Instance Attribute Summary collapse

#as_downloaded ⇒ Object

Returns true if the Rwspider::Document::url was downloaded correctly.
#as_visited ⇒ Object

Returns true if the Rwspider::Document::url was requested.
#documents ⇒ Object

An Array of Rwspider::Document found in the HTML code of the current Rwspider::Document.
#download_time ⇒ Object

The time spent to download the Rwspider::Document.
#http_response ⇒ Object

An instance of Net::HTTPResponse that contains the response returned from the web server.
#inbound_links ⇒ Object readonly

An Array of Strings that contains the URLs of the documents in which a link to the current Rwspider::Document was found.
#url ⇒ Object

instance of URI.

Instance Method Summary collapse

#get_images ⇒ Object

was different from ‘text/html’.
#get_links ⇒ Object

was different from ‘text/html’.
#get_other_files ⇒ Object

was different from ‘text/html’.
#initialize(url) ⇒ Document constructor

doc = Rwspider::Document::new(‘www.rwspider.com’).
#normalize_url(var) ⇒ Object

doc.normalize_url(URI.parse(‘/sitemap.html’)).
#parse(url) ⇒ Object

doc.parse(‘www.rwspider.com/sitemap.html’).

Constructor Details

#initialize(url) ⇒ `Document`

doc = Rwspider::Document::new(‘www.rwspider.com’)

# File 'lib/rwspider/document.rb', line 49

# Builds a Document for the given address.
#
#   doc = Rwspider::Document::new('www.rwspider.com')
#
# url - the address handed to #parse, which stores the parsed result in @url.
def initialize(url)
  parse(url)

  # [tag name, attribute name] pairs; get_links, get_images and
  # get_other_files select entries 0, 1 and 2 respectively.
  @tag_type = [
    ['a', 'href'],
    ['img', 'src'],
    ['link', 'href']
  ]

  @as_visited = false
  @documents = []
  @inbound_links = []
end

Instance Attribute Details

#as_downloaded ⇒ `Object`

Returns true if the Rwspider::Document::url was downloaded correctly



39
40
41

# File 'lib/rwspider/document.rb', line 39

# Reader for @as_downloaded: truthy when the document's url was downloaded
# correctly. The constructor does not assign it, so it reads as nil until
# it is set.
def as_downloaded
  @as_downloaded
end

#as_visited ⇒ `Object`

Returns true if the Rwspider::Document::url was requested



27
28
29

# File 'lib/rwspider/document.rb', line 27

# Reader for @as_visited: true when the document's url was requested.
# The constructor initializes it to false.
def as_visited
  @as_visited
end

#documents ⇒ `Object`

An Array of Rwspider::Document found in the HTML code of the current Rwspider::Document



33
34
35

# File 'lib/rwspider/document.rb', line 33

# Reader for @documents: the Array of Rwspider::Document objects found in
# the HTML of this document. The constructor initializes it to an empty Array.
def documents
  @documents
end

#download_time ⇒ `Object`

The time spent to download the Rwspider::Document



36
37
38

# File 'lib/rwspider/document.rb', line 36

# Reader for @download_time: the time spent downloading this document.
# The constructor does not assign it, so it reads as nil until it is set.
# NOTE(review): unit (seconds?) is not visible here -- confirm.
def download_time
  @download_time
end

#http_response ⇒ `Object`

An instance of Net::HTTPResponse that contains the response returned from the web server



30
31
32

# File 'lib/rwspider/document.rb', line 30

# Reader for @http_response: the Net::HTTPResponse returned by the web
# server for this document. The constructor does not assign it, so it reads
# as nil until it is set.
def http_response
  @http_response
end

#inbound_links ⇒ `Object` (readonly)

An Array of Strings that contains the URLs of the documents in which a link to the current Rwspider::Document was found



42
43
44

# File 'lib/rwspider/document.rb', line 42

# Read-only accessor for @inbound_links: an Array of URL Strings of the
# documents in which a link to this document was found. The constructor
# initializes it to an empty Array.
def inbound_links
  @inbound_links
end

#url ⇒ `Object`

instance of URI



24
25
26

# File 'lib/rwspider/document.rb', line 24

# Reader for @url: the URI instance stored by #parse. It is nil when
# parsing failed and nothing was stored.
def url
  @url
end

Instance Method Details

#get_images ⇒ `Object`

was different from ‘text/html’.



101
102
103

# File 'lib/rwspider/document.rb', line 101

# Runs get_document with the second @tag_type entry (the 'img'/'src' pair
# set up by the constructor) and returns its result.
def get_images
  image_tag = @tag_type[1]
  get_document(image_tag)
end

#get_links ⇒ `Object`

was different from ‘text/html’.



85
86
87

# File 'lib/rwspider/document.rb', line 85

# Runs get_document with the first @tag_type entry (the 'a'/'href' pair
# set up by the constructor) and returns its result.
def get_links
  anchor_tag = @tag_type[0]
  get_document(anchor_tag)
end

#get_other_files ⇒ `Object`

was different from ‘text/html’.



118
119
120

# File 'lib/rwspider/document.rb', line 118

# Runs get_document with the third @tag_type entry (the 'link'/'href' pair
# set up by the constructor) and returns its result.
def get_other_files
  link_tag = @tag_type[2]
  get_document(link_tag)
end

#normalize_url(var) ⇒ `Object`

doc.normalize_url(URI.parse(‘/sitemap.html’))

# File 'lib/rwspider/document.rb', line 127

# Converts a link found in this document into an absolute URL string.
#
#   doc.normalize_url(URI.parse('/sitemap.html'))
#
# var - a URI (any object responding to scheme, host, path, query and
#       relative?).
#
# Relative links are resolved against this document's own url: a path that
# starts with '/' is appended to the document's scheme and host, any other
# path is appended to the directory of the document's path. Only scheme,
# host, path and query are kept in the result.
#
# Returns the absolute URL as a String, or nil when the link does not point
# to a fetchable document (mailto:/javascript: links, or absolute URIs that
# have no host, such as tel:).
def normalize_url(var)
  scheme = var.scheme
  # Compare case-insensitively for both schemes, not just mailto.
  return if scheme && %w[mailto javascript].include?(scheme.downcase)

  querystring = var.query.nil? ? '' : '?' + var.query

  unless var.relative?
    # Host-less absolute URIs cannot be rebuilt as scheme://host/path.
    return if var.host.nil?

    return scheme + '://' + var.host + var.path + querystring
  end

  path = var.path
  origin = url.scheme + '://' + url.host
  return origin + path + querystring if path.start_with?('/')

  # Directory part of the current document's path, trailing slash included;
  # falls back to the root when the document url has no '/' in its path.
  base_path = url.path.to_s
  last_slash = base_path.rindex('/')
  main_path = last_slash ? base_path[0..last_slash] : '/'

  origin + main_path + path + querystring
end

#parse(url) ⇒ `Object`

doc.parse(‘www.rwspider.com/sitemap.html’)

# File 'lib/rwspider/document.rb', line 65

# Parses +url+ into a URI and stores it in @url.
#
#   doc.parse('www.rwspider.com/sitemap.html')
#
# url - a String; backslashes are replaced with forward slashes before
#       parsing.
#
# Returns the parsed URI, or nil when +url+ cannot be parsed. On failure
# @url keeps whatever value it had before the call.
def parse(url)
  @url = URI.parse(url.gsub('\\', '/'))
rescue StandardError
  # Best-effort by design: an unparsable address yields nil instead of
  # raising. StandardError (not Exception) so that signals, SystemExit and
  # similar process-level exceptions still propagate.
  nil
end

Class: Rwspider::Document

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url) ⇒ Document

Instance Attribute Details

#as_downloaded ⇒ Object

#as_visited ⇒ Object

#documents ⇒ Object

#download_time ⇒ Object

#http_response ⇒ Object

#inbound_links ⇒ Object (readonly)

#url ⇒ Object

Instance Method Details

#get_images ⇒ Object

#get_links ⇒ Object

#get_other_files ⇒ Object

#normalize_url(var) ⇒ Object

#parse(url) ⇒ Object

#initialize(url) ⇒ `Document`

#as_downloaded ⇒ `Object`

#as_visited ⇒ `Object`

#documents ⇒ `Object`

#download_time ⇒ `Object`

#http_response ⇒ `Object`

#inbound_links ⇒ `Object` (readonly)

#url ⇒ `Object`

#get_images ⇒ `Object`

#get_links ⇒ `Object`

#get_other_files ⇒ `Object`

#normalize_url(var) ⇒ `Object`

#parse(url) ⇒ `Object`