Class: Robotstxt::Parser

Inherits:
Object
Defined in:
lib/robotstxt/parser.rb

Instance Attribute Summary

Instance Method Summary

Constructor Details

#initialize(robot_id = nil) ⇒ Parser

Initializes a new Robotstxt::Parser instance with the robot_id option.

client = Robotstxt::Parser.new('my_robot_id')



# File 'lib/robotstxt/parser.rb', line 29

def initialize(robot_id = nil)
  @robot_id = '*'
  @rules = []
  @sitemaps = []
  @robot_id = robot_id.downcase unless robot_id.nil?
end
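As a minimal usage sketch (assuming the gem is loaded with require 'robotstxt'), the identifier defaults to the '*' wildcard and an explicit robot_id is lowercased before matching:

require 'robotstxt'

# No robot_id given: the parser matches the wildcard ('*') User-agent groups.
generic = Robotstxt::Parser.new

# An explicit robot_id is normalized to lowercase before matching.
bot = Robotstxt::Parser.new('My_Robot_ID')
bot.robot_id  # => "my_robot_id"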

Instance Attribute Details

#body ⇒ Object (readonly)

Returns the value of attribute body.



# File 'lib/robotstxt/parser.rb', line 23

def body
  @body
end

#found ⇒ Object (readonly)

Returns the value of attribute found.



# File 'lib/robotstxt/parser.rb', line 23

def found
  @found
end

#robot_id ⇒ Object

Returns the value of attribute robot_id.



# File 'lib/robotstxt/parser.rb', line 22

def robot_id
  @robot_id
end

#rules ⇒ Object (readonly)

Returns the value of attribute rules.



# File 'lib/robotstxt/parser.rb', line 23

def rules
  @rules
end

#sitemaps ⇒ Object (readonly)

Analyzes the robots.txt file and returns an Array containing the list of XML Sitemap URLs.

client = Robotstxt::Parser.new('my_robot_id')
if client.get('http://www.simonerinzivillo.it')
  client.sitemaps.each { |url|
    puts url
  }
end


# File 'lib/robotstxt/parser.rb', line 125

def sitemaps
  @sitemaps
end

Instance Method Details

#allowed?(var) ⇒ Boolean

Checks whether the URL is allowed to be crawled by the current robot_id.

client = Robotstxt::Parser.new('my_robot_id')
if client.get('http://www.simonerinzivillo.it')
  client.allowed?('http://www.simonerinzivillo.it/no-dir/')
end

This method returns true if the robots.txt file does not block access to the URL.

Returns:

  • (Boolean)


# File 'lib/robotstxt/parser.rb', line 94

def allowed?(var)
  is_allow = true
  url = URI.parse(var)
  querystring = url.query.nil? ? '' : '?' + url.query
  url_path = url.path + querystring

  @rules.each { |ua|
    # Only apply the groups addressed to this robot_id or to the wildcard agent.
    if @robot_id == ua[0] || ua[0] == '*'
      ua[1].each { |d|
        # A Disallow prefix that matches the start of the path (or a bare '/') blocks access.
        is_allow = false if url_path.match('^' + d) || d == '/'
      }
    end
  }
  is_allow
end
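For illustration, the rules appear to be stored as [user_agent, [disallow_prefixes]] pairs (an assumption inferred from this method; the exact layout built by parse() is not shown here). A standalone sketch of the same prefix check:

# Hypothetical rules layout inferred from #allowed?: each entry pairs a
# user-agent token with its list of Disallow path prefixes.
rules = [
  ['*',           ['/no-dir/']],
  ['my_robot_id', ['/private']]
]

robot_id = 'my_robot_id'
url_path = '/no-dir/page.html'

allowed = true
rules.each { |ua, disallows|
  next unless ua == robot_id || ua == '*'
  disallows.each { |d|
    # Blocked when the path starts with a Disallow prefix, or when '/' disallows everything.
    allowed = false if url_path.match('^' + d) || d == '/'
  }
}

allowed  # => false ('/no-dir/page.html' matches the '/no-dir/' prefix)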

#found? ⇒ Boolean

This method returns true if the robots.txt file was found and parsed successfully.

Returns:

  • (Boolean)


# File 'lib/robotstxt/parser.rb', line 131

def found?
  !!@found
end

#get(hostname) ⇒ Object

Fetches and parses the robots.txt file for the given hostname.

client = Robotstxt::Parser.new('my_robot_id')
client.get('http://www.simonerinzivillo.it')

This method returns true if the file was fetched and parsed successfully.



# File 'lib/robotstxt/parser.rb', line 47

def get(hostname)
  @ehttp = true
  url = URI.parse(hostname)

  begin
    http = Net::HTTP.new(url.host, url.port)
    if url.scheme == 'https'
      http.use_ssl = true
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
    end

    response = http.request(Net::HTTP::Get.new('/robots.txt'))

    case response
    when Net::HTTPSuccess
      @found = true
      @body = response.body
      parse()
    else
      @found = false
    end

    return @found

  rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET => e
    # Retry the request once before giving up.
    if @ehttp
      @ehttp = false
      retry
    else
      return nil
    end
  end
end
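Putting the methods together, a short usage sketch (assuming the gem is loaded with require 'robotstxt'; the hostname is the one used in the examples above):

require 'robotstxt'

client = Robotstxt::Parser.new('my_robot_id')

if client.get('http://www.simonerinzivillo.it')
  puts "robots.txt parsed: #{client.found?}"
  puts "crawl allowed: #{client.allowed?('http://www.simonerinzivillo.it/no-dir/')}"
  client.sitemaps.each { |url| puts "sitemap: #{url}" }
else
  # get returns false when robots.txt is not found, and nil after repeated network errors.
  puts 'robots.txt not available'
end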