Class: Robotstxt::Parser

Inherits:
Object
Defined in:
lib/robotstxt/parser.rb


Constructor Details

#initialize(robot_id = nil) ⇒ Parser

Initializes a new Robotstxt::Parser instance with the robot_id option.

client = Robotstxt::Parser.new('my_robot_id')



# File 'lib/robotstxt/parser.rb', line 29

def initialize(robot_id = nil)
  # Default to the wildcard user-agent unless a robot_id is given
  @robot_id = '*'
  @rules = []
  @sitemaps = []
  @robot_id = robot_id.downcase unless robot_id.nil?
end
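
A minimal end-to-end sketch (using the same example host as the snippets below): create a parser, fetch the site's robots.txt with #get, then query #allowed? and #sitemaps.

client = Robotstxt::Parser.new('my_robot_id')
if client.get('http://www.simonerinzivillo.it')
  puts client.allowed?('http://www.simonerinzivillo.it/no-dir/')
  client.sitemaps.each { |url| puts url }
end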

Instance Attribute Details

#body ⇒ Object (readonly)

Returns the value of attribute body.



# File 'lib/robotstxt/parser.rb', line 23

def body
  @body
end

#found ⇒ Object (readonly)

Returns the value of attribute found.



# File 'lib/robotstxt/parser.rb', line 23

def found
  @found
end

#robot_id ⇒ Object

Returns the value of attribute robot_id.



# File 'lib/robotstxt/parser.rb', line 22

def robot_id
  @robot_id
end

#rules ⇒ Object (readonly)

Returns the value of attribute rules.



# File 'lib/robotstxt/parser.rb', line 23

def rules
  @rules
end

#sitemaps ⇒ Object (readonly)

Analyzes the robots.txt file and returns an Array containing the XML Sitemap URLs.

client = Robotstxt::Parser.new('my_robot_id')
if client.get('http://www.simonerinzivillo.it')
  client.sitemaps.each { |url|
    puts url
  }
end


# File 'lib/robotstxt/parser.rb', line 125

def sitemaps
  @sitemaps
end

Instance Method Details

#allowed?(var) ⇒ Boolean

Checks whether the URL is allowed to be crawled by the current robot_id.

client = Robotstxt::Parser.new('my_robot_id')
if client.get('http://www.simonerinzivillo.it')
  client.allowed?('http://www.simonerinzivillo.it/no-dir/')
end

This method returns true if the robots.txt file does not block access to the URL.

Returns:

  • (Boolean)


# File 'lib/robotstxt/parser.rb', line 94

def allowed?(var)
  is_allow = true
  url = URI.parse(var)
  querystring = url.query.nil? ? '' : '?' + url.query
  url_path = url.path + querystring

  @rules.each { |ua|
    # Apply the rules declared for this robot_id or for every user-agent ('*')
    if @robot_id == ua[0] || ua[0] == '*'
      ua[1].each { |d|
        # The URL is disallowed if its path matches a disallowed pattern
        is_allow = false if url_path.match('^' + d) || d == '/'
      }
    end
  }
  is_allow
end
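
As a usage sketch, the parsed rules can be queried repeatedly once #get has succeeded; the paths below are purely illustrative.

client = Robotstxt::Parser.new('my_robot_id')
if client.get('http://www.simonerinzivillo.it')
  ['/', '/no-dir/', '/page.html?id=1'].each do |path|
    puts "#{path}: #{client.allowed?('http://www.simonerinzivillo.it' + path)}"
  end
end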

#found? ⇒ Boolean

This method returns true if the robots.txt file was found and parsed successfully.

Returns:

  • (Boolean)


# File 'lib/robotstxt/parser.rb', line 131

def found?
  !!@found
end
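
A small sketch of the intended use: call #found? after #get to check whether a robots.txt body is available before reading #body or #rules.

client = Robotstxt::Parser.new('my_robot_id')
client.get('http://www.simonerinzivillo.it')
puts client.body if client.found?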

#get(hostname) ⇒ Object

Requests and parses the robots.txt file for the given hostname.

client = Robotstxt::Parser.new('my_robot_id')
client.get('http://www.simonerinzivillo.it')

This method returns true if the file was fetched and parsed successfully.



# File 'lib/robotstxt/parser.rb', line 47

def get(hostname)
  # Allow one retry on transient network errors
  @ehttp = true
  url = URI.parse(hostname)

  begin
    http = Net::HTTP.new(url.host, url.port)
    if url.scheme == 'https'
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
      http.use_ssl = true
    end

    response = http.request(Net::HTTP::Get.new('/robots.txt'))

    case response
    when Net::HTTPSuccess
      @found = true
      @body = response.body
      parse()
    else
      @found = false
    end

    return @found

  rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET => e
    if @ehttp
      @ehttp = false
      retry
    else
      return nil
    end
  end
end
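
Because the request is retried once and nil is returned on persistent network errors, callers may want to distinguish nil (network failure) from false (no robots.txt found); a minimal sketch:

client = Robotstxt::Parser.new('my_robot_id')
case client.get('http://www.simonerinzivillo.it')
when nil   then puts 'network error while fetching robots.txt'
when false then puts 'robots.txt not found'
else            puts 'robots.txt fetched and parsed'
end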