Class: Rspider::MysqlUrlStorage

Inherits:
Object
  • Object
show all
Defined in:
lib/rspider/MysqlUrlStorage.rb

Overview

MysqlUrlStorage stores URLs in a MySQL database. For better performance, it also creates a UrlStorageCache object that caches URLs in memory.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(hash, source = "default") ⇒ MysqlUrlStorage

Param hash is a Hash holding the MySQL connection settings under the string keys "host", "user", "pass" and "db". Param source is the name of the crawling task.

Raises:



26
27
28
29
30
31
# File 'lib/rspider/MysqlUrlStorage.rb', line 26

# Opens the MySQL connection and creates the in-memory URL cache.
#
# @param hash [Hash] connection settings, read through the string keys
#   "host", "user", "pass" and "db"
# @param source [String] name of the crawling task these URLs belong to
# @raise [MysqlException] if the driver returns nil instead of a connection
def initialize(hash, source = "default")
	@source = source
	host, user, pass, db = hash["host"], hash["user"], hash["pass"], hash["db"]
	@my = Mysql.new(host, user, pass, db)
	raise MysqlException if @my.nil?
	@cache = UrlStorageCache.new
end

Instance Attribute Details

#cache ⇒ Object

Returns the value of attribute cache.



23
24
25
# File 'lib/rspider/MysqlUrlStorage.rb', line 23

# Reader for the @cache instance variable, which the constructor sets to a
# new UrlStorageCache.
#
# @return [Object] the current value of @cache
def cache
  @cache
end

Instance Method Details

#<<(url) ⇒ Object

Records a newly discovered URL.



55
56
57
58
59
60
61
62
63
64
65
66
# File 'lib/rspider/MysqlUrlStorage.rb', line 55

# Records a newly discovered URL for this crawl source.
#
# URLs already known to the in-memory cache are skipped without touching the
# database. Otherwise a row is inserted into `urls` and, only when the insert
# succeeded, the URL is added to the cache.
#
# @param url [String] the discovered URL; treated as untrusted text
# @return [nil] always nil; a failed insert is deliberately not reported
def <<(url)
	return nil if @cache.urlStored?(url)

	# Same key/checksum recipe as #visited, #visited? and #error (plain CRC32,
	# no seed), so the row written here is the one those methods look up.
	ukey = md5(url) + @source
	crc = Zlib.crc32(url)
	# URLs can contain quotes or backslashes, so every string value goes through
	# the connection's escaping before it is placed in the statement.
	sql = "INSERT INTO `urls` (`url`,`source`,`added`,`visited`,`ukey`,`score`,`url_crc32`) " \
		"VALUES ('#{@my.escape_string(url)}','#{@my.escape_string(@source)}'," \
		"'#{Time.now.to_i}','0','#{@my.escape_string(ukey)}'," \
		"'#{Rspider::UrlScorer.score(url)}','#{crc}')"
	begin
		@my.query(sql)
		@cache << url
	rescue StandardError
		# Best effort, as before: a failed insert is swallowed and the URL stays
		# out of the cache. Only StandardError (which includes Mysql::Error) is
		# rescued, so Interrupt, SystemExit and similar still propagate.
	end
	nil
end

#close ⇒ Object



103
104
105
# File 'lib/rspider/MysqlUrlStorage.rb', line 103

# Calls close on the MySQL connection held in @my (opened by the constructor).
#
# @return [Object] whatever the connection's close call returns
def close
	@my.close
end

#error(url) ⇒ Object

Records a crawl error for the URL: lowers its score by 3 and increments its error count.



96
97
98
99
100
101
102
# File 'lib/rspider/MysqlUrlStorage.rb', line 96

# Registers a failed crawl attempt for +url+.
#
# The cache is told first; then the matching row (found by CRC32 of the URL
# plus the md5+source key) has its score lowered by 3 and its error counter
# incremented. Database errors are not rescued here.
#
# @param url [String] the URL whose fetch failed
# @return [Object] whatever the connection's query call returns
def error(url)
	@cache.error(url)
	key = md5(url) + @source
	checksum = Zlib.crc32(url)
	@my.query("UPDATE `urls` SET score=score-3,errors=errors+1 WHERE url_crc32=#{checksum} AND `ukey`='#{key}' LIMIT 1")
end

#md5(string) ⇒ Object

Returns the hexadecimal MD5 digest of the given string.



33
34
35
36
37
# File 'lib/rspider/MysqlUrlStorage.rb', line 33

# Hex-encoded MD5 digest of +string+.
#
# @param string [String] text to hash
# @return [String] 32-character lowercase hexadecimal digest
def md5(string)
	Digest::MD5.hexdigest(string)
end

#pop ⇒ Object

Returns the next URL to crawl.



68
69
70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/rspider/MysqlUrlStorage.rb', line 68

# Hands out the next URL to crawl.
#
# The in-memory cache is asked first. When it has nothing, one random
# unvisited row for this source is read from the `urls` table.
#
# @return [String, nil] a URL, or nil when neither the cache nor the table has
#   one, or when the query fails with Mysql::Error
def pop()
	cached = @cache.pop
	return cached unless cached.nil?

	# An ordering by "errors asc,score desc,RAND()" was tried here previously
	# and left disabled; rows are currently picked purely at random.
	sql = "SELECT url FROM `urls` WHERE visited='0' AND `source`='#{@source}'  ORDER BY RAND() LIMIT 1"
	begin
		@my.query(sql).each { |row| return row[0] }
	rescue Mysql::Error
		return nil
	end
	# No row matched. Without this explicit nil the value of the `each` call
	# (the result object) would be returned and mistaken for a URL.
	nil
end

#visited(url) ⇒ Object

Marks a URL as crawled by recording the time of the visit.



83
84
85
86
87
88
89
90
91
92
93
94
# File 'lib/rspider/MysqlUrlStorage.rb', line 83

# Marks +url+ as crawled.
#
# The cache is updated first; then the matching row's `visited` column is set
# to the current Unix time.
#
# @param url [String] the URL that has just been crawled
# @return [Boolean] true when the UPDATE ran, false when it raised Mysql::Error
def visited(url)
	@cache.visited(url)
	key = md5(url) + @source
	checksum = Zlib.crc32(url)
	stamp = Time.now.to_i
	statement = "UPDATE `urls` SET visited='#{stamp}' WHERE url_crc32='#{checksum}' AND  `ukey`='#{key}' LIMIT 1"
	begin
		@my.query(statement)
		true
	rescue Mysql::Error
		false
	end
end

#visited?(url) ⇒ Boolean

Asks whether the URL has already been visited.

Returns:

  • (Boolean)


39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# File 'lib/rspider/MysqlUrlStorage.rb', line 39

# Tells whether +url+ has already been crawled for this source.
#
# The in-memory cache answers first. On a cache miss the `urls` table is
# consulted; a row counts as visited when its `visited` column holds a
# positive number.
#
# @param url [String] the URL to look up
# @return [true, nil] true when visited; nil when not visited, unknown, or
#   when the query fails with Mysql::Error
def visited?(url)
	return true if @cache.visited?(url)

	ukey = md5(url) + @source
	crc = Zlib.crc32(url)
	# The statement previously read "WHERE  AND url_crc32=...", which is not
	# valid SQL, so this lookup always failed and fell into the rescue below.
	sql = "SELECT visited FROM `urls` WHERE url_crc32=#{crc} AND `ukey`='#{ukey}' LIMIT 1"
	begin
		@my.query(sql).each { |row| return true if row[0].to_i > 0 }
	rescue Mysql::Error
		return nil
	end
	nil
end