Class: Rspider::HTMLDocument

Inherits:

Hash

Object
Hash
Rspider::HTMLDocument

show all

Defined in: lib/rspider/Document.rb

Instance Method Summary collapse

#get_head_data(html) ⇒ Object

得到head区的内容.
#get_headlines(html) ⇒ Object

得到头条之类的大纲.

Instance Method Details

#get_head_data(html) ⇒ `Object`

得到head区的内容

# File 'lib/rspider/Document.rb', line 17

# Extracts metadata from the <head> section of an HTML document using
# regular expressions (no HTML parser involved).
#
# @param html [String, nil] raw HTML source
# @return [Hash, nil] nil when +html+ is nil or contains no
#   <head>...</head> section; otherwise a hash with these keys:
#   - :title, :keywords, :robots, :description, :base -- String, "" when absent
#     (newlines are removed from :title)
#   - :noindex, :nofollow -- true when the robots meta tag lists that directive
#   - :charset -- String, present only when a Content-Type meta tag
#     carries a "charset=" parameter
#   - :archives, :rss_links -- Array of href strings taken from
#     <link rel="archives"> and <link rel="alternate"> tags
def get_head_data(html)
	return nil if html.nil?

	# \b keeps <header> from being mistaken for <head>; \s* tolerates "</head >".
	h = html[/<head\b[^>]*>(.*?)<\/head\s*>/im, 1]
	return nil if h.nil?

	r_robots = /<meta +name *=[\"']?robots[\"']? *content=[\"']?([^<>'\"]+)[\"']?/im
	r_desc = /<meta +name *=[\"']?description[\"']? *content=[\"']?([^<>'\"]+)[\"']?/im
	r_keys = /<meta +name *=[\"']?keywords[\"']? *content=[\"']?([^<>'\"]+)[\"']?/im
	r_charset = /<meta +http\-equiv *=[\"']?Content-Type[\"']? *content=[\"']?([^<>'\"]+)[\"']?/im
	r_base = /<base +href *= *[\"']?([^<>'\"]+)[\"']?/im
	r_title = /<title *>(.*?)<\/title *>/im
	r_archives = /<link +rel *=[\"']?archives[\"']? *[^\>]+href=[\"']?([^<>'\"]+)[\"']?/im
	r_alternates = /<link +rel *=[\"']?alternate[\"']? *[^\>]+href=[\"']?([^<>'\"]+)[\"']?/im

	# String#[](regexp, 1) yields the first capture or nil, so a missing tag
	# falls back to "" without raising and rescuing.
	head = {}
	head[:title] = (h[r_title, 1] || "").delete("\n")
	head[:keywords] = h[r_keys, 1] || ""
	head[:robots] = h[r_robots, 1] || ""
	head[:description] = h[r_desc, 1] || ""
	head[:nofollow] = false
	head[:noindex] = false
	head[:base] = h[r_base, 1] || ""

	# Only report a charset when the Content-Type value actually declares one;
	# a bare "text/html" must not be returned as the charset.
	content_type = h[r_charset, 1]
	charset = content_type && content_type[/charset\s*=\s*([^\s;]+)/i, 1]
	head[:charset] = charset if charset

	head[:archives] = h.scan(r_archives).map(&:first)
	head[:rss_links] = h.scan(r_alternates).map(&:first)

	# Directives are commonly written "noindex, nofollow"; strip the padding
	# so the token after the comma is still recognised.
	directives = head[:robots].downcase.split(",").map(&:strip)
	head[:noindex] = directives.include?("noindex")
	head[:nofollow] = directives.include?("nofollow")

	head
end

#get_headlines(html) ⇒ `Object`

得到头条之类的大纲

# File 'lib/rspider/Document.rb', line 6

# Builds a plain-text outline from the heading elements of an HTML document.
#
# Every <hN ...>...</hN> element (N being a single digit, matched
# case-insensitively and across line breaks) contributes its inner content;
# the opening and closing levels are not required to agree.
#
# @param html [String] raw HTML source
# @return [String] inner contents of the headings, in document order,
#   joined with "\n"; "" when the document has no headings
def get_headlines(html)
	heading = Regexp.new('<h[0-9][^>]*>(.*?)<\/h[0-9]>', Regexp::IGNORECASE | Regexp::MULTILINE)
	html.scan(heading).map { |captures| captures.first }.join("\n")
end

Class: Rspider::HTMLDocument

Instance Method Summary collapse

Instance Method Details

#get_head_data(html) ⇒ Object

#get_headlines(html) ⇒ Object

#get_head_data(html) ⇒ `Object`

#get_headlines(html) ⇒ `Object`