Class: Rspider::HtmlTidy

Inherits:
Object
  • Object
show all
Defined in:
lib/rspider/HtmlTidy.rb

Overview

This class guesses the encoding of an HTML document and converts it from various encodings to UTF-8.

Instance Method Summary collapse

Constructor Details

#initialize ⇒ HtmlTidy

Returns a new instance of HtmlTidy.



12
13
14
# File 'lib/rspider/HtmlTidy.rb', line 12

# Constructor. Sets up no instance state: the only statement in the
# body, a default-charset assignment, is commented out.
def initialize
	#@defaultCharset="UTF-8"
end

Instance Method Details

#guess_encoding(resp) ⇒ Object

Guesses the encoding of an HTML document.



16
17
18
# File 'lib/rspider/HtmlTidy.rb', line 16

# Guess the character encoding declared by an HTML document.
#
# Only the head of the document (resp[0..400]) is inspected. The first
# charset found inside a <meta ...> tag wins, whether it is written as
# <meta http-equiv=... content="text/html; charset=X"> or as
# <meta charset="X">, with single, double or no quotes.
#
# resp - the raw HTML text (nil is treated as an empty document).
#
# Returns the charset name exactly as written in the document, or an
# empty String when no declaration is found.
def guess_encoding(resp)
	head = resp.to_s[0..400]
	# Take the first declaration only: joining several matches would yield
	# a value such as "utf-8\tutf-8" that is not a usable encoding name.
	# "_" is accepted so names like Shift_JIS are not cut short.
	head[/<meta[^>]*?charset\s*=\s*["']?\s*([a-z0-9_\-]+)/i, 1].to_s
end

#iconv(from, to, text) ⇒ Object

convert document from one encoding to another



20
21
22
# File 'lib/rspider/HtmlTidy.rb', line 20

# Convert text from one character encoding to another, silently
# dropping anything that cannot be converted.
#
# from - name of the source encoding (case-insensitive).
# to   - name of the target encoding (case-insensitive).
# text - the String to convert; it is not modified.
#
# Returns the converted String.
def iconv(from,to,text)
	# Iconv left the standard library in Ruby 2.0. Keep using it when it is
	# loaded so existing behaviour is unchanged there.
	if defined?(::Iconv)
		return ::Iconv.new(to.upcase+"//IGNORE",from.upcase+"//IGNORE").iconv(text)
	end
	# Fallback for interpreters without Iconv: tag the bytes with the source
	# encoding on a copy, then transcode. Replacing invalid and undefined
	# sequences with "" is the counterpart of Iconv's //IGNORE.
	tagged = text.dup.force_encoding(from.upcase)
	tagged.encode(to.upcase, invalid: :replace, undef: :replace, replace: "")
end

#strip_tags ⇒ Object



31
32
# File 'lib/rspider/HtmlTidy.rb', line 31

# Placeholder with no implementation: takes no arguments, does nothing
# and returns nil.
def strip_tags
	nil
end

#tidy(html) ⇒ Object

guess the encoding of the document and convert it to UTF-8



24
25
26
27
28
29
30
# File 'lib/rspider/HtmlTidy.rb', line 24

# Guess the encoding of an HTML document and convert it to UTF-8.
#
# html - the raw HTML text.
#
# Returns html itself when it already declares UTF-8 or declares no
# encoding at all; otherwise returns the result of converting it to
# UTF-8 via #iconv.
def tidy(html)
	encoding = guess_encoding(html).upcase
	# Nothing to convert from when no charset was detected.
	return html if encoding.empty? || encoding == "UTF-8"
	# The name was upcased above, so the pattern must be upper-case too;
	# the previous lower-case patterns could never match. Any GB2312/GBK
	# variant is converted as GBK (presumably because GBK covers GB2312).
	encoding = "GBK" if encoding =~ /GB2312|GBK/
	iconv(encoding,"UTF-8",html)
end