Class: HTTPage
Instance Attribute Summary collapse
-
#conn_retry ⇒ Object
Returns the value of attribute conn_retry.
-
#encoding ⇒ Object
Returns page encoding.
-
#real_url ⇒ Object
readonly
Returns the value of attribute real_url.
-
#redir_retry ⇒ Object
Returns the value of attribute redir_retry.
-
#timeout ⇒ Object
Returns the value of attribute timeout.
-
#url ⇒ Object
Returns the value of attribute url.
Instance Method Summary collapse
-
#body ⇒ Object
Returns document body.
- #clean ⇒ Object
-
#clean_text(text = nil, enc = nil) ⇒ Object
Transliterates text to ASCII and removes unknown characters.
-
#clean_words(text = nil, enc = nil) ⇒ Object
Transliterates text to ASCII and removes unknown characters leaving just words.
-
#content_type ⇒ Object
Returns page content-type.
-
#initialize(url, redir_retry = 5, conn_retry = 8, timeout = 40) ⇒ HTTPage
constructor
A new instance of HTTPage.
-
#reset_buffers ⇒ Object
Resets encoding and response buffers.
-
#response ⇒ Object
Fetches document using HTTP and returns response object.
-
#strip_html(text = nil) ⇒ Object
Strips HTML tags from document.
Constructor Details
#initialize(url, redir_retry = 5, conn_retry = 8, timeout = 40) ⇒ HTTPage
Returns a new instance of HTTPage.
15 16 17 18 19 20 21 22 23 24 25 |
# File 'lib/httpage/httpage.rb', line 15
# Creates a page object for +url+.
#
# The retry and timeout settings are stored as given; every cached value
# (@encoding, @content_type, @response, @http_req, @real_url) is nil before
# the url= writer runs. That writer is defined outside this excerpt.
#
# @param url         value handed to the url= writer
# @param redir_retry stored in @redir_retry (default 5)
# @param conn_retry  stored in @conn_retry (default 8)
# @param timeout     stored in @timeout (default 40)
def initialize(url, redir_retry = 5, conn_retry = 8, timeout = 40)
  # Caller-supplied settings.
  @redir_retry = redir_retry
  @conn_retry  = conn_retry
  @timeout     = timeout

  # Nothing has been fetched or derived yet.
  @encoding = @content_type = @response = @http_req = @real_url = nil

  # Goes through the writer rather than assigning @url directly.
  self.url = url
end
Instance Attribute Details
#conn_retry ⇒ Object
Returns the value of attribute conn_retry.
11 12 13 |
# File 'lib/httpage/httpage.rb', line 11
# Reader for @conn_retry, the value the constructor stores from its
# conn_retry argument.
#
# @return [Object] current value of @conn_retry
def conn_retry
  @conn_retry
end
#encoding ⇒ Object
Returns page encoding.
44 45 46 47 |
# File 'lib/httpage/httpage.rb', line 44
# Returns the page encoding, loading it on first use.
#
# When @encoding is still nil, get_page_info is called and its result is
# destructured into @encoding and @content_type, so both are refreshed.
#
# @return [Object] value of @encoding after the optional load
def encoding
  if @encoding.nil?
    @encoding, @content_type = get_page_info
  end
  @encoding
end
#real_url ⇒ Object (readonly)
Returns the value of attribute real_url.
12 13 14 |
# File 'lib/httpage/httpage.rb', line 12
# Reader for @real_url. The constructor leaves it nil; #response assigns the
# URL that was being requested when a 200 response arrived.
#
# @return [Object, nil] current value of @real_url
def real_url
  @real_url
end
#redir_retry ⇒ Object
Returns the value of attribute redir_retry.
11 12 13 |
# File 'lib/httpage/httpage.rb', line 11
# Reader for @redir_retry, the value the constructor stores from its
# redir_retry argument.
#
# @return [Object] current value of @redir_retry
def redir_retry
  @redir_retry
end
#timeout ⇒ Object
Returns the value of attribute timeout.
11 12 13 |
# File 'lib/httpage/httpage.rb', line 11
# Reader for @timeout, the value the constructor stores from its timeout
# argument and that #response passes to Timeout.timeout.
#
# @return [Object] current value of @timeout
def timeout
  @timeout
end
#url ⇒ Object
Returns the value of attribute url.
11 12 13 |
# File 'lib/httpage/httpage.rb', line 11
# Reader for @url.
#
# @return [Object] current value of @url
def url
  @url
end
Instance Method Details
#body ⇒ Object
Returns document body.
176 177 178 179 |
# File 'lib/httpage/httpage.rb', line 176
# Returns the body of the fetched document.
#
# @return [Object, nil] whatever #response's result returns from #body, or
#   nil when that result does not respond to #body (for example when
#   #response itself returned nil)
def body
  fetched = self.response
  return nil unless fetched.respond_to?(:body)

  fetched.body
end
#clean ⇒ Object
220 |
# File 'lib/httpage/httpage.rb', line 220
# Shorthand for #clean_text called with no arguments, so the text and the
# encoding both fall back to clean_text's own defaults.
#
# @return [Object] whatever #clean_text returns
def clean
  clean_text
end
#clean_text(text = nil, enc = nil) ⇒ Object
Transliterates text to ASCII and removes unknown characters.
198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 |
# File 'lib/httpage/httpage.rb', line 198
# Transliterates text to lower-case ASCII and removes characters outside a
# fixed whitelist, then tidies whitespace and stray apostrophes.
#
# The order of the steps matters; each one works on the previous result.
#
# @param text [String, nil] text to clean; defaults to #body
# @param enc  [String, nil] source encoding name given to Iconv; defaults to
#   #encoding
# @return [String] the cleaned text
def clean_text(text = nil, enc = nil)
  text ||= self.body
  enc  ||= self.encoding

  # Normalise to UTF-8 first, then drop the markup.
  utf8  = Iconv.iconv('UTF-8//IGNORE', enc, text).join
  plain = strip_html(utf8)

  # Apostrophes and backticks are swapped for a placeholder before
  # transliteration and restored after the whitelist pass below.
  # NOTE(review): presumably this keeps the text's own apostrophes apart from
  # quote marks that transliteration may introduce - confirm.
  guarded = plain.gsub(/['`]/m, '_amp__')

  ascii = Iconv.iconv('ASCII//TRANSLIT//IGNORE', 'UTF-8', guarded).join.downcase

  ascii.
    tr(".!?", ' ').                        # sentence punctuation becomes a space
    gsub(/[^\x00-\x7F]+/, '').             # anything still outside ASCII goes
    gsub(/[^a-z0-9\-_\[\]\(\)\*\=\@\#\$\%\^\&\{\}\:\;\,\<\>\+\s\n\.\!\?]+/im, ''). # whitelist
    gsub('_amp__', "'").                   # put the protected apostrophes back
    squeeze(" \n").                        # collapse runs of spaces / newlines
    gsub(/^\s?\n\s?$/m, '').
    gsub(/\n\s/, "\n").                    # no whitespace right after a newline
    gsub(/\s\n/, "\n").                    # ... nor right before one
    gsub(/^\s+/, '').                      # no leading whitespace on a line
    gsub(/(^|\s)\'+(.*?)\'+(\s|$)/m, '\1\2\3'). # unwrap '...' spans
    gsub(/(^|\s)\'+(\s|$)/, '').           # drop apostrophes standing alone
    squeeze("\n ")
end
#clean_words(text = nil, enc = nil) ⇒ Object
Transliterates text to ASCII and removes unknown characters leaving just words.
224 225 226 227 228 |
# File 'lib/httpage/httpage.rb', line 224
# Runs #clean_text and then reduces the result to words: every run of
# characters other than ASCII letters and digits becomes a single space.
#
# The earlier version first ran gsub(%r{[.*?]}mi, ' '). That pattern is a
# character class matching only the literal characters '.', '*' and '?', all
# of which the pass below already replaces, so it had no effect on the result
# and has been removed.
# NOTE(review): if the intent was to drop bracketed spans such as "[edit]",
# the pattern would need to be %r{\[.*?\]} - confirm before adding it.
#
# @param text [String, nil] forwarded to #clean_text
# @param enc  [String, nil] forwarded to #clean_text
# @return [String] words separated by single spaces
def clean_words(text = nil, enc = nil)
  clean_text(text, enc).gsub(/[^a-z0-9]+/im, ' ')
end
#content_type ⇒ Object
Returns page content-type.
51 52 53 54 |
# File 'lib/httpage/httpage.rb', line 51
# Returns the page content type, loading it on first use.
#
# When @content_type is still nil, get_page_info is called and its result is
# destructured into @encoding and @content_type, so both are refreshed.
#
# @return [Object] value of @content_type after the optional load
def content_type
  if @content_type.nil?
    @encoding, @content_type = get_page_info
  end
  @content_type
end
#reset_buffers ⇒ Object
Resets encoding and response buffers.
29 30 31 32 |
# File 'lib/httpage/httpage.rb', line 29
# Forgets everything cached from the previous fetch so the next reader call
# goes back to the network.
#
# @content_type is cleared together with @encoding: both are filled from a
# single get_page_info call, and #content_type only re-reads when its
# variable is nil, so leaving it set would keep returning the previous
# page's value after a reset.
#
# @return [nil]
def reset_buffers
  @encoding     = nil
  @content_type = nil
  @response     = nil
end
#response ⇒ Object
Fetches the document over HTTP or HTTPS and returns the response object, or nil if the fetch fails. On success it also sets the encoding and the content type.
122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 |
# File 'lib/httpage/httpage.rb', line 122
# Fetches the document over HTTP or HTTPS and returns the response object.
#
# A "200 OK" response is memoized in @response; later calls return it without
# touching the network. On success @real_url is set to the URL that answered
# and @encoding / @content_type are filled from get_page_info.
#
# Redirects (any response for which Net::HTTPResponse#value raises
# Net::HTTPRetriableError) are followed with a fresh GET. Each one costs one
# unit of @conn_retry and, when a Location header is present, one unit of
# @redir_retry; the loop gives up once either counter drops below zero.
#
# Changes from the earlier version:
# * a 2xx response other than 200 ended neither the loop nor a counter, so
#   the same request was repeated forever; it now returns nil;
# * the redirect request is built from request_uri, so the query string is
#   kept and an empty path becomes "/";
# * a relative Location is resolved against the URL that issued it instead
#   of being parsed on its own (which left it without a scheme);
# * an unusable Location now returns nil like every other failure instead of
#   raising out of the rescue clause.
#
# NOTE(review): HTTPS peers are not verified (VERIFY_NONE). Kept unchanged
# because enabling verification would alter which pages can be fetched -
# confirm this is intended.
#
# @return [Net::HTTPResponse, nil] the 200 response, or nil on any failure
#   (unsupported scheme, timeout, error status, retries exhausted)
def response
  return @response unless @response.nil?

  target         = @url
  request        = @http_req
  redirects_left = @redir_retry
  attempts_left  = @conn_retry
  @real_url      = nil

  loop do
    reply = nil
    begin
      reply = Timeout.timeout(@timeout) do
        case target.scheme.to_s.downcase
        when 'http'
          Net::HTTP.start(target.host, target.port) { |http| http.request(request) }
        when 'https'
          https = Net::HTTP.new(target.host, target.port)
          https.use_ssl = true
          https.verify_mode = OpenSSL::SSL::VERIFY_NONE
          https.start { |http| http.request(request) }
        else
          return nil
        end
      end
      # Raises unless the status is 2xx; 3xx maps to HTTPRetriableError.
      reply.value
    rescue Net::HTTPRetriableError
      attempts_left -= 1
      location = reply['location'] if reply
      unless location.nil? || location.empty?
        redirects_left -= 1
        begin
          # merge resolves relative references and passes absolute ones through.
          target  = target.merge(location)
          request = Net::HTTP::Get.new(target.request_uri)
        rescue StandardError
          return nil
        end
      end
    rescue StandardError
      return nil
    end

    if reply.is_a?(Net::HTTPOK)
      @real_url = target
      @response = reply
      @encoding, @content_type = get_page_info
      return reply
    end

    # Any other 2xx will not turn into a 200 by asking again.
    return nil if reply.is_a?(Net::HTTPSuccess)
    return nil if redirects_left < 0 || attempts_left < 0
  end
end
#strip_html(text = nil) ⇒ Object
Strips HTML tags from the document (or from the given text) and decodes HTML entities.
183 184 185 186 187 188 189 190 191 192 193 194 |
# File 'lib/httpage/httpage.rb', line 183
# Strips HTML markup from +text+ and decodes HTML entities in what is left.
#
# NOTE(review): the <body> step replaces only the <body ...>...</body> match
# with its inner text; anything before or after it (for example the <title>
# text) stays in the string and survives the final tag removal - confirm
# that is intended.
#
# @param text [String, nil] markup to strip; defaults to #body
# @return [String] result of HTMLEntities#decode on the stripped text
def strip_html(text = nil)
  text ||= self.body
  decoder = HTMLEntities.new

  # Tabs become spaces, carriage returns disappear.
  markup = text.tr("\t", ' ').tr("\r", '')

  # Unwrap the first <body> element, keeping its contents.
  markup = markup.sub(%r{<body.*?>(.*?)</body>}mi, '\1')

  # Scripts, style sheets and comments are replaced by a single space each.
  blanked = [
    %r{<script.*?>(.*?)</script>}mi,
    %r{<style.*?>(.*?)</style>}mi,
    %r{<!--.*?-->}mi
  ]
  markup = blanked.reduce(markup) { |acc, noise| acc.gsub(noise, ' ') }

  # <br> variants and bare <p> become newlines; every remaining tag is dropped.
  markup = markup.gsub(/<br\s*\/?>|<p>/mi, "\n")
  markup = markup.gsub(/<.*?>/m, '')

  decoder.decode(markup)
end