Class: HTTPage
Instance Attribute Summary collapse
-
#conn_retry ⇒ Object
Returns the value of attribute conn_retry.
-
#encoding ⇒ Object
Returns page encoding.
-
#redir_retry ⇒ Object
Returns the value of attribute redir_retry.
-
#timeout ⇒ Object
Returns the value of attribute timeout.
-
#url ⇒ Object
Returns the value of attribute url.
Instance Method Summary collapse
-
#body ⇒ Object
Returns document body.
-
#clean ⇒ Object
Shorthand for #clean_text called with its default arguments.
-
#clean_text(text = nil, enc = nil) ⇒ Object
Transliterates text to ASCII and removes unknown characters.
-
#clean_words(text = nil, enc = nil) ⇒ Object
Transliterates text to ASCII and removes unknown characters leaving just words.
-
#content_type ⇒ Object
Returns page content-type.
-
#initialize(url, redir_retry = 5, conn_retry = 8, timeout = 40) ⇒ HTTPage
constructor
A new instance of HTTPage.
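-
#real_url ⇒ Object
Returns the URL the document was finally fetched from (after any redirects), or nil when no response could be obtained.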
-
#reset_buffers ⇒ Object
Resets encoding and response buffers.
-
#response ⇒ Object
Fetches document using HTTP and returns response object.
-
#strip_html(text = nil) ⇒ Object
Strips HTML tags from document.
Constructor Details
#initialize(url, redir_retry = 5, conn_retry = 8, timeout = 40) ⇒ HTTPage
Returns a new instance of HTTPage.
14 15 16 17 18 19 20 21 22 23 24 |
# File 'lib/httpage/httpage.rb', line 14
# Creates a page object and records the fetch settings.
#
# url::         page address; passed to the url= writer (defined elsewhere
#               in the class)
# redir_retry:: kept in @redir_retry (default 5)
# conn_retry::  kept in @conn_retry (default 8)
# timeout::     kept in @timeout (default 40)
def initialize(url, redir_retry = 5, conn_retry = 8, timeout = 40)
  # Caller-supplied settings.
  @redir_retry, @conn_retry, @timeout = redir_retry, conn_retry, timeout

  # Cached page state and request object all start out empty.
  @encoding = @content_type = @response = @http_req = @real_url = nil

  # The writer runs last, once every instance variable exists.
  self.url = url
end
Instance Attribute Details
#conn_retry ⇒ Object
Returns the value of attribute conn_retry.
11 12 13 |
# File 'lib/httpage/httpage.rb', line 11
# Reader for the conn_retry attribute (the value held in @conn_retry).
def conn_retry
  return @conn_retry
end
#encoding ⇒ Object
Returns page encoding.
43 44 45 46 |
# File 'lib/httpage/httpage.rb', line 43
# Returns the page encoding.
#
# When no encoding is cached yet, get_page_info is called and its two
# results are stored in @encoding and @content_type.
def encoding
  if @encoding.nil?
    @encoding, @content_type = get_page_info
  end
  @encoding
end
#redir_retry ⇒ Object
Returns the value of attribute redir_retry.
11 12 13 |
# File 'lib/httpage/httpage.rb', line 11
# Reader for the redir_retry attribute (the value held in @redir_retry).
def redir_retry
  return @redir_retry
end
#timeout ⇒ Object
Returns the value of attribute timeout.
11 12 13 |
# File 'lib/httpage/httpage.rb', line 11
# Reader for the timeout attribute (the value held in @timeout).
def timeout
  return @timeout
end
#url ⇒ Object
Returns the value of attribute url.
11 12 13 |
# File 'lib/httpage/httpage.rb', line 11
# Reader for the url attribute (the value held in @url).
def url
  return @url
end
Instance Method Details
#body ⇒ Object
Returns document body.
179 180 181 182 |
# File 'lib/httpage/httpage.rb', line 179
# Returns the document body.
#
# Asks #response for the response object and returns its body. Returns nil
# when that object does not respond to :body (for example when #response
# returned nil).
def body
  page = response
  return nil unless page.respond_to?(:body)

  page.body
end
#clean ⇒ Object
228 |
# File 'lib/httpage/httpage.rb', line 228
# Shorthand for #clean_text with no arguments; returns whatever it returns.
def clean
  clean_text
end
#clean_text(text = nil, enc = nil) ⇒ Object
Transliterates text to ASCII and removes unknown characters.
206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 |
# File 'lib/httpage/httpage.rb', line 206
# Transliterates text to ASCII and removes unknown characters.
#
# text:: text to clean; defaults to the document body
# enc::  source encoding name handed to Iconv; defaults to the page encoding
#
# Returns the cleaned, lower-cased string, or nil when there is no text to
# clean (no text given and no body available).
#
# NOTE(review): Iconv was removed from Ruby's standard library in 2.0; on
# newer Rubies this method needs the iconv gem to be loaded.
def clean_text(text=nil, enc=nil)
  text ||= body
  # Without a document there is nothing to transliterate; follow the
  # nil-on-failure convention of #body instead of raising inside Iconv.
  return nil if text.nil?
  enc ||= encoding

  page = Iconv.iconv('UTF-8//IGNORE', enc, text).join
  page = strip_html(page)

  # Hide apostrophes and backticks behind a placeholder so they survive the
  # character filter below; they are restored as "'" afterwards.
  page.gsub!(/['`]/m, '_amp__')
  page = Iconv.iconv('ASCII//TRANSLIT//IGNORE', 'UTF-8', page).join.downcase

  # Sentence punctuation becomes a word separator.
  page.tr!(".!?", ' ')
  # Drop anything still outside ASCII, then anything outside the whitelist.
  page.gsub!(/[^\x00-\x7F]+/, '')
  page.gsub!(/[^a-z0-9\-_\[\]\(\)\*\=\@\#\$\%\^\&\{\}\:\;\,\<\>\+\s\n\.\!\?]+/im, '')
  page.gsub!('_amp__',"'")

  # Whitespace normalisation: collapse runs, drop blank lines and
  # whitespace touching a line break or starting a line.
  page.squeeze!(" \n")
  page.gsub!(/^\s?\n\s?$/m, '')
  page.gsub!(/\n\s/,"\n")
  page.gsub!(/\s\n/,"\n")
  page.gsub!(/^\s+/,'')

  # Remove quotes that wrap a phrase, then quotes standing alone.
  page.gsub!(/(^|\s)\'+(.*?)\'+(\s|$)/m,'\1\2\3')
  page.gsub!(/(^|\s)\'+(\s|$)/, '')
  page.squeeze!("\n ")
  return page
end
#clean_words(text = nil, enc = nil) ⇒ Object
Transliterates text to ASCII and removes unknown characters leaving just words.
232 233 234 235 236 |
# File 'lib/httpage/httpage.rb', line 232
# Transliterates text to ASCII and removes unknown characters leaving just
# words.
#
# text:: text to clean; passed through to #clean_text
# enc::  source encoding; passed through to #clean_text
#
# Returns the #clean_text result with every run of characters other than
# letters and digits replaced by a single space, or nil when #clean_text
# produced nothing.
def clean_words(text=nil, enc=nil)
  cleaned = clean_text(text, enc)
  return nil if cleaned.nil?

  # The original first replaced each of ". * ?" with a space; those
  # characters are non-alphanumeric, so this single pass gives the same
  # result.
  cleaned.gsub(/[^a-z0-9]+/im, ' ')
end
#content_type ⇒ Object
Returns page content-type.
50 51 52 53 |
# File 'lib/httpage/httpage.rb', line 50
# Returns the page content-type.
#
# When no content type is cached yet, get_page_info is called and its two
# results are stored in @encoding and @content_type.
def content_type
  if @content_type.nil?
    @encoding, @content_type = get_page_info
  end
  @content_type
end
#real_url ⇒ Object
184 185 186 187 |
# File 'lib/httpage/httpage.rb', line 184
# Returns the value of @real_url, or nil when #response returns nil.
#
# #response is consulted first, so the value read afterwards is whatever
# that call left in @real_url.
def real_url
  fetched = response
  fetched.nil? ? nil : @real_url
end
#reset_buffers ⇒ Object
Resets encoding and response buffers.
28 29 30 31 |
# File 'lib/httpage/httpage.rb', line 28
# Resets encoding, content-type and response buffers.
#
# @content_type is cleared together with @encoding: both are filled from
# the same get_page_info call, and #content_type only reloads when its
# cached value is nil, so leaving it set would keep serving a stale value.
def reset_buffers
  @encoding = nil
  @content_type = nil
  @response = nil
end
#response ⇒ Object
Fetches document using HTTP and returns response object. It also sets encoding.
125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 |
# File 'lib/httpage/httpage.rb', line 125
# Fetches document using HTTP and returns response object. It also sets
# encoding.
#
# The response is cached in @response; later calls return the cached object.
# Redirects are followed until a 200 OK arrives or either retry counter
# (@redir_retry, @conn_retry) drops below zero. Each attempt is limited to
# @timeout by Timeout.timeout.
#
# Returns the Net::HTTPOK response on success and nil otherwise (unsupported
# URL scheme, request error, unusable redirect target, a successful status
# other than 200, or retries exhausted). On success @real_url holds the URL
# that was finally fetched and @encoding/@content_type are filled from
# get_page_info.
def response
  return @response unless @response.nil?

  url = @url
  http_req = @http_req
  redir_retry = @redir_retry
  conn_retry = @conn_retry
  response = nil
  found = false
  @real_url = nil

  until found do
    begin
      Timeout.timeout(@timeout) do
        case url.scheme.downcase.to_sym
        when :http
          response = Net::HTTP.start(url.host, url.port) { |http| http.request(http_req) }
        when :https
          https = Net::HTTP.new(url.host, url.port)
          https.use_ssl = true
          # NOTE(review): certificate verification is switched off here, so
          # HTTPS peers are not authenticated. Kept for compatibility.
          https.verify_mode = OpenSSL::SSL::VERIFY_NONE
          response = https.start { |http| http.request(http_req) }
        else
          return nil
        end
      end
      # Raises for every non-2xx status; redirects raise
      # Net::HTTPRetriableError and are handled below.
      response.value
    rescue Net::HTTPRetriableError
      conn_retry -= 1
      location = response && response['location']
      unless location.nil? || location.empty?
        begin
          # Resolve against the current URL so relative Location headers
          # work, and keep the query string in the follow-up request.
          url = url.merge(location)
          http_req = Net::HTTP::Get.new(url.request_uri)
        rescue StandardError
          # Malformed or non-HTTP redirect target: treat as a failed fetch.
          return nil
        end
        redir_retry -= 1
      end
    rescue
      return nil
    end

    if response.kind_of?(Net::HTTPOK)
      found = true
      break
    end
    # A successful status other than 200 will never turn into 200 by
    # repeating the same request; stop instead of looping forever.
    return nil if response.kind_of?(Net::HTTPSuccess)
    break if redir_retry < 0 || conn_retry < 0
  end

  return nil unless found

  @real_url = url
  @response = response
  @encoding, @content_type = get_page_info
  return response
end
#strip_html(text = nil) ⇒ Object
Strips HTML tags from document.
191 192 193 194 195 196 197 198 199 200 201 202 |
# File 'lib/httpage/httpage.rb', line 191
# Strips HTML tags from document.
#
# text:: markup to strip; defaults to the document body
#
# Returns the text with tags removed and HTML entities decoded by
# HTMLEntities, or nil when there is no text (none given and no body
# available).
def strip_html(text=nil)
  text ||= body
  # Nothing was fetched: return nil rather than failing on nil.tr below.
  return nil if text.nil?

  coder = HTMLEntities.new
  coder.decode(text.tr("\t", ' ').                       # tabs become spaces
               tr("\r", '').                             # carriage returns are dropped
               sub(%r{<body.*?>(.*?)</body>}mi, '\1').   # unwrap the first body element
               gsub(%r{<script.*?>(.*?)</script>}mi, ' ').
               gsub(%r{<style.*?>(.*?)</style>}mi, ' ').
               gsub(%r{<!--.*?-->}mi, ' ').              # comments
               gsub(/<br\s*\/?>|<p>/mi, "\n").           # line and paragraph breaks
               gsub(/<.*?>/m, ''))                       # every remaining tag
end