Class: HTTPage
Instance Attribute Summary collapse
-
#conn_retry ⇒ Object
Returns the value of attribute conn_retry.
-
#encoding ⇒ Object
Returns page encoding.
-
#redir_retry ⇒ Object
Returns the value of attribute redir_retry.
-
#timeout ⇒ Object
Returns the value of attribute timeout.
-
#url ⇒ Object
Returns the value of attribute url.
Instance Method Summary collapse
-
#body ⇒ Object
Returns document body.
-
#clean ⇒ Object
Shorthand for #clean_text called with its default arguments.
-
#clean_text(text = nil, enc = nil) ⇒ Object
Transliterates text to ASCII and removes unknown characters.
-
#clean_words(text = nil, enc = nil) ⇒ Object
Transliterates text to ASCII and removes unknown characters leaving just words.
-
#initialize(url, redir_retry = 5, conn_retry = 8, timeout = 40) ⇒ HTTPage
constructor
A new instance of HTTPage.
-
#reset_buffers ⇒ Object
Resets encoding and response buffers.
-
#response ⇒ Object
Fetches document using HTTP and returns response object.
-
#strip_html(text = nil) ⇒ Object
Strips HTML tags from document.
Constructor Details
#initialize(url, redir_retry = 5, conn_retry = 8, timeout = 40) ⇒ HTTPage
Returns a new instance of HTTPage.
14 15 16 17 18 19 20 21 22 |
# File 'lib/httpage/httpage.rb', line 14
#
# Prepares a fetcher for +url+ with empty encoding/response/request slots.
#
# @param url [Object] target address; handed to the +url=+ writer
# @param redir_retry [Integer] value stored in @redir_retry (default 5)
# @param conn_retry [Integer] value stored in @conn_retry (default 8)
# @param timeout [Numeric] value stored in @timeout (default 40)
def initialize(url, redir_retry = 5, conn_retry = 8, timeout = 40)
  @encoding = @response = @http_req = nil
  @redir_retry, @conn_retry, @timeout = redir_retry, conn_retry, timeout
  # Goes through the writer (not the ivar) and stays last, after every
  # other slot has been initialised.
  self.url = url
end
Instance Attribute Details
#conn_retry ⇒ Object
Returns the value of attribute conn_retry.
11 12 13 |
# File 'lib/httpage/httpage.rb', line 11
#
# Reader for the conn_retry attribute.
#
# @return [Object] whatever is currently held in @conn_retry
def conn_retry
  @conn_retry
end
#encoding ⇒ Object
Returns page encoding.
41 42 43 |
# File 'lib/httpage/httpage.rb', line 41
#
# Returns page encoding, asking get_page_encoding only while nothing truthy
# has been cached in @encoding yet.
#
# @return [Object] the cached value, or the freshly detected one
def encoding
  @encoding = get_page_encoding unless @encoding
  @encoding
end
#redir_retry ⇒ Object
Returns the value of attribute redir_retry.
11 12 13 |
# File 'lib/httpage/httpage.rb', line 11
#
# Reader for the redir_retry attribute.
#
# @return [Object] whatever is currently held in @redir_retry
def redir_retry
  @redir_retry
end
#timeout ⇒ Object
Returns the value of attribute timeout.
11 12 13 |
# File 'lib/httpage/httpage.rb', line 11
#
# Reader for the timeout attribute.
#
# @return [Object] whatever is currently held in @timeout
def timeout
  @timeout
end
#url ⇒ Object
Returns the value of attribute url.
11 12 13 |
# File 'lib/httpage/httpage.rb', line 11
#
# Reader for the url attribute.
#
# @return [Object] whatever is currently held in @url
def url
  @url
end
Instance Method Details
#body ⇒ Object
Returns document body.
149 150 151 152 |
# File 'lib/httpage/httpage.rb', line 149
#
# Returns document body.
#
# @return [Object, nil] +response.body+ when the response object exposes a
#   +body+ method; nil otherwise (including when #response itself is nil)
def body
  fetched = response
  fetched.body if fetched.respond_to?(:body)
end
#clean ⇒ Object
Shorthand for #clean_text called with its default arguments.
193 |
# File 'lib/httpage/httpage.rb', line 193
#
# Shorthand for #clean_text called without arguments.
#
# @return [Object] whatever #clean_text returns
def clean
  clean_text
end
#clean_text(text = nil, enc = nil) ⇒ Object
Transliterates text to ASCII and removes unknown characters.
171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 |
# File 'lib/httpage/httpage.rb', line 171 def clean_text(text=nil, enc=nil) text ||= self.body enc ||= self.encoding page = Iconv.iconv('UTF-8//IGNORE', enc, text).join page = strip_html(page) page.gsub!(/['`]/m, '_amp__') page = Iconv.iconv('ASCII//TRANSLIT//IGNORE', 'UTF-8', page).join.downcase page.tr!(".!?", ' ') page.gsub!(/[^\x00-\x7F]+/, '') page.gsub!(/[^a-z0-9\-_\[\]\(\)\*\=\@\#\$\%\^\&\{\}\:\;\,\<\>\+\s\n\.\!\?]+/im, '') page.gsub!('_amp__',"'") page.squeeze!(" \n") page.gsub!(/^\s?\n\s?$/m, '') page.gsub!(/\n\s/,"\n") page.gsub!(/\s\n/,"\n") page.gsub!(/^\s+/,'') page.gsub!(/(^|\s)\'+(.*?)\'+(\s|$)/m,'\1\2\3') page.gsub!(/(^|\s)\'+(\s|$)/, '') page.squeeze!("\n ") return page end |
#clean_words(text = nil, enc = nil) ⇒ Object
Transliterates text to ASCII and removes unknown characters leaving just words.
197 198 199 200 201 |
# File 'lib/httpage/httpage.rb', line 197
#
# Transliterates text to ASCII and removes unknown characters leaving just
# words: the output of #clean_text with every run of characters other than
# letters and digits turned into a single space.
#
# NOTE(review): %r{[.*?]} is a character class (it matches '.', '*' or '?'),
# not a "bracketed span" pattern - confirm that is what was intended.
#
# @param text [String, nil] forwarded to #clean_text
# @param enc [String, nil] forwarded to #clean_text
# @return [String] space-separated alphanumeric words
def clean_words(text = nil, enc = nil)
  cleaned = clean_text(text, enc)
  without_marks = cleaned.gsub(%r{[.*?]}mi, ' ')
  without_marks.gsub(/[^a-z0-9]+/im, ' ')
end
#reset_buffers ⇒ Object
Resets encoding and response buffers.
26 27 28 29 |
# File 'lib/httpage/httpage.rb', line 26
#
# Resets encoding and response buffers by clearing @encoding and @response.
#
# @return [nil]
def reset_buffers
  @encoding = @response = nil
end
#response ⇒ Object
Fetches document using HTTP and returns response object. It also sets encoding.
97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
# File 'lib/httpage/httpage.rb', line 97
#
# Fetches document using HTTP and returns response object. It also sets
# encoding.
#
# The successful response is memoized in @response, so later calls do not
# touch the network. Redirects are followed with a fresh GET request. Every
# retriable (3xx) answer costs one unit of the connection budget
# (@conn_retry); when it carries a Location header it also costs one unit of
# the redirect budget (@redir_retry). The walk stops as soon as either budget
# drops below zero.
#
# @return [Net::HTTPOK, nil] the 200 response, or nil when the scheme is
#   neither http nor https, the request raises or times out, the server
#   answers with anything other than a 200 or a redirect, the Location
#   header cannot be resolved, or a retry budget runs out
def response
  return @response unless @response.nil?

  url = @url
  http_req = @http_req
  redir_retry = @redir_retry
  conn_retry = @conn_retry
  res = nil

  loop do
    begin
      res = Timeout.timeout(@timeout) do
        case url.scheme.downcase.to_sym
        when :http
          Net::HTTP.start(url.host, url.port) { |http| http.request(http_req) }
        when :https
          https = Net::HTTP.new(url.host, url.port)
          https.use_ssl = true
          # NOTE(review): certificate verification is switched off, so the
          # peer's identity is never checked. Kept as-is because pages with
          # broken certificates currently load; consider VERIFY_PEER.
          https.verify_mode = OpenSSL::SSL::VERIFY_NONE
          https.start { |http| http.request(http_req) }
        else
          return nil
        end
      end
      # Raises for every non-2xx status; 3xx lands in the first rescue below.
      res.value
    rescue Net::HTTPRetriableError
      conn_retry -= 1
      location = res['location']
      unless location.nil? || location.empty?
        begin
          # Resolve against the current URL so relative Location values work.
          url = url.merge(location)
        rescue URI::Error
          return nil
        end
        # Redirects to non-HTTP schemes cannot be requested here.
        return nil unless url.respond_to?(:request_uri)

        # request_uri keeps the query string and is never empty.
        http_req = Net::HTTP::Get.new(url.request_uri)
        redir_retry -= 1
      end
    rescue
      return nil
    end

    if res.kind_of?(Net::HTTPOK)
      @response = res
      @encoding = get_page_encoding
      return res
    end

    # A 2xx other than 200 (e.g. 204) neither raises nor counts as found and
    # consumes no budget; asking again would loop forever, so give up.
    return nil if res.kind_of?(Net::HTTPSuccess)
    return nil if redir_retry < 0 || conn_retry < 0
  end
end
#strip_html(text = nil) ⇒ Object
Strips HTML tags from document.
156 157 158 159 160 161 162 163 164 165 166 167 |
# File 'lib/httpage/httpage.rb', line 156
#
# Strips HTML tags from document and decodes HTML entities in what is left.
#
# @param text [String, nil] markup to process; falls back to #body when nil
# @return [Object] the result of HTMLEntities#decode on the stripped text
def strip_html(text = nil)
  text ||= body
  coder = HTMLEntities.new

  # Tabs become spaces, carriage returns are deleted.
  markup = text.tr("\t", ' ').tr("\r", '')
  # Unwrap the first <body>...</body> pair, keeping its content in place.
  markup = markup.sub(%r{<body.*?>(.*?)</body>}mi, '\1')
  # Scripts, style sheets and comments are replaced by a single space each.
  markup = markup.gsub(%r{<script.*?>(.*?)</script>}mi, ' ')
  markup = markup.gsub(%r{<style.*?>(.*?)</style>}mi, ' ')
  markup = markup.gsub(%r{<!--.*?-->}mi, ' ')
  # Line breaks and bare <p> tags turn into newlines before the tag sweep.
  markup = markup.gsub(/<br\s*\/?>|<p>/mi, "\n")
  markup = markup.gsub(/<.*?>/m, '')

  coder.decode(markup)
end