Class: HTTPage

Inherits:
Object
  • Object
show all
Includes:
BufferAffects
Defined in:
lib/httpage/httpage.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, redir_retry = 5, conn_retry = 8, timeout = 40) ⇒ HTTPage

Returns a new instance of HTTPage.



14
15
16
17
18
19
20
21
22
23
24
# File 'lib/httpage/httpage.rb', line 14

# Creates a page object whose cached values are all empty.
#
# @param url         handed to the +url=+ writer (defined elsewhere in the class)
# @param redir_retry stored in @redir_retry (default 5)
# @param conn_retry  stored in @conn_retry (default 8)
# @param timeout     stored in @timeout (default 40)
def initialize(url,redir_retry=5,conn_retry=8,timeout=40)
  # Nothing has been fetched yet, so everything derived from a fetch is nil.
  @encoding, @content_type, @response, @http_req = nil, nil, nil, nil
  @redir_retry, @conn_retry, @timeout = redir_retry, conn_retry, timeout
  @real_url = nil
  # Assigned last and through the writer, not by setting @url directly.
  self.url = url
end

Instance Attribute Details

#conn_retry ⇒ Object

Returns the value of attribute conn_retry.



11
12
13
# File 'lib/httpage/httpage.rb', line 11

# Reader for the conn_retry attribute.
#
# @return the value currently held in @conn_retry
def conn_retry
  return @conn_retry
end

#encoding ⇒ Object

Returns page encoding.



43
44
45
46
# File 'lib/httpage/httpage.rb', line 43

# Returns the page encoding, looking it up on first use.
#
# While @encoding is nil, get_page_info is called and its result is unpacked
# into both @encoding and @content_type; afterwards the cached value is
# returned without another lookup.
def encoding
  if @encoding.nil?
    @encoding, @content_type = get_page_info
  end
  @encoding
end

#redir_retry ⇒ Object

Returns the value of attribute redir_retry.



11
12
13
# File 'lib/httpage/httpage.rb', line 11

# Reader for the redir_retry attribute.
#
# @return the value currently held in @redir_retry
def redir_retry
  return @redir_retry
end

#timeout ⇒ Object

Returns the value of attribute timeout.



11
12
13
# File 'lib/httpage/httpage.rb', line 11

# Reader for the timeout attribute.
#
# @return the value currently held in @timeout
def timeout
  return @timeout
end

#url ⇒ Object

Returns the value of attribute url.



11
12
13
# File 'lib/httpage/httpage.rb', line 11

# Reader for the url attribute.
#
# @return the value currently held in @url
def url
  return @url
end

Instance Method Details

#body ⇒ Object

Returns document body.



179
180
181
182
# File 'lib/httpage/httpage.rb', line 179

# Returns the body of the fetched document.
#
# @return whatever +response.body+ yields, or nil when the value returned by
#   +response+ does not respond to +body+ (for example when it is nil)
def body
  page = response
  return nil unless page.respond_to?(:body)
  page.body
end

#clean ⇒ Object



228
# File 'lib/httpage/httpage.rb', line 228

# Shorthand for clean_text called without arguments.
#
# @return the result of +clean_text+
def clean
  clean_text
end

#clean_text(text = nil, enc = nil) ⇒ Object

Transliterates text to ASCII and removes unknown characters.



206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
# File 'lib/httpage/httpage.rb', line 206

# Turns text into lower-case ASCII plain text: it is re-encoded as UTF-8,
# stripped of HTML with strip_html, transliterated to ASCII, filtered through
# a character whitelist and finally tidied (whitespace, stray apostrophes).
#
# @param text input to clean; falls back to self.body when nil
# @param enc  source encoding name passed to Iconv; falls back to
#   self.encoding when nil
# @return [String] the cleaned text
def clean_text(text=nil, enc=nil)
  text ||= self.body
  enc ||= self.encoding
  # Normalise the input to UTF-8 before any pattern matching.
  page = Iconv.iconv('UTF-8//IGNORE', enc, text).join
  page = strip_html(page)
  # Park apostrophes and backticks behind a placeholder made only of
  # characters that survive the whitelist below (which has no apostrophe).
  page.gsub!(/['`]/m, '_amp__')
  page = Iconv.iconv('ASCII//TRANSLIT//IGNORE', 'UTF-8', page).join.downcase
  # Sentence punctuation becomes a space.
  page.tr!(".!?", ' ')
  # Drop anything outside the 7-bit range that transliteration left behind.
  page.gsub!(/[^\x00-\x7F]+/, '')
  # Whitelist filter: delete every character that is not a letter, digit,
  # whitespace or one of the listed punctuation marks.
  page.gsub!(/[^a-z0-9\-_\[\]\(\)\*\=\@\#\$\%\^\&\{\}\:\;\,\<\>\+\s\n\.\!\?]+/im, '')
  # Bring the parked apostrophes back (backticks come back as apostrophes too).
  page.gsub!('_amp__',"'")
  # Collapse runs of spaces and runs of newlines.
  page.squeeze!(" \n")
  # Remove newlines that make up blank (or single-whitespace) lines.
  page.gsub!(/^\s?\n\s?$/m, '')
  # Trim one whitespace character after and before each newline.
  page.gsub!(/\n\s/,"\n")
  page.gsub!(/\s\n/,"\n")
  # Strip leading whitespace from every line.
  page.gsub!(/^\s+/,'')
  # Remove apostrophes that wrap a word or phrase, keeping the text between them.
  page.gsub!(/(^|\s)\'+(.*?)\'+(\s|$)/m,'\1\2\3')
  # Remove free-standing runs of apostrophes; the whitespace matched on
  # either side is removed with them.
  page.gsub!(/(^|\s)\'+(\s|$)/, '')
  # The steps above can leave new runs of spaces/newlines, so squeeze again.
  page.squeeze!("\n ")
  return page
end

#clean_words(text = nil, enc = nil) ⇒ Object

Transliterates text to ASCII and removes unknown characters leaving just words.



232
233
234
235
236
# File 'lib/httpage/httpage.rb', line 232

# Reduces text to bare words: runs it through clean_text and then replaces
# every run of characters other than ASCII letters and digits with a single
# space.
#
# @param text forwarded to clean_text
# @param enc  forwarded to clean_text
# @return [String] words separated by single spaces
def clean_words(text=nil, enc=nil)
  # A separate pass replacing '.', '*' and '?' (the character class
  # %r{[.*?]}) used to run first. It was redundant: those characters are
  # non-alphanumeric, so the substitution below already turns them, together
  # with their neighbours, into one space. The output is unchanged.
  # NOTE(review): if that pattern was meant to be \[.*?\] (drop bracketed
  # text), that intent was never implemented - confirm with the author.
  clean_text(text, enc).gsub(/[^a-z0-9]+/im, ' ')
end

#content_type ⇒ Object

Returns page content-type.



50
51
52
53
# File 'lib/httpage/httpage.rb', line 50

# Returns the page content type, looking it up on first use.
#
# While @content_type is nil, get_page_info is called and its result is
# unpacked into both @encoding and @content_type; afterwards the cached value
# is returned without another lookup.
def content_type
  if @content_type.nil?
    @encoding, @content_type = get_page_info
  end
  @content_type
end

#real_url ⇒ Object



184
185
186
187
# File 'lib/httpage/httpage.rb', line 184

# Returns the value of @real_url after making sure a response is available.
#
# +response+ is always called first; when it yields nil the result is nil,
# otherwise whatever is stored in @real_url is returned.
def real_url
  response.nil? ? nil : @real_url
end

#reset_buffers ⇒ Object

Resets encoding and response buffers.



28
29
30
31
# File 'lib/httpage/httpage.rb', line 28

# Resets the cached encoding, content type and response.
#
# @content_type has to be cleared together with @encoding: the content_type
# reader only looks the value up again while @content_type is nil, so leaving
# it set would keep returning the value cached before the reset.
#
# @return [nil]
def reset_buffers
  @encoding     = nil
  @content_type = nil
  @response     = nil
end

#response ⇒ Object

Fetches the document over HTTP or HTTPS and returns the response object. On success it also sets the encoding and content type.



125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# File 'lib/httpage/httpage.rb', line 125

# Fetches the document over HTTP or HTTPS and returns the response object.
#
# A previously fetched response is returned straight from @response. On a
# fresh, successful fetch (200 OK) the method stores the response in
# @response, the final URL in @real_url, and refreshes @encoding and
# @content_type through get_page_info.
#
# Retriable (3xx) responses decrement the connection budget taken from
# @conn_retry; those carrying a Location header are followed and also
# decrement the redirect budget taken from @redir_retry. The Location value is
# resolved against the current URL, so relative redirects work, and the
# follow-up request keeps the query string.
#
# @return the 200 OK response object, or nil when the scheme is not
#   http/https, the request fails or times out, the server answers with
#   anything other than 200 after redirects, or a retry budget runs out
def response
  return @response unless @response.nil?

  url         = @url
  http_req    = @http_req
  redir_retry = @redir_retry
  conn_retry  = @conn_retry
  response    = nil
  found       = false
  @real_url   = nil

  loop do
    begin
      Timeout::timeout(@timeout) do
        case url.scheme.downcase.to_sym
        when :http
          response = Net::HTTP.start(url.host, url.port) { |http| http.request(http_req) }
        when :https
          https             = Net::HTTP.new(url.host, url.port)
          https.use_ssl     = true
          # NOTE(review): certificate verification is switched off, so the
          # peer's identity is not checked. Left as is because callers may
          # rely on it; consider VERIFY_PEER.
          https.verify_mode = OpenSSL::SSL::VERIFY_NONE
          response = https.start { |http| http.request(http_req) }
        else
          return nil
        end
      end
      # Raises unless the status is 2xx; 3xx raises Net::HTTPRetriableError.
      response.value
      # Reaching this line means a 2xx answer. Only 200 counts as found; any
      # other 2xx ends the loop too, because repeating the identical request
      # cannot change the outcome and no retry counter would ever run down.
      found = response.kind_of?(Net::HTTPOK)
      break
    rescue Net::HTTPRetriableError
      conn_retry -= 1
      location = response.respond_to?(:[]) ? response['location'] : nil
      unless location.nil? || location.empty?
        begin
          # Resolve against the current URL so "/path"-style values work.
          url      = URI.join(url.to_s, location)
          # request_uri keeps the query string and yields "/" for an empty path.
          http_req = Net::HTTP::Get.new(url.request_uri)
        rescue StandardError
          # Unusable redirect target: fail like any other request error
          # instead of letting the exception escape from this handler.
          return nil
        end
        redir_retry -= 1
      end
    rescue
      return nil
    end
    break if redir_retry < 0 || conn_retry < 0
  end

  return nil unless found

  @real_url = url
  @response = response
  @encoding, @content_type = get_page_info
  response
end

#strip_html(text = nil) ⇒ Object

Strips HTML tags from document.



191
192
193
194
195
196
197
198
199
200
201
202
# File 'lib/httpage/httpage.rb', line 191

# Strips HTML markup from a document and decodes its entities.
#
# @param text HTML to process; falls back to +body+ when nil
# @return the text with tags removed, as returned by HTMLEntities#decode
def strip_html(text=nil)
  markup = text || body
  decoder = HTMLEntities.new

  # Tabs become spaces, carriage returns disappear.
  markup = markup.tr("\t", ' ').tr("\r", '')
  # Unwrap the first <body>...</body> pair, keeping what is inside it.
  markup = markup.sub(%r{<body.*?>(.*?)</body>}mi, '\1')
  # Scripts, style sheets and comments are replaced by a single space each.
  [
    %r{<script.*?>(.*?)</script>}mi,
    %r{<style.*?>(.*?)</style>}mi,
    %r{<!--.*?-->}mi
  ].each do |noise|
    markup = markup.gsub(noise, ' ')
  end
  # Line breaks and bare <p> tags turn into newlines.
  markup = markup.gsub(/<br\s*\/?>|<p>/mi, "\n")
  # Whatever tags are left are removed outright.
  markup = markup.gsub(/<.*?>/m, '')

  decoder.decode(markup)
end