Class: HTSucker

Inherits:
Object
  • Object
show all
Includes:
BufferAffects, DomainsToLanguages
Defined in:
lib/htsucker/htsucker.rb

Constant Summary collapse

DefaultOpts =

Template of built-in default option values. The class method HTSucker.default_options uses it to initialise the class variable @@default_options, which instances in turn use to fill in any option not supplied when a new object is created.

# Built-in defaults for every recognised option (the hash is frozen, so it is
# never modified in place).
# - :redir_retry, :conn_retry -- copied by #response into local counters that
#   it decrements on retriable HTTP errors / redirects; its request loop stops
#   once either counter drops below zero.
# - :max_length -- byte limit (524288 = 512 * 1024) that #response compares
#   with the Content-Length header and passes to +read+.
# - :total_retry, :read_timeout, :total_timeout, :allow_strange_ports -- not
#   referenced by the code visible here; presumably a retry count, timeouts in
#   seconds and a port-validation switch -- TODO confirm against the full source.
{ :redir_retry          => 5,
:conn_retry           => 8,
:total_retry          => 2,
:read_timeout         => 15,
:total_timeout        => 30,
:allow_strange_ports  => false,
:max_length           => 524288 }.freeze

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, options = nil) ⇒ HTSucker

Creates a new instance of HTSucker. The url parameter should be a valid URI object or a string. To override the defaults, pass a hash containing only the options you want to change.



29
30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/htsucker/htsucker.rb', line 29

# Builds a new HTSucker for +url+.
#
# Starts from a copy of the class-level defaults (self.class.default_options),
# overlays +options+ when it is hash-like (responds to +keys+) and stores every
# resulting option in an instance variable of the same name. An +options+
# value that does not respond to +keys+ (nil included) is ignored.
#
# Raises ArgumentError ("unknown options: ...") when +options+ contains keys
# that are not present in the defaults.
def initialize(url, options=nil)
  settings = self.class.default_options.dup
  if options.respond_to?(:keys)
    extras = (options.keys - settings.keys).join(', ')
    raise ArgumentError, "unknown options: #{extras}" unless extras.empty?
    settings.merge!(options)
  end
  settings.each_pair { |name, value| instance_variable_set("@#{name}", value) }
  reset_buffers
  @http_req = nil
  # goes through the url= writer rather than assigning @url directly
  self.url  = url
end

Instance Attribute Details

#urlObject

Returns the value of attribute url.



12
13
14
# File 'lib/htsucker/htsucker.rb', line 12

# Reader for the +url+ attribute: returns whatever is stored in @url.
def url; @url end

Class Method Details

.default_options(opts = nil) ⇒ Object

Use this class method to set up the default options used when creating new objects. Any option you omit is taken from the constant hash DefaultOpts. The default options hash is stored in @@default_options. When called without a parameter, this method returns the current default options.



452
453
454
455
456
457
458
459
460
461
462
463
464
465
# File 'lib/htsucker/htsucker.rb', line 452

# Gets or sets the class-wide default options.
#
# Called without an argument (or with nil) it returns the current defaults,
# frozen. Called with a hash-like object (anything responding to +keys+) it
# merges those values over the current defaults and returns the updated,
# frozen hash. The defaults are seeded from DefaultOpts on first use and are
# kept in @@default_options.
#
# Raises ArgumentError when +opts+ names an option that is absent from
# DefaultOpts ("unknown options: ...") or when +opts+ is not hash-like
# ("malformed options").
def self.default_options(opts=nil)
  @@default_options ||= DefaultOpts.dup
  return @@default_options.freeze if opts.nil?
  raise ArgumentError.new("malformed options") unless opts.respond_to?(:keys)

  unknown = (opts.keys - DefaultOpts.keys).join(', ')
  raise ArgumentError.new("unknown options: #{unknown}") unless unknown.empty?

  # The stored hash is frozen every time it is handed out, and a frozen Ruby
  # object cannot be unfrozen (core Hash has no #unfreeze), so build a new
  # merged hash instead of mutating the old one in place.
  @@default_options = @@default_options.merge(opts).freeze
end

Instance Method Details

#bodyObject

Returns document body.



371
372
373
374
# File 'lib/htsucker/htsucker.rb', line 371

# Returns the body of the fetched document. Yields nil when the value returned
# by #response (which may itself be nil) does not respond to +body+.
def body
  reply = self.response
  return nil unless reply.respond_to?(:body)
  reply.body
end

#charsetObject

Returns page charset.



104
105
106
107
# File 'lib/htsucker/htsucker.rb', line 104

# Returns the page charset. When @charset is still nil, get_page_info is
# called first and its two results are stored in @content_type and @charset.
def charset
  if @charset.nil?
    @content_type, @charset = get_page_info
  end
  @charset
end

#cleanObject



428
# File 'lib/htsucker/htsucker.rb', line 428

# Shorthand for #clean_text called without arguments.
def clean
  clean_text
end

#clean_text(text = nil, enc = nil) ⇒ Object

Transliterates text to ASCII and removes unknown characters.



405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
# File 'lib/htsucker/htsucker.rb', line 405

# Converts +text+ (default: the document body) from encoding +enc+ (default:
# the page charset) into lower-case ASCII plain text: the input is re-encoded
# to UTF-8, passed through #strip_html, transliterated to ASCII, and then its
# punctuation and whitespace are normalised by the substitutions below.
# Returns the cleaned string.
def clean_text(text=nil, enc=nil)
  text            ||= self.body
  enc             ||= self.charset
  # the ASCII transliterator is created once and cached on the instance
  @transliterator ||= Iconv.new('ASCII//TRANSLIT//IGNORE', 'UTF-8')
  # re-encode the input from +enc+ to UTF-8
  page = Iconv.iconv('UTF-8//IGNORE', enc, text).join
  page = strip_html(page)
  # protect apostrophes and backticks with a placeholder built only from
  # characters that survive the filters below
  page.gsub!(/['`]/m, '_amp__')
  page = @transliterator.conv(page).downcase
  # '.', '!' and '?' each become a space
  page.tr!(".!?", ' ')
  # drop anything that is still outside 7-bit ASCII
  page.gsub!(/[^\x00-\x7F]+/, '')
  # drop every character that is not on the allow-list
  page.gsub!(/[^a-z0-9\-_\[\]\(\)\*\=\@\#\$\%\^\&\{\}\:\;\,\<\>\+\s\n\.\!\?]+/im, '')
  # restore the protected characters (backticks come back as apostrophes too)
  page.gsub!('_amp__',"'")
  # collapse runs of spaces and runs of newlines
  page.squeeze!(" \n")
  # whitespace tidy-up around line breaks; appears intended to drop blank lines
  # and leading/trailing blanks on each line
  page.gsub!(/^\s?\n\s?$/m, '')
  page.gsub!(/\n\s/,"\n")
  page.gsub!(/\s\n/,"\n")
  page.gsub!(/^\s+/,'')
  # remove apostrophes that wrap a span of text, keeping the span itself
  page.gsub!(/(^|\s)\'+(.*?)\'+(\s|$)/m,'\1\2\3')
  # remove stand-alone apostrophes together with the whitespace matched around them
  page.gsub!(/(^|\s)\'+(\s|$)/, '')
  page.squeeze!("\n ")
  return page
end

#clean_words(text = nil, enc = nil) ⇒ Object

Transliterates text to ASCII and removes unknown characters leaving just words.



432
433
434
435
436
437
438
439
# File 'lib/htsucker/htsucker.rb', line 432

# Runs #clean_text on +text+ / +enc+ and then reduces the result to bare words:
# bracketed URL-like spans and bracketed digit/punctuation spans are blanked,
# every run of non-alphanumeric characters becomes a single space, and
# repeated spaces are squeezed. Returns the resulting string.
def clean_words(text=nil, enc=nil)
  result = clean_text(text, enc)
  [
    /\[\s*?[^\:]+?\:\/+?.*?\]/mi,
    /\[\s*?(\d|\s|[^\w])+\]/mi,
    /[^a-z0-9]+/im
  ].each { |junk| result.gsub!(junk, ' ') }
  result.squeeze!(' ')
  result
end

#content_charsetObject



109
# File 'lib/htsucker/htsucker.rb', line 109

# Alias-style reader: returns the same value as #charset.
def content_charset
  charset
end

#content_charset=(x) ⇒ Object



110
# File 'lib/htsucker/htsucker.rb', line 110

# Sets the page charset (the value later returned by #charset and
# #content_charset).
#
# A bare `charset=(x)` is parsed by Ruby as an assignment to a local variable
# named +charset+, so it never reached any writer and the call had no effect.
# The value is now handed to a +charset=+ writer when the object has one, and
# is otherwise stored directly in @charset, the variable #charset reads.
def content_charset=(x)
  if respond_to?(:charset=, true)
    self.charset = x
  else
    @charset = x
  end
end

#content_language(default_content_lanuage = 'en') ⇒ Object

Returns content-language or default content language.



169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
# File 'lib/htsucker/htsucker.rb', line 169

# Returns the content language of the document, or +default_content_lanuage+
# when none can be determined.
#
# Sources are tried in this order, stopping at the first non-empty result:
# 1. <meta http-equiv="content-language" ...> in a text body,
# 2. the lang / xml:lang attribute of the <html> tag, then of the <body> tag,
# 3. the server's content-language header; when that is empty or starts with
#    'en' the language suggested by domain_to_spoken is preferred, and the
#    header value is kept only if the domain gives nothing,
# 4. +default_content_lanuage+.
#
# When there is no response at all, the domain-based guess is returned if it
# is non-empty (it used to be computed and then thrown away), otherwise the
# default.
def content_language(default_content_lanuage='en')
  if self.response.nil?
    guessed = domain_to_spoken
    return guessed.to_s.empty? ? default_content_lanuage : guessed
  end

  clang = nil
  doc   = self.body

  # steps 1 and 2 only make sense for a non-empty text document
  if !doc.to_s.empty? && self.content_type_major == :text
    # try meta-tag header
    header = doc.scan(/<meta http-equiv\s*=\s*['"]*content-language['"]*\s*content\s*=\s*['"]*\s*(.*?)\s*['"]*\s*\/?>/i).flatten.first
    clang  = extract_content_language(header)

    # try lang and xml:lang attribute from HTML tag and do the same for body tag
    if clang.to_s.empty?
      # (?:.*?\s)? lets the attribute be the first one in the tag, e.g.
      # <html lang="de">; the previous patterns demanded two whitespace runs
      # before the attribute name and so missed that common form.
      attribute_patterns = [
        /<x?html\s(?:.*?\s)?lang\s*?=["']*([^"']+).*?\/?>/i,
        /<x?html\s(?:.*?\s)?xml:lang\s*?=["']*([^"']+).*?\/?>/i,
        /<body\s(?:.*?\s)?lang\s*?=["']*([^"']+).*?\/?>/i,
        /<body\s(?:.*?\s)?xml:lang\s*?=["']*([^"']+).*?\/?>/i
      ]
      header = nil
      attribute_patterns.each do |pattern|
        header = doc.scan(pattern).flatten.first
        break unless header.to_s.empty?
      end
      clang = extract_content_language(header)
    end
  end

  # try server header and in case of 'en' or empty try to figure language by looking at top-domain
  if clang.to_s.empty? && response.respond_to?(:header)
    header  = response.header['content-language']
    clang   = extract_content_language(header)
    present = clang.to_s
    clang   = domain_to_spoken if present.empty? || present[0..1] == 'en'
    clang   = present if clang.to_s.empty? && !present.empty?
  end

  # try default
  clang.to_s.empty? ? default_content_lanuage : clang
end

#content_typeObject

Returns page content-type.



114
115
116
117
# File 'lib/htsucker/htsucker.rb', line 114

# Returns the page content-type. When @content_type is still nil,
# get_page_info is called first and its two results are stored in
# @content_type and @charset.
def content_type
  if @content_type.nil?
    @content_type, @charset = get_page_info
  end
  @content_type
end

#content_type_majorObject

Returns major name of the content-type or nil if something went wrong.



121
122
123
124
125
126
127
# File 'lib/htsucker/htsucker.rb', line 121

# Returns the part of the content-type before the first '/' as a symbol
# (e.g. :text), or nil when the content-type or that part is empty.
def content_type_major
  major = self.content_type.to_s.split('/').first
  major.to_s.empty? ? nil : major.to_sym
end

#content_type_minorObject

Returns minor name of the content-type or nil if something went wrong.



131
132
133
134
135
136
137
# File 'lib/htsucker/htsucker.rb', line 131

# Returns the part of the content-type between the first and second '/' as a
# symbol (e.g. :html), or nil when the content-type or that part is empty.
def content_type_minor
  minor = self.content_type.to_s.split('/')[1]
  minor.to_s.empty? ? nil : minor.to_sym
end

#domainObject

Returns top-level domain for URL.



68
69
70
# File 'lib/htsucker/htsucker.rb', line 68

# Returns the last dot-separated label of the URL's host, lower-cased, as a
# symbol (e.g. :pl for "www.example.pl").
def domain
  labels = self.url.host.split('.')
  labels.last.downcase.to_sym
end

#fetch(*args) ⇒ Object

Alias for body.



378
# File 'lib/htsucker/htsucker.rb', line 378

# Alias for #body: forwards every argument it receives to #body and returns
# its result.
def fetch(*args)
  body(*args)
end

#hostObject

Returns hostname.



85
# File 'lib/htsucker/htsucker.rb', line 85

# Returns the hostname of #url.
def host
  url.host
end

#langObject



219
# File 'lib/htsucker/htsucker.rb', line 219

# Short alias for #content_language (called with its default argument).
def lang
  content_language
end

#languageObject



218
# File 'lib/htsucker/htsucker.rb', line 218

# Alias for #content_language (called with its default argument).
def language
  content_language
end

#pathObject

Returns resource path.



79
# File 'lib/htsucker/htsucker.rb', line 79

# Returns the resource path of #url.
def path
  url.path
end

#portObject

Returns used port.



91
# File 'lib/htsucker/htsucker.rb', line 91

# Returns the port of #url.
def port
  url.port
end

#protocolObject

Returns protocol.



97
# File 'lib/htsucker/htsucker.rb', line 97

# Returns the protocol as a symbol, derived from the class of #url: the last
# segment of the class name, lower-cased (URI::HTTPS gives :https).
def protocol
  class_leaf = url.class.name.split('::').last
  class_leaf.downcase.to_sym
end

#real_domainObject

Returns top-level domain for real URL.



74
75
76
# File 'lib/htsucker/htsucker.rb', line 74

# Returns the last dot-separated label of the real URL's host, lower-cased,
# as a symbol.
def real_domain
  labels = self.real_url.host.split('.')
  labels.last.downcase.to_sym
end

#real_hostObject

Returns real hostname.



88
# File 'lib/htsucker/htsucker.rb', line 88

# Returns the hostname of #real_url.
def real_host
  real_url.host
end

#real_pathObject

Returns real resource path.



82
# File 'lib/htsucker/htsucker.rb', line 82

# Returns the resource path of #real_url.
def real_path
  real_url.path
end

#real_portObject

Returns real port.



94
# File 'lib/htsucker/htsucker.rb', line 94

# Returns the port of #real_url.
def real_port
  real_url.port
end

#real_protocolObject

Returns real protocol.



100
# File 'lib/htsucker/htsucker.rb', line 100

# Returns the real protocol as a symbol, derived from the class of #real_url:
# the last segment of the class name, lower-cased.
def real_protocol
  class_leaf = real_url.class.name.split('::').last
  class_leaf.downcase.to_sym
end

#real_urlObject

Returns URL used while obtaining content (e.g. after redirection).



382
383
384
385
# File 'lib/htsucker/htsucker.rb', line 382

# Returns the URL the content was actually obtained from (e.g. after
# redirection). #response is consulted first; when it is nil the result is
# nil, otherwise the stored @real_url is returned.
def real_url
  self.response.nil? ? nil : @real_url
end

#reset_buffersObject

Resets charset and response buffers.



46
47
48
49
50
51
52
# File 'lib/htsucker/htsucker.rb', line 46

# Clears the cached charset, content-type, response and real URL (all set to
# nil) and resets the overflow counter to 0. Returns nil.
def reset_buffers
  @overflow = 0
  @charset = @content_type = @response = @real_url = nil
end

#responseObject

Fetches document using HTTP and returns response object. It also sets charset.



308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
# File 'lib/htsucker/htsucker.rb', line 308

# Fetches the document over HTTP(S) and returns the response object.
#
# A previously stored @response is returned immediately. Otherwise the request
# held in @http_req is sent to @url, following Location headers on retriable
# HTTP errors, until a Net::HTTPOK arrives or one of the retry counters
# (seeded from @redir_retry / @conn_retry) drops below zero.
#
# On success it stores @real_url, @response, @content_length, @content_type and
# @charset and returns the response. It returns nil for an unsupported URL
# scheme, for any error caught by the bare rescue, and when no HTTPOK was
# obtained. Raises HTSuckerContentTooBig when the Content-Length header
# exceeds @max_length.
def response
  return @response unless @response.nil?
  url         = @url
  found       = false
  response    = nil
  @real_url   = nil
  http_req    = @http_req
  redir_retry = @redir_retry
  conn_retry  = @conn_retry
  
  until found do
    begin
      # NOTE(review): the option keys visible in DefaultOpts are :read_timeout
      # and :total_timeout; nothing shown here sets @timeout. If it is nil,
      # Timeout::timeout applies no limit -- confirm where @timeout comes from.
      status = Timeout::timeout(@timeout) do
        case url.scheme.downcase.to_sym
        when :http
          response = Net::HTTP.start(url.host, url.port) { |http|  http.request(http_req) }
        when :https
          https             = Net::HTTP.new(url.host, url.port)
          https.use_ssl     = true
          # NOTE(review): certificate verification is switched off here
          https.verify_mode = OpenSSL::SSL::VERIFY_NONE
          response = https.start { |http| http.request(http_req) }
        else
          # unsupported scheme: leaves the whole method with nil
          return nil
        end
      end
      # raises an HTTP error class unless the response is a 2xx success
      response.value
    rescue Net::HTTPRetriableError
      # every retriable error costs one connection retry; a redirect (Location
      # header present) additionally costs one redirect retry
      conn_retry -= 1
      if response.respond_to?(:header) && !response.header['location'].nil? && !response.header['location'].empty?
        url = URI.parse(response.header['location'])
        validate_url(url)
        # NOTE(review): the follow-up request is always a HEAD built from
        # url.path only, so a query string in the Location header is dropped
        http_req = Net::HTTP::Head.new(url.path)
        redir_retry -= 1
      end
    rescue
      # any other StandardError (including non-retriable HTTP errors) ends the fetch
      return nil
    end
    if response.kind_of?(Net::HTTPOK)
      found = true
      break
    end
    break if (redir_retry < 0 || conn_retry < 0)
  end
  if found
    @real_url = url
    # NOTE(review): @response is stored before the size check below, so after
    # HTSuckerContentTooBig has been raised once, later calls return this
    # cached response instead of raising again
    @response = response
    @content_length = response.header['content-length'].to_s.to_i
    if @content_length > @max_length
      raise HTSuckerContentTooBig.new("content length (#{@content_length}) is greater than declared limit (#{@max_length})") 
    end
    # NOTE(review): up to @max_length bytes are read from the real URL, but the
    # data returned by +read+ is not stored and +resource+ is never closed
    # explicitly -- confirm whether this is intentional
    openuri_opts = { :redirect=>false, :read_timeout=>false }
    resource = open(@real_url.to_s, openuri_opts)
    resource.read(@max_length)
    @content_type, @charset = get_page_info(nil,nil) # using just server headers
    
    return response
  else
    return nil
  end
end

#strip_html(text = nil) ⇒ Object

Strips HTML tags from document.



389
390
391
392
393
394
395
396
397
398
399
400
401
# File 'lib/htsucker/htsucker.rb', line 389

# Strips HTML markup from +text+ (default: the document body) and returns the
# remaining text with HTML entities decoded.
#
# Tabs become spaces and carriage returns are removed; the first
# <body>...</body> element is replaced by its content; <script> and <style>
# elements and comments become a single space; <br> and <p> become newlines;
# every remaining tag is deleted.
def strip_html(text=nil)
  text    ||= self.body
  # entity decoder is created once and cached on the instance
  @coder  ||= HTMLEntities.new
  r = text.tr("\t", ' ')
  r.tr!("\r", '')
  r.sub!(%r{<body.*?>(.*?)</body>}mi, '\1')
  r.gsub!(%r{<script.*?>(.*?)</script>}mi, ' ')
  r.gsub!(%r{<style.*?>(.*?)</style>}mi, ' ')
  r.gsub!(%r{<!--.*?-->}mi, ' ')
  r.gsub!(/<br\s*\/?>|<p>/mi, "\n")
  r.gsub!(/<.*?>/m, '')
  # use the memoised decoder: the bare name +coder+ was neither a local
  # variable nor a method visible here, so the call raised NameError
  @coder.decode(r)
end

#wordsObject

Transliterates text to ASCII, removes unknown characters and returns array of words.



443
444
445
# File 'lib/htsucker/htsucker.rb', line 443

# Returns the result of #clean_words split on whitespace, as an array of words.
def words
  cleaned = self.clean_words
  cleaned.split(' ')
end