Class: HTSucker
- Inherits:
-
Object
- Object
- HTSucker
- Includes:
- BufferAffects, DomainsToLanguages
- Defined in:
- lib/htsucker/htsucker.rb
Constant Summary collapse
- DefaultOpts =
Template of default option values. The class method HTSucker.default_options uses it to initialize the class variable @@default_options, which instances in turn use to fill in any options not given when a new object is created.
{ :redir_retry => 5, :conn_retry => 8, :total_retry => 2, :read_timeout => 15, :total_timeout => 30, :allow_strange_ports => false, :max_length => 524288 }.freeze
Instance Attribute Summary collapse
-
#url ⇒ Object
Returns the value of attribute url.
Class Method Summary collapse
-
.default_options(opts = nil) ⇒ Object
Use this class method to set up default options used when creating new objects.
Instance Method Summary collapse
-
#body ⇒ Object
Returns document body.
-
#charset ⇒ Object
Returns page charset.
- #clean ⇒ Object
-
#clean_text(text = nil, enc = nil) ⇒ Object
Transliterates text to ASCII and removes unknown characters.
-
#clean_words(text = nil, enc = nil) ⇒ Object
Transliterates text to ASCII and removes unknown characters leaving just words.
- #content_charset ⇒ Object
- #content_charset=(x) ⇒ Object
-
#content_language(default_content_lanuage = 'en') ⇒ Object
Returns content-language or default content language.
-
#content_type ⇒ Object
Returns page content-type.
-
#content_type_major ⇒ Object
Returns major name of the content-type or nil if something went wrong.
-
#content_type_minor ⇒ Object
Returns minor name of the content-type or nil if something went wrong.
-
#domain ⇒ Object
Returns top-level domain for URL.
-
#fetch(*args) ⇒ Object
Alias for body.
-
#host ⇒ Object
Returns hostname.
-
#initialize(url, options = nil) ⇒ HTSucker
constructor
Creates new instance of HTSucker.
- #lang ⇒ Object
- #language ⇒ Object
-
#path ⇒ Object
Returns resource path.
-
#port ⇒ Object
Returns used port.
-
#protocol ⇒ Object
Returns protocol.
-
#real_domain ⇒ Object
Returns top-level domain for real URL.
-
#real_host ⇒ Object
Returns real hostname.
-
#real_path ⇒ Object
Returns real resource path.
-
#real_port ⇒ Object
Returns real port.
-
#real_protocol ⇒ Object
Returns real protocol.
-
#real_url ⇒ Object
Returns URL used while obtaining content (e.g. after redirection).
-
#reset_buffers ⇒ Object
Resets charset and response buffers.
-
#response ⇒ Object
Fetches document using HTTP and returns response object.
-
#strip_html(text = nil) ⇒ Object
Strips HTML tags from document.
-
#words ⇒ Object
Transliterates text to ASCII, removes unknown characters and returns array of words.
Constructor Details
#initialize(url, options = nil) ⇒ HTSucker
Creates a new instance of HTSucker. The url parameter should be a valid URI object or a string. To override the defaults, pass a hash containing the options you want to change.
29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
# File 'lib/htsucker/htsucker.rb', line 29
#
# Creates a new HTSucker instance.
#
# url     - target address; handed to the url= writer as the last step.
# options - optional hash-like object (anything responding to #keys) whose
#           entries override the class-wide defaults.
#
# Raises ArgumentError when options contains keys that are not present in
# the class defaults.
#
# NOTE(review): the rendered listing had lost the identifiers `options` and
# `default_options`; they are restored here from the method signature and the
# .default_options class method. The local name `defaults` is a
# reconstruction - confirm against lib/htsucker/htsucker.rb.
def initialize(url, options=nil)
  defaults = self.class.default_options.dup
  if options.respond_to?(:keys)
    unknown = (options.keys - defaults.keys).join(', ')
    raise ArgumentError.new("unknown options: #{unknown}") unless unknown.empty?
    defaults.merge!(options)
  end
  # every option ends up as an instance variable of the same name
  defaults.each_pair do |opt_name,opt_value|
    instance_variable_set("@#{opt_name}", opt_value)
  end
  reset_buffers
  @http_req = nil
  self.url = url
end
Instance Attribute Details
#url ⇒ Object
Returns the value of attribute url.
12 13 14 |
# File 'lib/htsucker/htsucker.rb', line 12
#
# Reader for the url attribute: hands back whatever is stored in @url.
def url
  return @url
end
Class Method Details
.default_options(opts = nil) ⇒ Object
Use this class method to set up the default options used when creating new objects. Any option you omit is taken from the constant hash DefaultOpts. The default options hash is stored in @@default_options. When called without a parameter, this method returns the current default options.
452 453 454 455 456 457 458 459 460 461 462 463 464 465 |
# File 'lib/htsucker/htsucker.rb', line 452
#
# Reads or updates the class-wide default options kept in @@default_options.
#
# opts - nil to just read the current defaults, or a hash-like object
#        (anything responding to #keys) with values to merge into them.
#
# Returns the current defaults as a frozen hash.
# Raises ArgumentError for option names missing from DefaultOpts, or when
# opts is neither nil nor hash-like.
def self.default_options(opts=nil)
  @@default_options ||= DefaultOpts.dup
  return @@default_options.freeze if opts.nil?
  raise ArgumentError.new("malformed options") unless opts.respond_to?(:keys)
  unknown = (opts.keys - DefaultOpts.keys).join(', ')
  raise ArgumentError.new("unknown options: #{unknown}") unless unknown.empty?
  # A frozen hash cannot be thawed in core Ruby, so swap in a merged, frozen copy
  # instead of mutating the existing one in place.
  @@default_options = @@default_options.merge(opts).freeze
end
Instance Method Details
#body ⇒ Object
Returns document body.
371 372 373 374 |
# File 'lib/htsucker/htsucker.rb', line 371
#
# Returns the body of the fetched response, or nil when the response object
# does not offer a #body method (for instance when it is nil).
def body
  reply = self.response
  return nil unless reply.respond_to?(:body)
  reply.body
end
#charset ⇒ Object
Returns page charset.
104 105 106 107 |
# File 'lib/htsucker/htsucker.rb', line 104
#
# Returns the page charset. While @charset is still nil, get_page_info is
# called and its pair of results is stored in @content_type and @charset.
def charset
  if @charset.nil?
    @content_type, @charset = get_page_info
  end
  @charset
end
#clean ⇒ Object
428 |
# File 'lib/htsucker/htsucker.rb', line 428
#
# Shorthand for clean_text called with its default arguments.
def clean
  clean_text
end
#clean_text(text = nil, enc = nil) ⇒ Object
Transliterates text to ASCII and removes unknown characters.
405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 |
# File 'lib/htsucker/htsucker.rb', line 405
#
# Transliterates text to ASCII and removes unknown characters.
#
# text - input to clean; falls back to the document body when nil.
# enc  - source encoding name; falls back to the page charset when nil.
#
# Returns the cleaned, lower-cased string. The substitutions below are
# order-dependent and are applied exactly in the sequence listed.
def clean_text(text=nil, enc=nil)
  text ||= self.body
  enc ||= self.charset
  @transliterator ||= Iconv.new('ASCII//TRANSLIT//IGNORE', 'UTF-8')

  txt = strip_html(Iconv.iconv('UTF-8//IGNORE', enc, text).join)

  # park apostrophes and backticks behind a placeholder while filtering
  txt.gsub!(/['`]/m, '_amp__')
  txt = @transliterator.conv(txt).downcase
  txt.tr!(".!?", ' ')

  [
    [/[^\x00-\x7F]+/, ''],
    [/[^a-z0-9\-_\[\]\(\)\*\=\@\#\$\%\^\&\{\}\:\;\,\<\>\+\s\n\.\!\?]+/im, ''],
    ['_amp__', "'"]
  ].each { |pattern, replacement| txt.gsub!(pattern, replacement) }

  txt.squeeze!(" \n")

  # whitespace tidy-up, then removal of quote marks wrapping or standing alone
  [
    [/^\s?\n\s?$/m, ''],
    [/\n\s/, "\n"],
    [/\s\n/, "\n"],
    [/^\s+/, ''],
    [/(^|\s)\'+(.*?)\'+(\s|$)/m, '\1\2\3'],
    [/(^|\s)\'+(\s|$)/, '']
  ].each { |pattern, replacement| txt.gsub!(pattern, replacement) }

  txt.squeeze!("\n ")
  txt
end
#clean_words(text = nil, enc = nil) ⇒ Object
Transliterates text to ASCII and removes unknown characters leaving just words.
432 433 434 435 436 437 438 439 |
# File 'lib/htsucker/htsucker.rb', line 432
#
# Runs clean_text and then reduces the result to plain words: three
# substitutions replace bracketed spans and every run of characters other
# than letters and digits with a space, after which repeated spaces are
# squeezed.
#
# text, enc - passed through unchanged to clean_text.
#
# Returns the resulting string.
def clean_words(text=nil, enc=nil)
  cleaned = clean_text(text, enc)
  [
    /\[\s*?[^\:]+?\:\/+?.*?\]/mi,
    /\[\s*?(\d|\s|[^\w])+\]/mi,
    /[^a-z0-9]+/im
  ].each { |pattern| cleaned.gsub!(pattern, ' ') }
  cleaned.squeeze!(' ')
  cleaned
end
#content_charset ⇒ Object
109 |
# File 'lib/htsucker/htsucker.rb', line 109
#
# Same as charset.
def content_charset
  charset
end
#content_charset=(x) ⇒ Object
110 |
# File 'lib/htsucker/htsucker.rb', line 110
#
# Writer counterpart of content_charset: stores x in @charset, the variable
# the charset reader returns when it is not nil.
#
# The previous body, `charset=(x)`, is parsed by Ruby as an assignment to a
# local variable named charset, so nothing was ever stored.
# NOTE(review): no charset= writer is visible in this listing, hence the
# direct instance-variable write - switch to `self.charset = x` if one exists.
def content_charset=(x)
  @charset = x
end
#content_language(default_content_lanuage = 'en') ⇒ Object
Returns content-language or default content language.
169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 |
# File 'lib/htsucker/htsucker.rb', line 169
#
# Returns content-language or the default content language.
#
# default_content_lanuage - value returned when nothing better is found
#                           (parameter name kept as spelled in the source).
#
# Sources are tried in this order: the content-language meta tag, lang and
# xml:lang attributes of the html and body tags, the server's
# content-language header (with a domain-based guess when that is empty or
# starts with 'en'), and finally the default.
def content_language(default_content_lanuage='en')
  if self.response.nil?
    # NOTE(review): the result of this call is discarded, exactly as in the
    # original code - possibly the domain-based guess was meant to be returned.
    domain_to_spoken
    return default_content_lanuage
  end

  clang = nil

  # try meta-tag header
  if !self.body.to_s.empty? && self.content_type_major == :text
    found = body.scan(/<meta http-equiv\s*=\s*['"]*content-language['"]*\s*content\s*=\s*['"]*\s*(.*?)\s*['"]*\s*\/?>/i)
    clang = extract_content_language(found.flatten.first)
  end

  # try lang and xml:lang attribute from HTML tag and do the same for body tag
  if clang.to_s.empty? && !self.body.to_s.empty? && self.content_type_major == :text
    attribute_patterns = [
      /<x?html\s.*?\s+?lang\s*?=["']*([^"']+).*?\/?>/i,
      /<x?html\s.*?\s+?xml:lang\s*?=["']*([^"']+).*?\/?>/i,
      /<body\s.*?\s+?lang\s*?=["']*([^"']+).*?\/?>/i,
      /<body\s.*?\s+?xml:lang\s*?=["']*([^"']+).*?\/?>/i
    ]
    header = nil
    attribute_patterns.each do |pattern|
      header = body.scan(pattern).flatten.first
      break unless header.to_s.empty?
    end
    clang = extract_content_language(header)
  end

  # try server header and in case of 'en' or empty try to figure language by looking at top-domain
  if clang.to_s.empty? && response.respond_to?(:header)
    clang = extract_content_language(response.header['content-language'])
    present = clang.to_s
    if present.empty? || present[0..1] == 'en'
      clang = domain_to_spoken
    end
    if clang.to_s.empty? && !present.empty?
      clang = present
    end
  end

  # try default
  clang.to_s.empty? ? default_content_lanuage : clang
end
#content_type ⇒ Object
Returns page content-type.
114 115 116 117 |
# File 'lib/htsucker/htsucker.rb', line 114
#
# Returns the page content-type. While @content_type is still nil,
# get_page_info is called and its pair of results is stored in @content_type
# and @charset.
def content_type
  if @content_type.nil?
    @content_type, @charset = get_page_info
  end
  @content_type
end
#content_type_major ⇒ Object
Returns major name of the content-type or nil if something went wrong.
121 122 123 124 125 126 127 |
# File 'lib/htsucker/htsucker.rb', line 121
#
# Returns the part of the content-type in front of the first '/' as a
# symbol, or nil when the content-type or that part is empty.
def content_type_major
  full = self.content_type.to_s
  return nil if full.empty?
  major = full.split('/').first
  major.to_s.empty? ? nil : major.to_sym
end
#content_type_minor ⇒ Object
Returns minor name of the content-type or nil if something went wrong.
131 132 133 134 135 136 137 |
# File 'lib/htsucker/htsucker.rb', line 131
#
# Returns the second '/'-separated part of the content-type as a symbol, or
# nil when the content-type or that part is empty or missing.
def content_type_minor
  full = self.content_type.to_s
  return nil if full.empty?
  minor = full.split('/')[1]
  minor.to_s.empty? ? nil : minor.to_sym
end
#domain ⇒ Object
Returns top-level domain for URL.
68 69 70 |
# File 'lib/htsucker/htsucker.rb', line 68
#
# Returns the last dot-separated label of the URL's host, lower-cased, as a
# symbol.
def domain
  labels = self.url.host.split('.')
  labels.last.downcase.to_sym
end
#fetch(*args) ⇒ Object
Alias for body.
378 |
# File 'lib/htsucker/htsucker.rb', line 378
#
# Alias for body; any arguments are forwarded to it.
# NOTE(review): body is declared without parameters in this listing, so a
# call with arguments would raise ArgumentError - confirm whether that is
# intended.
def fetch(*args)
  body(*args)
end
#host ⇒ Object
Returns hostname.
85 |
# File 'lib/htsucker/htsucker.rb', line 85
#
# Returns the host of the URL.
def host
  url.host
end
#lang ⇒ Object
219 |
# File 'lib/htsucker/htsucker.rb', line 219
#
# Shorthand for content_language called with its default argument.
def lang
  content_language
end
#language ⇒ Object
218 |
# File 'lib/htsucker/htsucker.rb', line 218
#
# Shorthand for content_language called with its default argument.
def language
  content_language
end
#path ⇒ Object
Returns resource path.
79 |
# File 'lib/htsucker/htsucker.rb', line 79
#
# Returns the path of the URL.
def path
  url.path
end
#port ⇒ Object
Returns used port.
91 |
# File 'lib/htsucker/htsucker.rb', line 91
#
# Returns the port of the URL.
def port
  url.port
end
#protocol ⇒ Object
Returns protocol.
97 |
# File 'lib/htsucker/htsucker.rb', line 97
#
# Returns the protocol as a symbol, derived from the last '::'-separated
# segment of the URL object's class name, lower-cased.
def protocol
  class_path = url.class.name.split('::')
  class_path.last.downcase.to_sym
end
#real_domain ⇒ Object
Returns top-level domain for real URL.
74 75 76 |
# File 'lib/htsucker/htsucker.rb', line 74
#
# Returns the last dot-separated label of the real URL's host, lower-cased,
# as a symbol.
def real_domain
  labels = self.real_url.host.split('.')
  labels.last.downcase.to_sym
end
#real_host ⇒ Object
Returns real hostname.
88 |
# File 'lib/htsucker/htsucker.rb', line 88
#
# Returns the host of the real URL.
def real_host
  real_url.host
end
#real_path ⇒ Object
Returns real resource path.
82 |
# File 'lib/htsucker/htsucker.rb', line 82
#
# Returns the path of the real URL.
def real_path
  real_url.path
end
#real_port ⇒ Object
Returns real port.
94 |
# File 'lib/htsucker/htsucker.rb', line 94
#
# Returns the port of the real URL.
def real_port
  real_url.port
end
#real_protocol ⇒ Object
Returns real protocol.
100 |
# File 'lib/htsucker/htsucker.rb', line 100
#
# Returns the real protocol as a symbol, derived from the last
# '::'-separated segment of the real URL object's class name, lower-cased.
def real_protocol
  class_path = real_url.class.name.split('::')
  class_path.last.downcase.to_sym
end
#real_url ⇒ Object
Returns URL used while obtaining content (e.g. after redirection).
382 383 384 385 |
# File 'lib/htsucker/htsucker.rb', line 382
#
# Returns the URL used while obtaining content (e.g. after redirection), or
# nil when no response is available. Asking for the response first makes
# sure @real_url has been filled in.
def real_url
  self.response.nil? ? nil : @real_url
end
#reset_buffers ⇒ Object
Resets charset and response buffers.
46 47 48 49 50 51 52 |
# File 'lib/htsucker/htsucker.rb', line 46
#
# Resets charset and response buffers: @overflow goes back to 0 while
# @charset, @content_type, @response and @real_url are set to nil.
def reset_buffers
  @overflow = 0
  @charset = @content_type = @response = @real_url = nil
end
#response ⇒ Object
Fetches document using HTTP and returns response object. It also sets charset.
308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 |
# File 'lib/htsucker/htsucker.rb', line 308
#
# Fetches document using HTTP and returns response object. It also sets
# charset.
#
# A response that was already obtained is returned from @response straight
# away. Otherwise requests are repeated, following Location headers, until a
# Net::HTTPOK arrives or one of the retry counters drops below zero.
#
# Returns the response object, or nil when the scheme is neither http nor
# https, when an error other than Net::HTTPRetriableError is rescued, or when
# the retries run out.
# Raises HTSuckerContentTooBig when the declared content-length exceeds
# @max_length.
def response
  return @response unless @response.nil?
  url = @url
  found = false
  response = nil
  @real_url = nil
  http_req = @http_req
  redir_retry = @redir_retry
  conn_retry = @conn_retry
  until found do
    begin
      # NOTE(review): @timeout is not one of the DefaultOpts keys shown in the
      # documentation (:read_timeout, :total_timeout) - confirm it is set
      # somewhere, otherwise no time limit is applied here.
      Timeout::timeout(@timeout) do
        case url.scheme.downcase.to_sym
        when :http
          response = Net::HTTP.start(url.host, url.port) { |http| http.request(http_req) }
        when :https
          https = Net::HTTP.new(url.host, url.port)
          https.use_ssl = true
          # NOTE(review): certificate verification is switched off here.
          https.verify_mode = OpenSSL::SSL::VERIFY_NONE
          response = https.start { |http| http.request(http_req) }
        else
          return nil
        end
      end
      # raises unless the response is a 2xx one
      response.value
    rescue Net::HTTPRetriableError
      conn_retry -= 1
      if response.respond_to?(:header) && !response.header['location'].nil? && !response.header['location'].empty?
        url = URI.parse(response.header['location'])
        validate_url(url)
        http_req = Net::HTTP::Head.new(url.path)
        redir_retry -= 1
      end
    rescue
      return nil
    end
    if response.kind_of?(Net::HTTPOK)
      found = true
      break
    end
    break if (redir_retry < 0 || conn_retry < 0)
  end
  if found
    @real_url = url
    @response = response
    @content_length = response.header['content-length'].to_s.to_i
    if @content_length > @max_length
      raise HTSuckerContentTooBig.new("content length (#{@content_length}) is greater than declared limit (#{@max_length})")
    end
    openuri_opts = { :redirect=>false, :read_timeout=>false }
    # block form so that the handle is closed once the read is done
    open(@real_url.to_s, openuri_opts) { |resource| resource.read(@max_length) }
    @content_type, @charset = get_page_info(nil,nil) # using just server headers
    return response
  else
    return nil
  end
end
#strip_html(text = nil) ⇒ Object
Strips HTML tags from document.
389 390 391 392 393 394 395 396 397 398 399 400 401 |
# File 'lib/htsucker/htsucker.rb', line 389
#
# Strips HTML tags from document.
#
# text - markup to process; falls back to the document body when nil.
#
# Returns the text with tags removed and entities decoded by HTMLEntities.
def strip_html(text=nil)
  text ||= self.body
  @coder ||= HTMLEntities.new
  r = text.tr("\t", ' ')
  r.tr!("\r", '')
  # keep only what is inside <body>...</body> when such a pair is present
  r.sub!(%r{<body.*?>(.*?)</body>}mi, '\1')
  r.gsub!(%r{<script.*?>(.*?)</script>}mi, ' ')
  r.gsub!(%r{<style.*?>(.*?)</style>}mi, ' ')
  r.gsub!(%r{<!--.*?-->}mi, ' ')
  r.gsub!(/<br\s*\/?>|<p>/mi, "\n")
  r.gsub!(/<.*?>/m, '')
  # use the memoised decoder; a bare `coder` is not defined in this listing
  return @coder.decode(r)
end
#words ⇒ Object
Transliterates text to ASCII, removes unknown characters and returns array of words.
443 444 445 |
# File 'lib/htsucker/htsucker.rb', line 443
#
# Transliterates text to ASCII, removes unknown characters and returns array
# of words: the output of clean_words split on whitespace.
def words
  cleaned = self.clean_words
  cleaned.split(' ')
end