Class: RHACK::Page
Overview
Frame( ScoutSquad( Curl::Multi <- Scout( Curl API ), Scout, … ) ) => Curl -> Johnson::Runtime -> XML::Document => Page( XML::Document ), Page, …
Direct Known Subclasses
Constant Summary collapse
- @@ignore =
for johnson
/google|_gat|tracker|adver/i
Instance Attribute Summary collapse
-
#body ⇒ Object
(also: #html)
readonly
Returns the value of attribute body.
-
#curl ⇒ Object
readonly
Returns the value of attribute curl.
-
#curl_res ⇒ Object
readonly
Returns the value of attribute curl_res.
-
#data ⇒ Object
(also: #hash)
readonly
Returns the value of attribute data.
-
#doc ⇒ Object
readonly
Returns the value of attribute doc.
-
#failed ⇒ Object
readonly
Returns the value of attribute failed.
-
#js ⇒ Object
readonly
Returns the value of attribute js.
-
#loc ⇒ Object
readonly
Returns the value of attribute loc.
-
#res ⇒ Object
Result of the page processing performed in the frame context.
- #title(full = true) ⇒ Object
Instance Method Summary collapse
- #at(selector_or_node, options = {}) ⇒ Object (also: #first)
-
#dict(hash) ⇒ Object
Hook for writing evenly aligned lines that define a hash (aligned for the author's Verdana 10px font), e.g.
- #empty? ⇒ Boolean
- #eval_js(frame = nil) ⇒ Object
- #eval_string(str) ⇒ Object
-
#expand_link(link) ⇒ Object
Turns a relative path found on this page into an absolute path.
-
#failed? ⇒ Boolean
override this in a subclass.
- #find(selector_or_nodes, options = {}, &foreach) ⇒ Object (also: #all)
- #flatten_dict(hash) ⇒ Object
-
#form(form = 'form', hash = {}, opts = {}) ⇒ Object
FORMS #.
- #get_link(selector_or_node = 'a', options = {}, &onfound) ⇒ Object (also: #link, #get_href)
-
#get_links(links = 'a') ⇒ Object
(also: #get_hrefs, #links)
def get_src(link=‘img’) begin link = at(link) && at(link).src if link.is String rescue LibXML::XML::Error; nil end expand_link link if link end.
- #get_src(selector_or_node = 'img', options = {}, &onfound) ⇒ Object (also: #src)
-
#get_srcs(links = 'img') ⇒ Object
(also: #srcs)
TODO: make into same form as #get_src and #map.
-
#initialize(obj = '', loc = Hash.new(''), js = is_a?(HtmlPage)&&(Johnson::Runtime.browser||Johnson::Runtime.new)) ⇒ Page
constructor
Frame calls it with no args.
- #inspect ⇒ Object
-
#load_scripts(frame) ⇒ Object
def get_link(link=‘a’) begin link = at(link) && (at(link).href || at(link+‘//a’).href) if link.is String rescue XML::Error; nil end expand_link link if link end.
- #map(selector_or_nodes, options = {}, &mapper) ⇒ Object
- #map_json(selector_or_nodes, options = {}, &mapper) ⇒ Object
-
#parse(opts = {}) ⇒ Object
Override this in a subclass. MUST return self if successful; MAY return false otherwise.
-
#process(c, opts = {}) ⇒ Object
We can then override #process in Page subclasses. Frame doesn't care about the value returned by #process.
-
#retry? ⇒ Boolean
override this in a subclass.
- #size ⇒ Object
- #submit(form, frame, hash = {}, opts = {}, &callback) ⇒ Object
-
#text(selector_or_node, options = {}) ⇒ Object
FINDERS PREPROCESSORS #.
- #texts(hash, options = {}) ⇒ Object
- #to_html ⇒ Object
- #to_xml ⇒ Object
- #url ⇒ Object (also: #href)
- #utf! ⇒ Object
Constructor Details
#initialize(obj = '', loc = Hash.new(''), js = is_a?(HtmlPage)&&(Johnson::Runtime.browser||Johnson::Runtime.new)) ⇒ Page
Frame calls it with no args
46 47 48 49 50 51 52 53 54 55 56 57 |
# File 'lib/rhack/page.rb', line 46

# Builds a page either from a finished transfer or from a raw body.
#
# Frame calls it with no args.
#
# @param obj [Object] a Curl::Easy, a Scout, or anything else, which is
#   stored as the page body
# @param loc [Object] stored as the location; anything that is not a Hash
#   is converted with +parse(:uri)+ first
# @param js [Object] stored as the javascript runtime (+@js+)
def initialize(obj='', loc=Hash.new(''), js=is_a?(HtmlPage)&&(Johnson::Runtime.browser||Johnson::Runtime.new))
  loc = loc.parse(:uri) unless loc.is(Hash)
  @js = js

  unless obj.is(Curl::Easy) || obj.kinda(Scout)
    @body = obj
    @loc = loc
    return
  end

  handle = obj.kinda(Scout) ? obj.http : obj
  # Passing plain (handle, loc) would hand #process an opts hash that
  # returns '' for any key, so a blank loc is replaced by a fresh {}.
  process(handle, loc.b || {})
end
Instance Attribute Details
#body ⇒ Object (readonly) Also known as: html
Returns the value of attribute body.
36 37 38 |
# File 'lib/rhack/page.rb', line 36

# Reader for the +@body+ instance variable (also known as #html).
#
# @return [Object] the current value of +@body+
def body
  @body
end
#curl ⇒ Object (readonly)
Returns the value of attribute curl.
36 37 38 |
# File 'lib/rhack/page.rb', line 36

# Reader for the +@curl+ instance variable.
#
# @return [Object] the current value of +@curl+
def curl
  @curl
end
#curl_res ⇒ Object (readonly)
Returns the value of attribute curl_res.
36 37 38 |
# File 'lib/rhack/page.rb', line 36

# Reader for the +@curl_res+ instance variable.
#
# @return [Object] the current value of +@curl_res+
def curl_res
  @curl_res
end
#data ⇒ Object (readonly) Also known as: hash
Returns the value of attribute data.
36 37 38 |
# File 'lib/rhack/page.rb', line 36

# Reader for the +@data+ instance variable (also known as #hash).
#
# @return [Object] the current value of +@data+
def data
  @data
end
#doc ⇒ Object (readonly)
Returns the value of attribute doc.
36 37 38 |
# File 'lib/rhack/page.rb', line 36

# Reader for the +@doc+ instance variable.
#
# @return [Object] the current value of +@doc+
def doc
  @doc
end
#failed ⇒ Object (readonly)
Returns the value of attribute failed.
36 37 38 |
# File 'lib/rhack/page.rb', line 36

# Reader for the +@failed+ instance variable.
#
# @return [Object] the current value of +@failed+
def failed
  @failed
end
#js ⇒ Object (readonly)
Returns the value of attribute js.
36 37 38 |
# File 'lib/rhack/page.rb', line 36

# Reader for the +@js+ instance variable.
#
# @return [Object] the current value of +@js+
def js
  @js
end
#loc ⇒ Object (readonly)
Returns the value of attribute loc.
36 37 38 |
# File 'lib/rhack/page.rb', line 36

# Reader for the +@loc+ instance variable.
#
# @return [Object] the current value of +@loc+
def loc
  @loc
end
#res ⇒ Object
Result of the page processing performed in the frame context.
41 42 43 |
# File 'lib/rhack/page.rb', line 41

# Reader for the +@res+ instance variable: the result of the page
# processing performed in the frame context.
#
# @return [Object] the current value of +@res+
def res
  @res
end
#title(full = true) ⇒ Object
244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 |
# File 'lib/rhack/page.rb', line 244

# Returns the page title, or the page URL when no document is available.
#
# When +@data+ is set, the request failed, or the body is blank, only
# +@loc.href+ is returned. Otherwise the title is read from the parsed
# document; if the document has no title, the URL is used instead and a
# <title> node is prepended to <head> when one exists.
#
# @param full [Boolean] true for the whole title, false for a version
#   shortened to roughly 40 characters with a trailing ellipsis
# @return [Object] the full title, the short title, or +@loc.href+
def title(full=true)
  return @loc.href unless @data.nil? && !@failed && @body.b

  if full
    to_html unless defined?(@doc)
    if @doc.title.b
      @title = @doc.title
    else
      @title = @loc.href
      head = @doc.at('head')
      head.prepend XML::Node('title', @title) if head
      @title
    end
  else
    # The short form is derived from the full one, so compute that first.
    title(true) unless defined?(@title)
    @short_title =
      if RUBY_VERSION < '1.9' && @title.cyr? && UTF2ANSI[@title].size > 40
        # Ruby 1.8 counts bytes: cut the cyrillic title in its ANSI form.
        ANSI2UTF[UTF2ANSI[@title][/.{1,30}\S*/][0..38]] + '…'
      elsif @title.size > 40
        @title[/.{1,30}\S*/][0..38] + '…'
      else
        @title
      end
  end
end
Instance Method Details
#at(selector_or_node, options = {}) ⇒ Object Also known as: first
354 355 356 357 358 359 360 361 362 363 364 |
# File 'lib/rhack/page.rb', line 354

# Finds a single node (also known as #first).
#
# A LibXML::XML::Node is used as is; any other truthy argument is looked
# up with #__at. A found node goes through #preprocess_search_result with
# +options[:preprocess]+ and is then yielded to the block, if one is given.
# When nothing is found, #node_is_missing! is called with the selector and
# the options.
#
# @param selector_or_node [Object] a node or a selector understood by #__at
# @param options [Hash] search options; +:preprocess+ is read here
# @yield [node] the preprocessed search result
# @return [Object] the block result, the preprocessed node, or the falsy
#   lookup result when nothing was found
def at(selector_or_node, options={})
  node = nil
  if selector_or_node
    node = selector_or_node.is_a?(LibXML::XML::Node) ? selector_or_node : __at(selector_or_node)
  end

  unless node
    node_is_missing!(selector_or_node, options)
    return node
  end

  node = preprocess_search_result(node, options[:preprocess])
  block_given? ? yield(node) : node
end
#dict(hash) ⇒ Object
Hook for writing evenly aligned lines that define a hash (aligned for the author's Verdana 10px font), e.g. dict key1: value1, …
key2: value2, ...
276 277 278 |
# File 'lib/rhack/page.rb', line 276

# Coerces the argument to a Hash.
#
# @param hash [Hash, Object] a Hash, or anything accepted by +Hash[]+
# @return [Hash] the argument itself when it already is a Hash, otherwise
#   +Hash[hash]+
def dict(hash)
  return hash if hash.is_a?(Hash)

  Hash[hash]
end
#empty? ⇒ Boolean
59 60 61 |
# File 'lib/rhack/page.rb', line 59

# Tells whether the page holds neither parsed data nor a non-blank body.
#
# @return [Boolean] true when +@data+ is falsy and +@body.b+ is falsy
def empty?
  !(@data || @body.b)
end
#eval_js(frame = nil) ⇒ Object
203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 |
# File 'lib/rhack/page.rb', line 203

# Evaluates the scripts of the page in the javascript runtime.
#
# First sets the document/window location variables from +@loc+. Then, for
# every node found by "script": an inline script is evaluated, whatever it
# put into +js[:write_output]+ is inserted after the node, and the node is
# removed; a script with a +src+ is evaluated from +frame.get_cached+ when
# a frame is given.
#
# @param frame [Object, nil] used to fetch external scripts via #get_cached
# @return [Object] the result of iterating over the found script nodes
def eval_js(frame=nil)
  eval_string "document.location = window.location = #{@loc.to_json}; document.URL = document.baseURI = document.documentURI = location.href; document.domain = location.host;"
  find("script").each do |script|
    source = script.text.strip
    L.debug source
    if inline = source.b
      # Reset the buffer so that only this script's output is collected.
      js[:write_output] = ''
      eval_string inline
      written = js[:write_output].b
      script.after written if written
      script.remove!
    elsif frame and script.src
      eval_string frame.get_cached(script.src)
    end
  end
end
#eval_string(str) ⇒ Object
220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 |
# File 'lib/rhack/page.rb', line 220

# Evaluates a string of javascript in the page runtime.
#
# Creates a Johnson::Runtime on first use. A Johnson::Error is not
# propagated: its message is logged as a warning and the offending source
# is written to the debug log, with the undefined name highlighted when the
# message names one.
#
# @param str [String] javascript source
# @return [Object] the evaluation result, or the logger's return value when
#   evaluation raised Johnson::Error
def eval_string(str)
  @js ||= Johnson::Runtime.new
  L.debug "#{@js} evaluating in #{Thread.current}\nmain: #{Thread.main}; carier: #{Curl.carier_thread}"
  begin
    @js.evaluate(str)
  rescue Johnson::Error => e
    L.warn e.message
    L.debug {
      if m = e.message.match(/(\w+) is undefined|([\w.]+) is not a function/)
        # The captured name may contain dots ("a.b"); escape it so that the
        # highlight pattern matches it literally.
        L.clr.hl! str, /\b#{Regexp.escape(m[1] || m[2])}\b/
      end
      "\n\t#{str}"
    }
  end
end
#expand_link(link) ⇒ Object
Turns a relative path found on this page into an absolute path.
300 301 302 303 304 305 306 307 308 309 |
# File 'lib/rhack/page.rb', line 300

# Turns a link found on this page into an absolute one, relative to +@loc+.
#
# @param link [String] an absolute URL, a protocol-relative URL ("//host/…"),
#   a root-relative path ("/…"), a query ("?…"), a fragment ("#…") or a
#   path relative to the directory of the current page
# @return [String] the link itself when it already starts with a scheme,
#   otherwise the link resolved against +@loc+
def expand_link(link)
  case link
  when /^\w+:\/\// then link
  when /^\/\//     then @loc.protocol + ':' + link
  when /^\//       then @loc.root + link
  when /^\?/       then File.join(@loc.root, @loc.path) + link
  when /^#/        then File.join(@loc.root, @loc.fullpath) + link
  else File.join @loc.root, File.dirname(@loc.path), link
  end
end
#failed? ⇒ Boolean
override this in a subclass
93 94 95 |
# File 'lib/rhack/page.rb', line 93

# Tells whether the response is considered failed. Override this in a
# subclass.
#
# Any arguments are accepted and ignored.
#
# @return [Boolean] true unless the response code is 200
def failed?(*)
  status = @curl_res.code
  status != 200
end
#find(selector_or_nodes, options = {}, &foreach) ⇒ Object Also known as: all
367 368 369 370 371 372 373 374 375 376 377 378 |
# File 'lib/rhack/page.rb', line 367

# Finds all matching nodes (also known as #all).
#
# An Array or a LibXML::XML::XPath::Object is used as is; anything else is
# looked up with #__find. A non-empty result goes through
# #preprocess_search_results with +options[:preprocess]+ and is iterated
# with the block, if one is given. For an empty result #node_is_missing!
# is called with the selector and the options.
#
# @param selector_or_nodes [Object] ready-made nodes or a selector
#   understood by #__find
# @param options [Hash] search options; +:preprocess+ is read here
# @param foreach [Proc] optional block run for each preprocessed result
# @return [Object] the (preprocessed) search results
def find(selector_or_nodes, options={}, &foreach)
  # Object#is_a? accepts exactly one class, so each class is tested separately.
  ready_made = selector_or_nodes.is_a?(Array) ||
               selector_or_nodes.is_a?(LibXML::XML::XPath::Object)
  preresult = ready_made ? selector_or_nodes : __find(selector_or_nodes)

  if preresult.size > 0
    preresult = preprocess_search_results(preresult, options[:preprocess])
    foreach ? preresult.each(&foreach) : preresult
  else
    node_is_missing!(selector_or_nodes, options)
    preresult
  end
end
#flatten_dict(hash) ⇒ Object
281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 |
# File 'lib/rhack/page.rb', line 281

# Expands compound keys of a hash into separate symbol keys.
#
# A String key containing a space is split on spaces, and an Array key is
# taken element by element; the n-th resulting key receives the n-th
# element of the value. Any other key is simply converted with +to_sym+.
#
# @param hash [Hash] the hash to flatten
# @return [Hash] a new hash with symbol keys
def flatten_dict(hash)
  hash.each_with_object({}) do |(key, value), flat|
    names =
      if key.is(String) && key[' ']
        key.split(' ')
      elsif key.is(Array)
        key
      end

    if names
      names.each_with_index { |name, idx| flat[name.to_sym] = value[idx] }
    else
      flat[key.to_sym] = value
    end
  end
end
#form(form = 'form', hash = {}, opts = {}) ⇒ Object
FORMS #
436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 |
# File 'lib/rhack/page.rb', line 436

# Builds the request arguments needed to submit a form of this page.
#
# @param form [String, Symbol, Object] a selector of the form, +:self+ for
#   the form whose action is the current path, or a form node
# @param hash [Hash] values merged over the inputs found in the form
# @param opts [Hash] passed through as the last element of the result
# @return [Array] +[hash, multipart, action, opts]+ for a POST form,
#   +[action, opts]+ with the values appended as a query string otherwise
# @raise [LibXML::XML::Error] if the selector does not resolve to a form node
def form(form='form', hash={}, opts={})
  form = "[action=#{@loc.path.inspect}]" if form == :self
  if form.is String
    form_node = at form
    raise LibXML::XML::Error, "Can't find form by xpath `#{form}` on page #{inspect}" if !form_node or form_node.name != 'form'
  else
    form_node = form
  end

  hash = form_node.inputs_all.merge!(hash)
  # NOTE(review): the published listing reads `action = (…)`; the dropped
  # call is assumed to be expand_link — confirm against lib/rhack/page.rb.
  action = expand_link(form_node.action || @loc.path)

  # A form without a method attribute is submitted with GET, so a missing
  # attribute must not blow up on nil.downcase.
  if form_node['method'].to_s.downcase == 'post'
    [hash, form_node.enctype =~ /multipart/, action, opts]
  else
    action = "#{action}#{action['?'] ? '&' : '?'}#{hash.urlencode}" if hash.b
    [action, opts]
  end
end
#get_link(selector_or_node = 'a', options = {}, &onfound) ⇒ Object Also known as: link, get_href
406 407 408 409 410 411 412 413 414 415 416 417 |
# File 'lib/rhack/page.rb', line 406

# Finds a node and returns its link (also known as #link and #get_href).
#
# The href is taken from the node itself or, when it has none, from the
# result of +node.find('a')+.
#
# @param selector_or_node [Object] passed to #at
# @param options [Hash] passed to #at; +:preprocess+ is overridden here
# @param onfound [Proc] optional block called with the link when one is found
# @return [Object] the block result, the link, or nil
def get_link(selector_or_node='a', options={}, &onfound)
  extract_href = lambda { |node|
    href = node.href
    unless href
      inner = node.find('a')
      href = inner.href if inner
    end
    # NOTE(review): the published listing reads `if href href end`; the
    # dropped call is assumed to be expand_link — confirm against the source.
    expand_link href if href
  }

  at(selector_or_node, options.merge(:preprocess => extract_href)) { |href|
    onfound && href ? onfound.call(href) : href
  }
end
#get_links(links = 'a') ⇒ Object Also known as: get_hrefs, links
def get_src(link=‘img’)
begin
link = at(link) && at(link).src if link.is String
rescue LibXML::XML::Error; nil
end
expand_link link if link
end
488 489 490 491 492 493 494 495 |
# File 'lib/rhack/page.rb', line 488

# Collects the unique links of the matching nodes (also known as
# #get_hrefs and #links).
#
# For a String selector the hrefs of the found nodes are used, falling back
# to the nodes found by the selector with '//a' appended when that list is
# blank. If the lookup raises LibXML::XML::Error, the selector itself is
# treated as the only link. A non-String argument is used as the list of
# links as is.
#
# @param links [String, Enumerable] a selector or ready-made links
# @return [Array] unique links
def get_links(links='a')
  begin
    if links.is String
      links = find(links).map { |node| node.href }.b ||
              find(links + '//a').map { |node| node.href }
    end
  rescue LibXML::XML::Error
    links = [links]
  end
  # NOTE(review): the published listing shows an identity map here; the
  # dropped call is assumed to be expand_link — confirm against the source.
  links.map { |link| expand_link link }.uniq
end
#get_src(selector_or_node = 'img', options = {}, &onfound) ⇒ Object Also known as: src
397 398 399 400 401 402 403 |
# File 'lib/rhack/page.rb', line 397

# Finds a node and returns its +src+ (also known as #src).
#
# @param selector_or_node [Object] passed to #at
# @param options [Hash] passed to #at; +:preprocess+ is overridden here
# @param onfound [Proc] optional block called with the src when one is found
# @return [Object] the block result, the src, or nil
def get_src(selector_or_node='img', options={}, &onfound)
  extract_src = lambda { |node|
    src = node.src
    # NOTE(review): the published listing reads `if src = node.src src end`;
    # the dropped call is assumed to be expand_link — confirm against the source.
    expand_link src if src
  }

  at(selector_or_node, options.merge(:preprocess => extract_src)) { |src|
    onfound && src ? onfound.call(src) : src
  }
end
#get_srcs(links = 'img') ⇒ Object Also known as: srcs
TODO: make into same form as #get_src and #map
470 471 472 473 474 475 476 477 |
# File 'lib/rhack/page.rb', line 470

# Collects the unique +src+ values of the matching nodes (also known as
# #srcs).
#
# TODO: make into same form as #get_src and #map
#
# For a String selector the +src+ of every found node is used. If the
# lookup raises LibXML::XML::Error, the selector itself is treated as the
# only link. A non-String argument is used as the list of links as is.
#
# @param links [String, Enumerable] a selector or ready-made links
# @return [Array] unique links
def get_srcs(links='img')
  begin
    links = find(links).map { |node| node.src } if links.is String
  rescue LibXML::XML::Error
    links = [links]
  end
  # NOTE(review): the published listing shows an identity map here; the
  # dropped call is assumed to be expand_link — confirm against the source.
  links.map { |link| expand_link link }.uniq
end
#inspect ⇒ Object
73 74 75 76 77 78 79 80 |
# File 'lib/rhack/page.rb', line 73

# Short human-readable description of the page.
#
# A page with +@data+ set shows its size (or 'failed to parse') and whether
# it is json or url params. Otherwise it shows '(empty)', or the response
# header (failed page) or the short title, followed by the size, plus a
# note when javascript is enabled and a document is present.
#
# @return [String]
def inspect
  sz = size
  class_name = self.class.name

  unless @data.nil?
    volume = @data == false ? 'failed to parse' : sz.bytes
    format = @json ? 'json' : 'url params'
    return "<##{class_name} (#{volume}) #{format}>"
  end

  summary =
    if sz == 0
      '(empty)'
    else
      heading = @failed ? @curl_res.header : '«' + title(false) + '»'
      "#{heading} (#{sz.bytes})"
    end
  js_note = (' js enabled' if @js and @doc)
  "<##{class_name} #{summary}#{js_note}>"
end
#load_scripts(frame) ⇒ Object
def get_link(link=‘a’)
begin
link = at(link) && (at(link).href || at(link+'//a').href) if link.is String
rescue XML::Error; nil
end
expand_link link if link
end
507 508 509 |
# File 'lib/rhack/page.rb', line 507

# Evaluates every external script of the page ("script[src]") using the
# sources returned by +frame.get_cached+.
#
# @param frame [Object, nil] source of cached scripts; nothing is done when
#   it is falsy
# @return [Object] the falsy +frame+ itself, or the result of iterating
#   over the fetched sources
def load_scripts(frame)
  return frame unless frame

  sources = frame.get_cached(*get_srcs("script[src]"))
  sources.each { |source| eval_string source }
end
#map(selector_or_nodes, options = {}, &mapper) ⇒ Object
421 422 423 424 425 426 427 |
# File 'lib/rhack/page.rb', line 421

# Finds nodes and maps them through the given block.
#
# The block is handed to #find as the +:preprocess+ option. Unless
# +options[:compact]+ is exactly +false+, the result is converted to an
# array and nils are removed.
#
# @param selector_or_nodes [Object] passed to #find
# @param options [Hash] passed to #find; +:compact+ is read here
# @param mapper [Proc] applied to the found nodes
# @return [Object] the mapped results
def map(selector_or_nodes, options={}, &mapper)
  mapping = find(selector_or_nodes, options.merge(:preprocess => mapper))
  return mapping if options[:compact] == false

  mapping.to_a.compact
end
#map_json(selector_or_nodes, options = {}, &mapper) ⇒ Object
429 430 431 |
# File 'lib/rhack/page.rb', line 429

# Same as #map, with the result wrapped by +JsonString+.
#
# @param selector_or_nodes [Object] passed to #map
# @param options [Hash] passed to #map
# @param mapper [Proc] passed to #map
# @return [Object] whatever +JsonString+ returns for the mapped results
def map_json(selector_or_nodes, options={}, &mapper)
  mapped = map(selector_or_nodes, options, &mapper)
  JsonString(mapped)
end
#parse(opts = {}) ⇒ Object
Override this in a subclass. MUST return self if successful; MAY return false otherwise.
105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 |
# File 'lib/rhack/page.rb', line 105

# Parses the response according to the requested format.
#
# Override this in a subclass: it MUST return self if successful and MAY
# return false otherwise.
#
# A failed page is only marked with #failed! (and +@data+ is set to false
# when json or hash was requested). Otherwise the first of +:json+,
# +:hash+, +:xml+ present in +opts+ selects the parser; html is the default.
#
# @param opts [Hash] parsing options, passed on to the selected parser
# @return [self]
def parse(opts={})
  if failed?
    failed!
    @data = false if opts[:json] || opts[:hash]
    return self
  end

  parser =
    if    opts[:json] then :parse_json
    elsif opts[:hash] then :parse_hash
    elsif opts[:xml]  then :parse_xml
    else                   :parse_html
    end
  send(parser, opts)
  self
end
#process(c, opts = {}) ⇒ Object
We can then override #process in Page subclasses. Frame doesn't care about the value returned by #process.
188 189 190 191 192 193 194 195 196 197 198 199 200 |
# File 'lib/rhack/page.rb', line 188

# Takes over a finished transfer: remembers the effective URL, the handle
# and its response, then either retries the request or parses the response.
#
# #process can be overridden in Page subclasses; Frame does not care about
# the value it returns.
#
# @param c [Object] the transfer handle; must respond to
#   +last_effective_url+, +res+ and +retry!+
# @param opts [Hash] passed to #parse
# @return [Object, nil] the result of #parse, or nil when a retry was
#   requested
def process(c, opts={})
  @loc = c.last_effective_url.parse(:uri)
  @curl = c
  @curl_res = c.res

  unless retry?
    L.debug "#{@loc.fullpath} -> #{@curl_res}"
    return parse(opts)
  end

  # The callback will not proceed for a retried request.
  c.retry!
  nil
end
#retry? ⇒ Boolean
override this in a subclass
98 99 100 |
# File 'lib/rhack/page.rb', line 98

# Tells whether the request should be retried. Override this in a subclass.
#
# Any arguments are accepted and ignored.
#
# @return [Boolean] always false here
def retry?(*)
  false
end
#size ⇒ Object
63 64 65 66 67 68 69 70 71 |
# File 'lib/rhack/page.rb', line 63

# Size of the page content.
#
# @return [Integer] the body size (0 for a nil body) when +@data+ is nil,
#   0 when +@data+ is false, otherwise the size of +@data.inspect+
def size
  return (@body || '').size if @data.nil?

  @data == false ? 0 : @data.inspect.size
end
#submit(form, frame, hash = {}, opts = {}, &callback) ⇒ Object
453 454 455 456 457 458 459 460 461 462 463 464 |
# File 'lib/rhack/page.rb', line 453

# Submits a form of this page through the given frame.
#
# The current URL is used as the Referer header unless one is already set.
# A static frame pointing elsewhere is retargeted to the form action for
# the duration of the request and retargeted back afterwards.
#
# @param form [Object] passed to #form
# @param frame [Object] executes the request; must respond to +loc+,
#   +static+, +retarget+ and +exec+
# @param hash [Hash] form values, passed to #form
# @param opts [Hash] request options, passed to #form; +:headers+ is updated
# @param callback [Proc] passed to +frame.exec+
# @return [Object] the result of +frame.exec+
def submit(form, frame, hash={}, opts={}, &callback)
  (opts[:headers] ||= {}).Referer ||= @loc.href if @loc
  query = form(form, hash, opts)

  # #form returns the action third for POST and first for GET.
  curr_target = frame.loc.href
  new_target = query[2] || query[0]
  need_retargeting = frame.static && curr_target != new_target
  frame.retarget new_target if need_retargeting

  begin
    frame.exec(*query, &callback)
  ensure
    # Point the frame back even when the request raised, so that a failed
    # submit does not leave a static frame at the form's target.
    frame.retarget curr_target, :forced if need_retargeting
  end
end
#text(selector_or_node, options = {}) ⇒ Object
FINDERS PREPROCESSORS #
384 385 386 387 388 389 |
# File 'lib/rhack/page.rb', line 384

# Finds a node and returns its stripped text.
#
# @param selector_or_node [Object] passed to #at
# @param options [Hash] passed to #at
# @yield [txt] the stripped text of the found node
# @return [Object, nil] the block result or the stripped text; nil when no
#   node was found
def text(selector_or_node, options={})
  node = at(selector_or_node, options)
  return unless node

  txt = node.text.strip
  block_given? ? yield(txt) : txt
end
#texts(hash, options = {}) ⇒ Object
391 392 393 394 395 |
# File 'lib/rhack/page.rb', line 391

# Replaces every value of the hash with the #text found for it.
#
# @param hash [Hash] names mapped to selectors or nodes
# @param options [Hash] passed to #text for every value
# @return [Object] the result of +hash.map_values+
def texts(hash, options={})
  hash.map_values { |selector_or_node| text(selector_or_node, options) }
end
#to_html ⇒ Object
236 237 238 |
# File 'lib/rhack/page.rb', line 236

# Converts the body with +to_html+ and stores the result as the document.
#
# @return [Object] the new value of +@doc+
def to_html
  parsed = @body.to_html
  @doc = parsed
end
#to_xml ⇒ Object
240 241 242 |
# File 'lib/rhack/page.rb', line 240

# Converts the body with +to_xml+ and stores the result as the document.
#
# @return [Object] the new value of +@doc+
def to_xml
  parsed = @body.to_xml
  @doc = parsed
end
#url ⇒ Object Also known as: href
86 87 88 |
# File 'lib/rhack/page.rb', line 86

# URL of the page (also known as #href).
#
# @return [Object] +@loc.href+
def url
  location = @loc
  location.href
end
#utf! ⇒ Object
82 83 84 |
# File 'lib/rhack/page.rb', line 82

# Calls +utf!+ on the body.
#
# @return [Object] whatever +@body.utf!+ returns
def utf!
  content = @body
  content.utf!
end