Class: RHACK::Page
Overview
Frame( ScoutSquad( Curl::Multi <- Scout( Curl API ), Scout, … ) ) => Curl -> Johnson::Runtime -> XML::Document => Page( XML::Document ), Page, …
Direct Known Subclasses
Constant Summary collapse
- @@ignore =
for johnson
/google|_gat|tracker|adver/i
Instance Attribute Summary collapse
-
#curl_res ⇒ Object
readonly
Returns the value of attribute curl_res.
-
#doc ⇒ Object
readonly
Returns the value of attribute doc.
-
#failed ⇒ Object
readonly
Returns the value of attribute failed.
-
#hash ⇒ Object
readonly
Returns the value of attribute hash.
-
#html ⇒ Object
readonly
Returns the value of attribute html.
-
#js ⇒ Object
readonly
Returns the value of attribute js.
-
#loc ⇒ Object
readonly
Returns the value of attribute loc.
-
#res ⇒ Object
Result of the page processing performed in the frame context.
- #title(full = true) ⇒ Object
Instance Method Summary collapse
- #at(selector_or_node, options = {}) ⇒ Object (also: #first)
-
#dict(hash) ⇒ Object
Hook for writing evenly aligned lines that define a hash (aligned in the author's Verdana 10px font), e.g.
- #empty? ⇒ Boolean
- #eval_js(frame = nil) ⇒ Object
- #eval_string(str) ⇒ Object
-
#expand_link(link) ⇒ Object
Turns a relative path found on this page into an absolute path.
- #find(selector_or_nodes, options = {}, &foreach) ⇒ Object (also: #all)
- #flatten_dict(hash) ⇒ Object
-
#form(form = 'form', hash = {}, opts = {}) ⇒ Object
FORMS #.
- #get_link(selector_or_node = 'a', options = {}, &onfound) ⇒ Object (also: #link, #get_href)
-
#get_links(links = 'a') ⇒ Object
(also: #get_hrefs, #links)
def get_src(link='img') begin link = at(link) && at(link).src if link.is String rescue XML::Error; nil end expand_link link if link end.
- #get_src(selector_or_node = 'img', options = {}, &onfound) ⇒ Object (also: #src)
-
#get_srcs(links = 'img') ⇒ Object
(also: #srcs)
TODO: make into same form as #get_src and #map.
- #html!(encoding = 'UTF-8') ⇒ Object
-
#initialize(obj = '', loc = Hash.new(''), js = Johnson::Runtime.browser||Johnson::Runtime.new) ⇒ Page
constructor
A new instance of Page.
- #inspect ⇒ Object
-
#load_scripts(frame) ⇒ Object
def get_link(link='a') begin link = at(link) && (at(link).href || at(link+'//a').href) if link.is String rescue XML::Error; nil end expand_link link if link end.
- #map(selector_or_nodes, options = {}, &mapper) ⇒ Object
- #map_json(selector_or_nodes, options = {}, &mapper) ⇒ Object
-
#process(c, opts = {}) ⇒ Object
#process can be overridden in Page subclasses; Frame does not care about the value returned by #process.
- #submit(form, frame, hash = {}, opts = {}, &callback) ⇒ Object
-
#text(selector_or_node, options = {}) ⇒ Object
FINDERS PREPROCESSORS #.
- #texts(hash, options = {}) ⇒ Object
- #to_doc ⇒ Object
- #url ⇒ Object (also: #href)
Constructor Details
#initialize(obj = '', loc = Hash.new(''), js = Johnson::Runtime.browser||Johnson::Runtime.new) ⇒ Page
Returns a new instance of Page.
42 43 44 45 46 47 48 49 50 51 52 53 54 |
# File 'lib/rhack/page.rb', line 42

# Builds a page either from a finished transfer or from raw markup.
#
# @param obj a Curl::Easy, a Scout (its #http handle is used), or anything
#   else, which is stored as-is in @html
# @param loc a Hash of options, or a value that is parsed with
#   `parse(:uri)` when it is not a Hash
# @param js the JavaScript runtime stored in @js
def initialize(obj='', loc=Hash.new(''), js=Johnson::Runtime.browser||Johnson::Runtime.new)
  loc = loc.parse(:uri) unless loc.is(Hash)
  @js = js

  unless obj.is(Curl::Easy) || obj.kinda(Scout)
    # Plain content: remember it together with the given location.
    @html = obj
    @loc = loc
    return
  end

  transfer = obj.kinda(Scout) ? obj.http : obj
  @html = ''
  # just (c, loc) would pass to #process opts variable that returns '' on any key
  process(transfer, loc.b || {})
end
Instance Attribute Details
#curl_res ⇒ Object (readonly)
Returns the value of attribute curl_res.
36 37 38 |
# File 'lib/rhack/page.rb', line 36

# Read-only accessor for @curl_res (assigned from `c.res` in #process).
#
# @return [Object] the current value of @curl_res
def curl_res
  @curl_res
end
#doc ⇒ Object (readonly)
Returns the value of attribute doc.
36 37 38 |
# File 'lib/rhack/page.rb', line 36

# Read-only accessor for @doc (assigned in #to_doc).
#
# @return [Object] the current value of @doc
def doc
  @doc
end
#failed ⇒ Object (readonly)
Returns the value of attribute failed.
36 37 38 |
# File 'lib/rhack/page.rb', line 36

# Read-only accessor for @failed (#process stores the response code here
# for a non-200 plain request).
#
# @return [Object] the current value of @failed
def failed
  @failed
end
#hash ⇒ Object (readonly)
Returns the value of attribute hash.
36 37 38 |
# File 'lib/rhack/page.rb', line 36

# Read-only accessor for @hash (the parsed JSON / params payload set in
# #process, or false when parsing failed).
# Note: this replaces Object#hash for Page instances.
#
# @return [Object] the current value of @hash
def hash
  @hash
end
#html ⇒ Object (readonly)
Returns the value of attribute html.
36 37 38 |
# File 'lib/rhack/page.rb', line 36

# Read-only accessor for @html (the page body kept by #initialize / #process).
#
# @return [Object] the current value of @html
def html
  @html
end
#js ⇒ Object (readonly)
Returns the value of attribute js.
36 37 38 |
# File 'lib/rhack/page.rb', line 36

# Read-only accessor for @js (the JavaScript runtime given to #initialize).
#
# @return [Object] the current value of @js
def js
  @js
end
#loc ⇒ Object (readonly)
Returns the value of attribute loc.
36 37 38 |
# File 'lib/rhack/page.rb', line 36

# Read-only accessor for @loc (the page location set in #initialize / #process).
#
# @return [Object] the current value of @loc
def loc
  @loc
end
#res ⇒ Object
Result of the page processing performed in the frame context.
38 39 40 |
# File 'lib/rhack/page.rb', line 38

# Reader for @res, the result of the page processing performed in the
# frame context.
#
# @return [Object] the current value of @res
def res
  @res
end
#title(full = true) ⇒ Object
156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 |
# File 'lib/rhack/page.rb', line 156

# Title of the page.
#
# Falls back to `@loc.href` when the page holds a parsed hash, is marked
# failed, or has no HTML. With `full` true the document title is used (and
# cached in @title); when the document has none, the location href becomes
# the title and is prepended to <head> if one exists. With `full` false a
# shortened variant (cut near 40 characters, with a trailing ellipsis) is
# computed and cached in @short_title.
#
# @param full [Boolean] whether to return the full title
def title(full=true)
  return @loc.href unless @hash.nil? && !@failed && @html.b

  if full
    to_doc unless defined? @doc
    return @title = @doc.title if @doc.title.b

    @title = @loc.href
    head = @doc.at('head')
    head.prepend XML::Node('title', @title) if head
    return @title
  end

  # Make sure the full title has been resolved before shortening it.
  title true unless defined? @title
  @short_title =
    if RUBY_VERSION < '1.9' && @title.cyr? && UTF2ANSI[@title].size > 40
      ANSI2UTF[UTF2ANSI[@title][/.{1,30}\S*/][0..38]]+'…'
    elsif @title.size > 40
      @title[/.{1,30}\S*/][0..38]+'…'
    else
      @title
    end
end
Instance Method Details
#at(selector_or_node, options = {}) ⇒ Object Also known as: first
262 263 264 265 266 267 268 269 270 271 272 |
# File 'lib/rhack/page.rb', line 262

# Finds a single node and runs it through the optional preprocessor.
#
# @param selector_or_node an XML::Node (used as-is) or a selector handed
#   to `__at`
# @param options [Hash] `:preprocess` is passed to
#   `preprocess_search_result`; the whole hash is passed to
#   `node_is_missing!` when nothing is found
# @yield [result] the preprocessed result, when a block is given
# @return the block's value, or the preprocessed result; when nothing was
#   found, the falsy lookup result after `node_is_missing!` has been called
def at(selector_or_node, options={})
  if selector_or_node and preresult = (selector_or_node.is_a?(XML::Node) ? selector_or_node : __at(selector_or_node))
    preresult = preprocess_search_result(preresult, options[:preprocess])
    block_given? ? yield(preresult) : preresult
  else
    node_is_missing!(selector_or_node, options)
    preresult
  end
end
#dict(hash) ⇒ Object
Hook for writing evenly aligned lines that define a hash (aligned in the author's Verdana 10px font), e.g.
dict key1: value1, …
     key2: value2, …
188 189 190 |
# File 'lib/rhack/page.rb', line 188

# Normalises its argument to a Hash.
#
# @param hash [Hash, Array] a Hash (returned unchanged, same object) or
#   anything `Hash[]` accepts, such as an array of key/value pairs
# @return [Hash]
def dict(hash)
  return hash if hash.is_a?(Hash)

  Hash[hash]
end
#empty? ⇒ Boolean
56 57 58 |
# File 'lib/rhack/page.rb', line 56

# Tells whether the page carries no content: checks @hash when it has been
# set (non-nil), otherwise @html.
#
# @return [Boolean]
def empty?
  payload = @hash.nil? ? @html : @hash
  !payload.b
end
#eval_js(frame = nil) ⇒ Object
119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
# File 'lib/rhack/page.rb', line 119

# Evaluates the page's <script> elements in the JavaScript runtime.
#
# First points the runtime's document/window location at @loc. Inline
# scripts are evaluated and removed; anything they left in
# `js[:write_output]` is inserted after the script node. Scripts without
# inline text are fetched through `frame.get_cached` when a frame is given
# and the node has a src.
#
# @param frame optional object responding to `get_cached`
def eval_js(frame=nil)
  eval_string "document.location = window.location = #{@loc.to_json}; document.URL = document.baseURI = document.documentURI = location.href; document.domain = location.host;"
  find("script").each do |script|
    L.debug script.text.strip
    inline_code = script.text.strip.b
    if inline_code
      js[:write_output] = ''
      eval_string inline_code
      written = js[:write_output].b
      script.after written if written
      script.remove!
    elsif frame && script.src
      eval_string frame.get_cached(script.src)
    end
  end
end
#eval_string(str) ⇒ Object
136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 |
# File 'lib/rhack/page.rb', line 136

# Evaluates a JavaScript snippet in the page runtime, creating the runtime
# on first use.
#
# A Johnson::Error is not propagated: its message is logged as a warning
# and the offending source is logged at debug level, with the undefined
# name highlighted when the message names one.
#
# @param str [String] JavaScript source
# @return the evaluation result, or the logger's return value on failure
def eval_string(str)
  @js ||= Johnson::Runtime.new
  L.debug "#{@js} evaluating in #{Thread.current}\nmain: #{Thread.main}; carier: #{Curl.carier_thread}"
  begin
    @js.evaluate(str)
  rescue Johnson::Error => e
    L.warn e.message
    L.debug {
      if m = e.message.match(/(\w+) is undefined|([\w.]+) is not a function/)
        # The second capture may contain dots; escape it so they match literally.
        L.clr.hl! str, /\b#{Regexp.escape(m[1] || m[2])}\b/
      end
      "\n\t#{str}"
    }
  end
end
#expand_link(link) ⇒ Object
Turns a relative path found on this page into an absolute path.
208 209 210 211 212 213 214 215 216 217 |
# File 'lib/rhack/page.rb', line 208

# Turns a link found on this page into an absolute one, relative to @loc.
#
# * `scheme://…` is returned unchanged
# * `//host/…`   gets the page's protocol
# * `/path`      is appended to the page's root
# * `?query`     is appended to root + path
# * `#fragment`  is appended to root + fullpath
# * anything else is resolved against the directory of the page's path
#
# @param link [String]
# @return [String]
def expand_link(link)
  case link
  when /^\w+:\/\// then link
  when /^\/\// then @loc.protocol + ':' + link
  when /^\// then @loc.root + link
  when /^\?/ then File.join(@loc.root, @loc.path) + link
  when /^#/ then File.join(@loc.root, @loc.fullpath) + link
  else File.join @loc.root, File.dirname(@loc.path), link
  end
end
#find(selector_or_nodes, options = {}, &foreach) ⇒ Object Also known as: all
275 276 277 278 279 280 281 282 283 284 285 286 |
# File 'lib/rhack/page.rb', line 275

# Finds all matching nodes and runs them through the optional preprocessor.
#
# @param selector_or_nodes a ready node collection (used as-is) or a
#   selector handed to `__find`
# @param options [Hash] `:preprocess` is passed to
#   `preprocess_search_results`; the whole hash is passed to
#   `node_is_missing!` when the result is empty
# @param foreach optional block called for each preprocessed result
# @return the preprocessed results (or the value of `each` when a block is
#   given); the empty lookup result when nothing matched
def find(selector_or_nodes, options={}, &foreach)
  # NOTE(review): two-argument is_a? is not core Ruby; presumably the
  # project extends it to accept several classes. Confirm.
  preresult = selector_or_nodes.is_a?(XML::XPath::Object, Array) ? selector_or_nodes : __find(selector_or_nodes)
  if preresult.size > 0
    preresult = preprocess_search_results(preresult, options[:preprocess])
    foreach ? preresult.each(&foreach) : preresult
  else
    node_is_missing!(selector_or_nodes, options)
    preresult
  end
end
#flatten_dict(hash) ⇒ Object
193 194 195 196 197 198 199 200 201 202 203 204 205 |
# File 'lib/rhack/page.rb', line 193

# Builds a hash with symbol keys from `hash`.
#
# A String key containing spaces is split into several keys; the n-th
# resulting key receives `value[n]`. Every other key is symbolised and keeps
# its value.
#
# @param hash [Hash]
# @return [Hash] a new hash with symbol keys
def flatten_dict(hash)
  hash.each_with_object({}) do |(key, value), flat|
    if key.is(String) && key[' ']
      key.split(' ').each_with_index do |part, position|
        flat[part.to_sym] = value[position]
      end
    else
      flat[key.to_sym] = value
    end
  end
end
#form(form = 'form', hash = {}, opts = {}) ⇒ Object
FORMS #
344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 |
# File 'lib/rhack/page.rb', line 344

# Builds the request arguments needed to submit a form from this page.
#
# @param form a selector String, `:self` (the form whose action equals the
#   page's path), or an already resolved form node
# @param hash [Hash] values merged over the form's own inputs
# @param opts [Hash] passed through untouched as the last element
# @return [Array] `[fields, multipart_match, action, opts]` for a POST
#   form, `[action_with_query, opts]` otherwise
# @raise [XML::Error] when a selector does not resolve to a <form> node
def form(form='form', hash={}, opts={})
  form = "[action=#{@loc.path.inspect}]" if form == :self
  if form.is String
    form_node = at form
    raise XML::Error, "Can't find form by xpath `#{form}` on page #{inspect}" if !form_node or form_node.name != 'form'
  else
    form_node = form
  end
  hash = form_node.inputs_all.merge!(hash)
  action = (form_node.action || @loc.path)
  # A form without a `method` attribute yields nil here; treat it as a
  # non-POST form instead of failing on nil.downcase.
  if form_node['method'].to_s.downcase == 'post'
    [hash, form_node.enctype =~ /multipart/, action, opts]
  else
    action = "#{action}#{action['?'] ? '&' : '?'}#{hash.urlencode}" if hash.b
    [action, opts]
  end
end
#get_link(selector_or_node = 'a', options = {}, &onfound) ⇒ Object Also known as: link, get_href
314 315 316 317 318 319 320 321 322 323 324 325 |
# File 'lib/rhack/page.rb', line 314

# Returns the expanded href of the first matching node.
#
# When the node itself has no href, an `a` found inside it is tried.
#
# @param selector_or_node selector or node, passed to #at
# @param options [Hash] passed to #at; `:preprocess` is overridden here
# @param onfound optional block called with the href when one was found
# @return the block's value, the expanded href, or nil when there is none
def get_link(selector_or_node='a', options={}, &onfound)
  at(selector_or_node, options.merge(:preprocess => lambda {|node|
    unless href = node.href
      if node = node.find('a')
        href = node.href
      end
    end
    if href
      expand_link href
    end
  })) {|href| onfound && href ? onfound.call(href) : href}
end
#get_links(links = 'a') ⇒ Object Also known as: get_hrefs, links
def get_src(link='img')
begin
link = at(link) && at(link).src if link.is String
rescue XML::Error; nil
end
expand_link link if link
end
396 397 398 399 400 401 402 403 |
# File 'lib/rhack/page.rb', line 396

# Collects the unique, expanded hrefs for a selector.
#
# For a String selector the hrefs of the matched nodes are used; when that
# yields nothing, `a` descendants of the matches are tried. If the lookup
# raises XML::Error the argument itself is treated as the single link.
# A non-String argument is expected to already be a list of links.
#
# @param links [String, Array] selector or list of links
# @return [Array] unique expanded links
def get_links(links='a')
  begin
    links = find(links).map {|e| e.href}.b || find(links+'//a').map {|e| e.href} if links.is String
  rescue XML::Error
    links = [links]
  end
  links.map {|link| expand_link link}.uniq
end
#get_src(selector_or_node = 'img', options = {}, &onfound) ⇒ Object Also known as: src
305 306 307 308 309 310 311 |
# File 'lib/rhack/page.rb', line 305

# Returns the expanded src of the first matching node.
#
# @param selector_or_node selector or node, passed to #at
# @param options [Hash] passed to #at; `:preprocess` is overridden here
# @param onfound optional block called with the src when one was found
# @return the block's value, the expanded src, or nil when there is none
def get_src(selector_or_node='img', options={}, &onfound)
  at(selector_or_node, options.merge(:preprocess => lambda {|node|
    if src = node.src
      expand_link src
    end
  })) {|src| onfound && src ? onfound.call(src) : src}
end
#get_srcs(links = 'img') ⇒ Object Also known as: srcs
TODO: make into same form as #get_src and #map
378 379 380 381 382 383 384 385 |
# File 'lib/rhack/page.rb', line 378

# Collects the unique, expanded src values for a selector.
#
# For a String selector the src of every matched node is used. If the
# lookup raises XML::Error the argument itself is treated as the single
# link. A non-String argument is expected to already be a list of links.
#
# TODO: make into same form as #get_src and #map
#
# @param links [String, Array] selector or list of links
# @return [Array] unique expanded links
def get_srcs(links='img')
  begin
    links = find(links).map {|e| e.src} if links.is String
  rescue XML::Error
    links = [links]
  end
  links.map {|link| expand_link link}.uniq
end
#html!(encoding = 'UTF-8') ⇒ Object
68 69 70 |
# File 'lib/rhack/page.rb', line 68

# Re-tags @html with the given encoding in place (no transcoding).
#
# @param encoding [String, Encoding] encoding to force
# @return [String] @html itself
def html!(encoding='UTF-8')
  @html.force_encoding(encoding)
end
#inspect ⇒ Object
60 61 62 63 64 65 66 |
# File 'lib/rhack/page.rb', line 60

# Short human-readable description of the page.
#
# For hash pages: payload size (or 'failed to parse') and payload kind.
# For HTML pages: the response header when failed, otherwise the short
# title, followed by the body size, plus a ' js enabled' marker when a
# runtime and a parsed document are present.
#
# @return [String]
def inspect
  unless @hash.nil?
    size_note = @hash ? @hash.inspect.size.bytes : 'failed to parse'
    kind_note = @json ? 'json' : 'params hash'
    return "<#FramePage (#{size_note}) #{kind_note}>"
  end

  summary =
    if @html.b
      label = @failed ? @curl_res.header : '«'+title(false)+'»'
      "#{label} (#{@html.size.bytes}"
    else
      '(empty'
    end
  js_note = (@js && @doc && @hash.nil?) ? ' js enabled' : nil
  "<#FramePage #{summary})#{js_note}>"
end
#load_scripts(frame) ⇒ Object
def get_link(link='a')
begin
link = at(link) && (at(link).href || at(link+'//a').href) if link.is String
rescue XML::Error; nil
end
expand_link link if link
end
415 416 417 |
# File 'lib/rhack/page.rb', line 415

# Fetches every external script of the page through the frame and
# evaluates each fetched body in the JavaScript runtime.
#
# @param frame object responding to `get_cached`; when falsy nothing is
#   done and the falsy value is returned
def load_scripts(frame)
  return frame unless frame

  sources = get_srcs("script[src]")
  frame.get_cached(*sources).each {|script_body| eval_string script_body}
end
#map(selector_or_nodes, options = {}, &mapper) ⇒ Object
329 330 331 332 333 334 335 |
# File 'lib/rhack/page.rb', line 329

# Maps every matching node through `mapper`, which is passed to #find as
# its `:preprocess` option.
#
# @param selector_or_nodes selector or nodes, passed to #find
# @param options [Hash] passed to #find; `:compact => false` keeps nil
#   entries, otherwise the result is converted to an Array and compacted
# @param mapper block applied to the found nodes
# @return the mapped results
def map(selector_or_nodes, options={}, &mapper)
  mapping = find(selector_or_nodes, options.merge(:preprocess => mapper))
  unless options[:compact] == false
    mapping = mapping.to_a.compact
  end
  mapping
end
#map_json(selector_or_nodes, options = {}, &mapper) ⇒ Object
337 338 339 |
# File 'lib/rhack/page.rb', line 337

# Same as #map, with the result wrapped by `JsonString`.
#
# @param selector_or_nodes selector or nodes, passed to #map
# @param options [Hash] passed to #map
# @param mapper block applied to the found nodes
def map_json(selector_or_nodes, options={}, &mapper)
  JsonString map(selector_or_nodes, options, &mapper)
end
#process(c, opts = {}) ⇒ Object
#process can be overridden in Page subclasses; Frame does not care about the value returned by #process
77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
# File 'lib/rhack/page.rb', line 77

# Fills the page from a finished transfer.
#
# Records the effective URL in @loc and the response in @curl_res. On a 200
# response the body is handled according to `opts`:
# * `:json` - parsed into @hash; when parsing fails or gives a String, the
#   body is kept as HTML, parsed into @doc, and @hash becomes false
# * `:hash` - parsed as a params hash when `body.inline` is truthy;
#   otherwise @hash becomes false and the body is kept as HTML
# * neither - the body is converted with `xml_to_utf`, parsed into @doc, and
#   with `:eval` the scripts are loaded (through `opts[:load_scripts]`) and
#   evaluated
# On any other response code the raw body and the code are stored in
# @html / @failed, but only for requests that asked for neither :json nor
# :hash.
#
# @param c transfer object providing `last_effective_url` and `res`
# @param opts [Hash]
# @return [self]
def process(c, opts={})
  @loc = c.last_effective_url.parse(:uri)
  @curl_res = c.res
  L.debug "#{@loc.fullpath} -> #{@curl_res}"

  unless @curl_res.code == 200
    unless opts[:json] || opts[:hash]
      @html = @curl_res.body
      @failed = @curl_res.code
    end
    return self
  end

  body = @curl_res.body
  if opts[:json]
    @json = true
    @hash =
      begin
        body.from_json
      rescue StandardError
        false
      end
    if !@hash || @hash.is(String)
      L.debug "failed to get json from #{c.last_effective_url}, take a look at my @doc for info; my object_id is #{object_id}"
      @html = body
      to_doc
      @hash = false
    end
  elsif opts[:hash]
    if body.inline
      @hash = body.to_params
    else
      @hash = false
      L.debug "failed to get params hash from #{c.last_effective_url}, take a look at my @doc for info; my object_id is #{object_id}"
      @html = body
      to_doc
    end
  else
    @html = body.xml_to_utf
    to_doc
    if opts[:eval]
      load_scripts opts[:load_scripts]
      eval_js
    end
  end
  self
end
#submit(form, frame, hash = {}, opts = {}, &callback) ⇒ Object
361 362 363 364 365 366 367 368 369 370 371 372 |
# File 'lib/rhack/page.rb', line 361

# Submits a form of this page through `frame`.
#
# Sets the Referer header to this page's href (unless one is already
# present) when @loc is set, builds the query with #form and executes it on
# the frame. A static frame whose current href differs from the form target
# is retargeted for the request and forced back afterwards.
#
# @param form selector, `:self` or form node, see #form
# @param frame frame used to execute the request
# @param hash [Hash] field values, see #form
# @param opts [Hash] request options; `:headers` is created when missing
# @param callback block forwarded to `frame.exec`
# @return the value of `frame.exec`
def submit(form, frame, hash={}, opts={}, &callback)
  (opts[:headers] ||= {}).Referer ||= @loc.href if @loc
  query = form(form, hash, opts)

  original_target = frame.loc.href
  # POST queries carry the action third, GET queries carry it first.
  requested_target = query[2] || query[0]
  retarget = frame.static && original_target != requested_target

  frame.retarget requested_target if retarget
  result_page = frame.exec(*query, &callback)
  frame.retarget original_target, :forced if retarget
  result_page
end
#text(selector_or_node, options = {}) ⇒ Object
FINDERS PREPROCESSORS #
292 293 294 295 296 297 |
# File 'lib/rhack/page.rb', line 292

# Returns the stripped text of the first matching node.
#
# @param selector_or_node selector or node, passed to #at
# @param options [Hash] passed to #at
# @yield [txt] the stripped text, when a block is given
# @return the block's value or the stripped text; nil when #at finds nothing
def text(selector_or_node, options={})
  if node = at(selector_or_node, options)
    txt = node.text.strip
    block_given? ? yield(txt) : txt
  end
end
#texts(hash, options = {}) ⇒ Object
299 300 301 302 303 |
# File 'lib/rhack/page.rb', line 299

# Replaces every value of `hash` (a selector or node) with the result of
# #text for it.
#
# @param hash [Hash] name => selector_or_node
# @param options [Hash] passed to #text for every entry
# @return the result of `map_values`
def texts(hash, options={})
  hash.map_values {|selector_or_node|
    text(selector_or_node, options)
  }
end
#to_doc ⇒ Object
152 153 154 |
# File 'lib/rhack/page.rb', line 152

# Parses @html with `to_doc(:forceutf)` and stores the result in @doc.
#
# @return the parsed document
def to_doc
  @doc = @html.to_doc(:forceutf)
end
#url ⇒ Object Also known as: href
72 |
# File 'lib/rhack/page.rb', line 72

# The href of the page's location (@loc).
#
# @return the value of `@loc.href`
def url
  @loc.href
end