Class: RHACK::Page

Inherits:
Object show all
Defined in:
lib/rhack/page.rb

Overview

Frame( ScoutSquad( Curl::Multi <- Scout( Curl API ), Scout, … ) ) => Curl -> Johnson::Runtime -> XML::Document => Page( XML::Document ), Page, …

Direct Known Subclasses

CodeIndiffirentPage, HashPage, HtmlPage, JsonPage, XmlPage

Constant Summary collapse

@@ignore =

for johnson

/google|_gat|tracker|adver/i

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(obj = '', loc = Hash.new(''), js = is_a?(HtmlPage)&&(Johnson::Runtime.browser||Johnson::Runtime.new)) ⇒ Page

Frame calls it with no args



46
47
48
49
50
51
52
53
54
55
56
57
# File 'lib/rhack/page.rb', line 46

# Builds a page either from a finished transfer or from raw content.
#
# obj - a Curl::Easy handle or a Scout (whose #http handle is used); any
#       other value is stored as the page body as-is.
# loc - location of the page; a non-Hash value is converted via #parse(:uri).
# js  - value stored in @js; defaults to false unless self is an HtmlPage.
#
# Frame calls this with no arguments.
def initialize(obj='', loc=Hash.new(''), js=is_a?(HtmlPage)&&(Johnson::Runtime.browser||Johnson::Runtime.new))
  loc = loc.parse(:uri) unless loc.is(Hash)
  @js = js

  unless obj.is(Curl::Easy) || obj.kinda(Scout)
    @body = obj
    @loc = loc
    return
  end

  handle = obj.kinda(Scout) ? obj.http : obj
  # Passing loc itself would hand #process an opts object that answers ''
  # for every key, so fall back to a plain empty hash.
  process(handle, loc.b || {})
end

Instance Attribute Details

#bodyObject (readonly) Also known as: html

Returns the value of attribute body.



36
37
38
# File 'lib/rhack/page.rb', line 36

# Returns the raw page content held in @body (#initialize stores its first
# argument here when it is not a Curl handle or Scout).
def body
  @body
end

#curlObject (readonly)

Returns the value of attribute curl.



36
37
38
# File 'lib/rhack/page.rb', line 36

# Returns the Curl handle this page was processed from (@curl, assigned in #process).
def curl
  @curl
end

#curl_resObject (readonly)

Returns the value of attribute curl_res.



36
37
38
# File 'lib/rhack/page.rb', line 36

# Returns the response object taken from the Curl handle (@curl_res, assigned
# from c.res in #process).
def curl_res
  @curl_res
end

#dataObject (readonly) Also known as: hash

Returns the value of attribute data.



36
37
38
# File 'lib/rhack/page.rb', line 36

# Returns @data. It is nil when no structured data has been parsed and false
# when a JSON/hash parse was requested for a failed response (see #parse).
# NOTE(review): other values are presumably set by parse_json / parse_hash,
# which are not visible here.
def data
  @data
end

#docObject (readonly)

Returns the value of attribute doc.



36
37
38
# File 'lib/rhack/page.rb', line 36

# Returns the parsed document held in @doc (built from the body by #to_html or #to_xml).
def doc
  @doc
end

#failedObject (readonly)

Returns the value of attribute failed.



36
37
38
# File 'lib/rhack/page.rb', line 36

# Returns @failed.
# NOTE(review): presumably set by #failed!, which is not visible here — confirm.
def failed
  @failed
end

#jsObject (readonly)

Returns the value of attribute js.



36
37
38
# File 'lib/rhack/page.rb', line 36

# Returns the JavaScript runtime held in @js (passed to #initialize, or
# created lazily by #eval_string).
def js
  @js
end

#locObject (readonly)

Returns the value of attribute loc.



36
37
38
# File 'lib/rhack/page.rb', line 36

# Returns the page location held in @loc (set in #initialize or #process).
def loc
  @loc
end

#resObject

The result of processing this page in the frame context.



41
42
43
# File 'lib/rhack/page.rb', line 41

# Returns @res, documented as the result of processing this page in a frame context.
# NOTE(review): nothing visible here assigns @res — presumably a writer exists elsewhere.
def res
  @res
end

#title(full = true) ⇒ Object



244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
# File 'lib/rhack/page.rb', line 244

# Returns the page title.
#
# full - when true (default), returns the document's <title>; if the document
#        has none, the page URL becomes the title and is also prepended to
#        <head> when a head element exists. When false, returns a shortened
#        title (cached in @short_title) for titles longer than 40 characters.
#
# Pages that hold parsed data, have failed, or have an empty body simply
# answer with their URL.
def title(full=true)
  return @loc.href unless @data.nil? && !@failed && @body.b

  if full
    to_html unless defined? @doc
    return @title = @doc.title if @doc.title.b

    @title = @loc.href
    @doc.at('head').prepend XML::Node('title', @title) if @doc.at('head')
    return @title
  end

  # The short form is derived from the full title, so compute that first.
  title true unless defined? @title
  @short_title =
    if RUBY_VERSION < '1.9' && @title.cyr? && UTF2ANSI[@title].size > 40
      ANSI2UTF[UTF2ANSI[@title][/.{1,30}\S*/][0..38]]+''
    elsif @title.size > 40
      @title[/.{1,30}\S*/][0..38]+''
    else
      @title
    end
end

Instance Method Details

#at(selector_or_node, options = {}) ⇒ Object Also known as: first



354
355
356
357
358
359
360
361
362
363
364
# File 'lib/rhack/page.rb', line 354

# Finds a single node.
#
# selector_or_node - a selector resolved via #__at, or an already found
#                    LibXML::XML::Node which is used as-is.
# options          - :preprocess is handed to #preprocess_search_result; the
#                    whole hash is handed to #node_is_missing! on a miss.
#
# Yields the preprocessed result when a block is given and returns the
# block's value; otherwise returns the preprocessed result. On a miss,
# reports through #node_is_missing! and returns the falsy lookup result.
def at(selector_or_node, options={})
  preresult =
    if selector_or_node
      selector_or_node.is_a?(LibXML::XML::Node) ? selector_or_node : __at(selector_or_node)
    end

  unless preresult
    node_is_missing!(selector_or_node, options)
    return preresult
  end

  processed = preprocess_search_result(preresult, options[:preprocess])
  block_given? ? yield(processed) : processed
end

#dict(hash) ⇒ Object

A hook for writing a hash definition as evenly aligned lines (aligned in the author's Verdana 10px editor font), e.g.

    dict key1: value1,
         key2: value2, ...


276
277
278
# File 'lib/rhack/page.rb', line 276

# Returns +hash+ itself when it already is a Hash; otherwise builds a Hash
# from it with Hash[] (e.g. from an array of key/value pairs).
def dict(hash)
  return hash if hash.is_a?(Hash)

  Hash[hash]
end

#empty?Boolean

Returns:

  • (Boolean)


59
60
61
# File 'lib/rhack/page.rb', line 59

# True when the page holds neither parsed data nor a non-blank body.
def empty?
  !(@data || @body.b)
end

#eval_js(frame = nil) ⇒ Object



203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
# File 'lib/rhack/page.rb', line 203

# Evaluates the page's <script> elements in the page's JS runtime.
#
# frame - optional object used to fetch external scripts through
#         frame.get_cached; without it, scripts that only have a src are skipped.
#
# Returns whatever #find returns for the "script" selector.
def eval_js(frame=nil)
  # Prime the JS document/location globals with this page's location first.
  eval_string "document.location = window.location = #{@loc.to_json};
  document.URL = document.baseURI = document.documentURI = location.href;
  document.domain = location.host;"
  find("script").each {|n|
    L.debug n.text.strip
    if text = n.text.strip.b
      # Inline script: reset the write buffer, run the script, then put
      # whatever it wrote after the node and drop the script node itself.
      js[:write_output] = ''
      eval_string text
      if res = js[:write_output].b then n.after res end
      n.remove!
    elsif frame and n.src
      # External script: fetch its source through the frame and evaluate it.
      eval_string frame.get_cached expand_link n.src
    end
  }
end

#eval_string(str) ⇒ Object



220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
# File 'lib/rhack/page.rb', line 220

# Evaluates a string of JavaScript in this page's runtime, creating the
# runtime on first use.
#
# str - JavaScript source to evaluate.
#
# Returns the evaluation result. A Johnson::Error is not propagated: its
# message is logged as a warning, the source is logged at debug level, and
# the value of that debug call is returned instead.
def eval_string(str)
  @js ||= Johnson::Runtime.new
  L.debug "#{@js} evaluating in #{Thread.current}\nmain: #{Thread.main}; carier: #{Curl.carier_thread}"
  begin
    @js.evaluate(str)
  rescue Johnson::Error => e
    L.warn e.message
    L.debug {
      if m = e.message.match(/(\w+) is undefined|([\w.]+) is not a function/)
        # The offending name may contain dots ("foo.bar"); escape it so the
        # highlight matches that exact identifier rather than any character.
        L.clr.hl! str, /\b#{Regexp.escape(m[1] || m[2])}\b/
      end
      "\n\t#{str}"
    }
  end
end

Expands a relative link found on this page into an absolute URL.



300
301
302
303
304
305
306
307
308
309
# File 'lib/rhack/page.rb', line 300

# Turns a link found on this page into an absolute URL, relative to @loc.
#
# link - the link text; handled forms are: full URL with a scheme,
#        protocol-relative ("//host/.."), root-relative ("/.."), query-only
#        ("?.."), fragment-only ("#..") and path-relative (everything else).
#
# Returns the absolute URL as a String.
def expand_link(link)
  case link
  when %r{^\w+://}
    link
  when %r{^//}
    @loc.protocol + ':' + link
  when %r{^/}
    @loc.root + link
  when /^\?/
    File.join(@loc.root, @loc.path) + link
  when /^#/
    File.join(@loc.root, @loc.fullpath) + link
  else
    File.join @loc.root, File.dirname(@loc.path), link
  end
end

#failed?Boolean

override this in a subclass

Returns:

  • (Boolean)


93
94
95
# File 'lib/rhack/page.rb', line 93

# True when the response status code is anything other than 200.
# Intended to be overridden in subclasses; any arguments are accepted and ignored.
def failed?(*)
  @curl_res.code != 200
end

#find(selector_or_nodes, options = {}, &foreach) ⇒ Object Also known as: all



367
368
369
370
371
372
373
374
375
376
377
378
# File 'lib/rhack/page.rb', line 367

# Finds all matching nodes.
#
# selector_or_nodes - a selector resolved via #__find, or an already found
#                     collection (LibXML::XML::XPath::Object or Array) which
#                     is used as-is.
# options           - :preprocess is handed to #preprocess_search_results;
#                     the whole hash is handed to #node_is_missing! on a miss.
# foreach           - optional block run for each preprocessed result.
#
# Returns the preprocessed results (or the value of #each when a block is
# given); on a miss, reports through #node_is_missing! and returns the empty
# lookup result.
def find(selector_or_nodes, options={}, &foreach)
  # Kernel#is_a? accepts a single class, so the two accepted collection
  # types are matched with a case expression instead.
  preresult =
    case selector_or_nodes
    when LibXML::XML::XPath::Object, Array then selector_or_nodes
    else __find(selector_or_nodes)
    end

  if preresult.size > 0
    preresult = preprocess_search_results(preresult, options[:preprocess])
    foreach ? preresult.each(&foreach) : preresult
  else
    node_is_missing!(selector_or_nodes, options)
    preresult
  end
end

#flatten_dict(hash) ⇒ Object

Maps {'firstname lastname' => tuple} into {:firstname => tuple[0], :lastname => tuple[1]}.



281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
# File 'lib/rhack/page.rb', line 281

# Spreads compound keys over their tuple values and symbolizes every key.
#
# A String key containing spaces ('first last') or an Array key
# ([:first, :last]) is split into units, and unit N receives element N of the
# value. Any other key is converted with #to_sym and keeps its value.
#
# Returns a new Hash with Symbol keys.
def flatten_dict(hash)
  flat = {}
  hash.each do |key, value|
    units =
      if key.is(String) && key[' ']
        key.split(' ')
      elsif key.is(Array)
        key
      end

    if units
      units.each_with_index { |unit, idx| flat[unit.to_sym] = value[idx] }
    else
      flat[key.to_sym] = value
    end
  end
  flat
end

#form(form = 'form', hash = {}, opts = {}) ⇒ Object

FORMS #



436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
# File 'lib/rhack/page.rb', line 436

# Builds the request arguments needed to submit a form found on this page.
#
# form - a selector String, a form node, or :self for the form whose action
#        is this page's own path.
# hash - field values merged over the inputs collected from the form.
# opts - request options passed through unchanged as the last element.
#
# Returns [fields, multipart_flag, action_url, opts] for a POST form, or
# [url_with_query, opts] for any other method.
# Raises LibXML::XML::Error when a selector does not resolve to a <form>.
def form(form='form', hash={}, opts={})
  form = "[action=#{@loc.path.inspect}]" if form == :self
  if form.is String
    form_node = at form
    raise LibXML::XML::Error, "Can't find form by xpath `#{form}` on page #{inspect}" if !form_node or form_node.name != 'form'
  else
    form_node = form
  end
  hash = form_node.inputs_all.merge!(hash)
  action = expand_link(form_node.action || @loc.path)
  # A form without a method attribute submits with GET, so a missing
  # attribute must not blow up on nil.downcase.
  if form_node['method'].to_s.downcase == 'post'
    [hash, form_node.enctype =~ /multipart/, action, opts]
  else
    action = "#{action}#{action['?'] ? '&' : '?'}#{hash.urlencode}" if hash.b
    [action, opts]
  end
end


406
407
408
409
410
411
412
413
414
415
416
417
# File 'lib/rhack/page.rb', line 406

# Finds a node and returns the absolute URL of its link.
#
# selector_or_node - selector or node passed to #at (default 'a').
# options          - passed to #at; :preprocess is replaced by the href lookup.
# onfound          - optional block called with the URL when one was found.
#
# The href is taken from the node itself, or, when it has none, from the
# result of node.find('a'). Returns the expanded URL (or the block's value),
# or nil when no href is available.
def get_link(selector_or_node='a', options={}, &onfound)
  resolve_href = lambda do |node|
    href = node.href
    if !href && (inner = node.find('a'))
      href = inner.href
    end
    expand_link(href) if href
  end

  at(selector_or_node, options.merge(:preprocess => resolve_href)) do |href|
    onfound && href ? onfound.call(href) : href
  end
end

def get_src(link=‘img’)

begin
  link = at(link) && at(link).src if link.is String
rescue LibXML::XML::Error; nil
end
expand_link link if link

end



488
489
490
491
492
493
494
495
# File 'lib/rhack/page.rb', line 488

# Collects the absolute URLs of all links matching a selector.
#
# links - a selector String (default 'a'), or a ready collection of hrefs.
#         For a selector, hrefs come from the matched nodes themselves or,
#         when none of them carries an href, from anchors nested in them.
#
# Nodes without an href are skipped. If the lookup raises
# LibXML::XML::Error, the argument itself is treated as the only link.
#
# Returns an Array of unique absolute URLs.
def get_links(links='a')
  begin
    # compact before the blank check: matched nodes that carry no href must
    # not suppress the fallback to their nested anchors.
    links = find(links).map {|e| e.href}.compact.b || find(links+'//a').map {|e| e.href} if links.is String
  rescue LibXML::XML::Error
    links = [links]
  end
  # Skip missing hrefs instead of handing nil to #expand_link.
  links.map {|link| expand_link link if link}.compact.uniq
end

#get_src(selector_or_node = 'img', options = {}, &onfound) ⇒ Object Also known as: src



397
398
399
400
401
402
403
# File 'lib/rhack/page.rb', line 397

# Finds a node and returns the absolute URL of its src attribute.
#
# selector_or_node - selector or node passed to #at (default 'img').
# options          - passed to #at; :preprocess is replaced by the src lookup.
# onfound          - optional block called with the URL when one was found.
#
# Returns the expanded URL (or the block's value), or nil when the node has no src.
def get_src(selector_or_node='img', options={}, &onfound)
  resolve_src = lambda do |node|
    src = node.src
    expand_link(src) if src
  end

  at(selector_or_node, options.merge(:preprocess => resolve_src)) do |src|
    onfound && src ? onfound.call(src) : src
  end
end

#get_srcs(links = 'img') ⇒ Object Also known as: srcs

TODO: make into same form as #get_src and #map



470
471
472
473
474
475
476
477
# File 'lib/rhack/page.rb', line 470

# Collects the absolute URLs of the src attributes of all matching nodes.
#
# links - a selector String (default 'img'), or a ready collection of srcs.
#
# Nodes without a src are skipped. If the lookup raises LibXML::XML::Error,
# the argument itself is treated as the only link.
#
# Returns an Array of unique absolute URLs.
# TODO: make into same form as #get_src and #map
def get_srcs(links='img')
  begin
    links = find(links).map {|e| e.src} if links.is String
  rescue LibXML::XML::Error
    links = [links]
  end
  # Skip missing srcs instead of handing nil to #expand_link.
  links.map {|link| expand_link link if link}.compact.uniq
end

#inspectObject



73
74
75
76
77
78
79
80
# File 'lib/rhack/page.rb', line 73

# Short human-readable description of the page.
#
# Data pages show their size (or 'failed to parse') and whether the data is
# JSON or URL params. Other pages show '(empty)', or the response header
# (failed pages) / short title plus size, followed by ' js enabled' when
# both a JS runtime and a parsed document are present.
def inspect
  sz = size

  unless @data.nil?
    summary = @data == false ? 'failed to parse' : sz.bytes
    flavour = @json ? 'json' : 'url params'
    return "<##{self.class.name} (#{summary}) #{flavour}>"
  end

  details =
    if sz == 0
      '(empty)'
    else
      label = @failed ? @curl_res.header : '«'+title(false)+'»'
      "#{label} (#{sz.bytes})"
    end
  js_note = ' js enabled' if @js and @doc
  "<##{self.class.name} #{details}#{js_note}>"
end

#load_scripts(frame) ⇒ Object

def get_link(link=‘a’)

begin
  link = at(link) && (at(link).href || at(link+'//a').href) if link.is String
rescue XML::Error; nil
end
expand_link link if link

end



507
508
509
# File 'lib/rhack/page.rb', line 507

# Fetches every external script of the page through the frame and evaluates
# each one in the page's JS runtime.
#
# frame - object answering #get_cached(*urls); a falsy frame is returned
#         untouched and nothing is loaded.
#
# Returns the collection of fetched sources.
def load_scripts(frame)
  return frame unless frame

  script_urls = get_srcs("script[src]")
  frame.get_cached(*script_urls).each { |source| eval_string source }
end

#map(selector_or_nodes, options = {}, &mapper) ⇒ Object



421
422
423
424
425
426
427
# File 'lib/rhack/page.rb', line 421

# Finds all matching nodes and maps each through the given block.
#
# selector_or_nodes - passed to #find.
# options           - passed to #find with :preprocess set to the block;
#                     :compact => false keeps nil results and returns the
#                     #find result as-is.
#
# Returns the mapped results; by default as an Array without nils.
def map(selector_or_nodes, options={}, &mapper)
  found = find(selector_or_nodes, options.merge(:preprocess => mapper))
  return found if options[:compact] == false

  found.to_a.compact
end

#map_json(selector_or_nodes, options = {}, &mapper) ⇒ Object



429
430
431
# File 'lib/rhack/page.rb', line 429

# Same as #map, with the mapped collection passed on to JsonString.
# NOTE(review): JsonString is defined elsewhere; presumably it serializes the
# collection to JSON — confirm.
def map_json(selector_or_nodes, options={}, &mapper)
  JsonString map(selector_or_nodes, options, &mapper)
end

#parse(opts = {}) ⇒ Object

Override this in a subclass. It MUST return self if successful and MAY return false otherwise.



105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# File 'lib/rhack/page.rb', line 105

# Parses the response according to the requested format.
#
# opts - :json, :hash or :xml select the parser (checked in that order);
#        without any of them the body is parsed as HTML. The whole hash is
#        passed to the chosen parser.
#
# A failed response (see #failed?) is only marked through #failed!, and
# @data is set to false when JSON or hash data was expected.
#
# Returns self. Subclass overrides MUST return self on success and MAY
# return false otherwise.
def parse(opts={})
  if failed?
    failed!
    @data = false if opts[:json] || opts[:hash]
    return self
  end

  case
  when opts[:json] then parse_json opts
  when opts[:hash] then parse_hash opts
  when opts[:xml]  then parse_xml opts
  else                  parse_html opts
  end

  self
end

#process(c, opts = {}) ⇒ Object

#process can be overridden in Page subclasses; Frame does not care about the value returned by #process.



188
189
190
191
192
193
194
195
196
197
198
199
200
# File 'lib/rhack/page.rb', line 188

# Takes over a finished Curl transfer: records location, handle and response,
# then either schedules a retry or parses the response.
#
# c    - Curl handle answering #last_effective_url, #res and #retry!.
# opts - options passed to #parse.
#
# Returns nil when a retry was requested (the callback chain must not
# proceed), otherwise the result of #parse. Subclasses may override this;
# Frame does not use the return value.
def process(c, opts={})
  @loc      = c.last_effective_url.parse(:uri)
  @curl     = c
  @curl_res = c.res

  unless retry?
    L.debug "#{@loc.fullpath} -> #{@curl_res}"
    return parse(opts)
  end

  c.retry!
  nil
end

#retry?Boolean

override this in a subclass

Returns:

  • (Boolean)


98
99
100
# File 'lib/rhack/page.rb', line 98

# Whether the request should be retried; #process consults this before
# parsing. The base implementation never retries. Intended to be overridden
# in subclasses; any arguments are accepted and ignored.
def retry?(*)
  false
end

#sizeObject



63
64
65
66
67
68
69
70
71
# File 'lib/rhack/page.rb', line 63

# Size of the page content: the body length when no data was parsed
# (0 without a body), 0 when parsing data failed, otherwise the length of
# the data's #inspect output.
def size
  return (@body || '').size if @data.nil?
  return 0 if @data == false

  @data.inspect.size
end

#submit(form, frame, hash = {}, opts = {}, &callback) ⇒ Object



453
454
455
456
457
458
459
460
461
462
463
464
# File 'lib/rhack/page.rb', line 453

# Submits a form found on this page through the given frame.
#
# form     - selector, node or :self, as accepted by #form.
# frame    - frame executing the request; a static frame is temporarily
#            retargeted when the form action differs from its location.
# hash     - field values merged over the form's own inputs.
# opts     - request options; the Referer header defaults to this page's URL.
# callback - optional block passed to frame.exec.
#
# Returns whatever frame.exec returns.
def submit(form, frame, hash={}, opts={}, &callback)
  (opts[:headers] ||= {}).Referer ||= @loc.href if @loc
  query = form(form, hash, opts)

  # POST queries carry the action at index 2, GET queries at index 0.
  curr_target, new_target = frame.loc.href, (query[2] || query[0])
  need_retargeting = frame.static && curr_target != new_target
  frame.retarget new_target if need_retargeting

  begin
    frame.exec(*query, &callback)
  ensure
    # Put the frame back on its original target even when exec raises, so a
    # failed submit does not leave a static frame pointing elsewhere.
    frame.retarget curr_target, :forced if need_retargeting
  end
end

#text(selector_or_node, options = {}) ⇒ Object

FINDERS PREPROCESSORS #



384
385
386
387
388
389
# File 'lib/rhack/page.rb', line 384

# Returns the stripped text of the node found by #at.
#
# selector_or_node, options - passed to #at unchanged.
#
# Yields the text when a block is given and returns the block's value.
# Returns nil when no node was found.
def text(selector_or_node, options={})
  node = at(selector_or_node, options)
  return unless node

  stripped = node.text.strip
  block_given? ? yield(stripped) : stripped
end

#texts(hash, options = {}) ⇒ Object



391
392
393
394
395
# File 'lib/rhack/page.rb', line 391

# Replaces every value of +hash+ (a selector or node) with the result of
# #text for it, using the same options for each lookup.
# NOTE(review): relies on Hash#map_values, which is not defined here;
# presumably it returns a new hash with the same keys — confirm.
def texts(hash, options={})
  hash.map_values {|selector_or_node|
    text(selector_or_node, options)
  }
end

#to_htmlObject



236
237
238
# File 'lib/rhack/page.rb', line 236

# Converts the body with its #to_html and stores the result in @doc.
# Returns the stored document.
def to_html
  @doc = @body.to_html
end

#to_xmlObject



240
241
242
# File 'lib/rhack/page.rb', line 240

# Converts the body with its #to_xml and stores the result in @doc.
# Returns the stored document.
def to_xml
  @doc = @body.to_xml
end

#urlObject Also known as: href



86
87
88
# File 'lib/rhack/page.rb', line 86

# Returns the page URL as reported by @loc.href.
def url
  @loc.href
end

#utf!Object



82
83
84
# File 'lib/rhack/page.rb', line 82

# Delegates to @body.utf! and returns its result.
# NOTE(review): presumably converts the body to UTF-8 in place — confirm.
def utf!
  @body.utf!
end