Class: WWMD::Scrape

Inherits: Object
Defined in:
lib/wwmd/page/scrape.rb

Instance Attribute Summary

Instance Method Summary

Constructor Details

#initialize(page = '<>') ⇒ Scrape

create a new scrape object using passed HTML



# File 'lib/wwmd/page/scrape.rb', line 22

def initialize(page='<>')
  @page = page
  @hdoc = HDOC.parse(@page)
  @links = Array.new
  @debug = false
  @warn = false
end
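
A minimal usage sketch (the require path and the sample markup are assumptions for illustration, not part of this class's documentation):

# illustrative only: build a Scrape from a raw HTML string
require 'wwmd'   # assumed gem entry point

html   = '<html><body><a href="/index.php">home</a></body></html>'
scrape = WWMD::Scrape.new(html)
links  = scrape.for_links   # scrapes anchors, frames, forms, scripts, etc.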

Instance Attribute Details

#debug ⇒ Object

Returns the value of attribute debug.



# File 'lib/wwmd/page/scrape.rb', line 15

def debug
  @debug
end

#hdoc ⇒ Object (readonly)

Returns the value of attribute hdoc.



# File 'lib/wwmd/page/scrape.rb', line 19

def hdoc
  @hdoc
end

#jlinks ⇒ Object

links to javascript includes



# File 'lib/wwmd/page/scrape.rb', line 18

def jlinks
  @jlinks
end

#links ⇒ Object

links found on page



# File 'lib/wwmd/page/scrape.rb', line 17

def links
  @links
end

#warn ⇒ Object

Returns the value of attribute warn.



# File 'lib/wwmd/page/scrape.rb', line 16

def warn
  @warn
end

Instance Method Details

#default_reject_links ⇒ Object

default link rejection rules (override by redefining reject_links in your helper script)



# File 'lib/wwmd/page/scrape.rb', line 72

def default_reject_links
  @links.reject! do |url|
    url.nil? ||
    url.extname == ".css" ||
    url.extname == ".pdf" ||
    url =~ /javascript:/i ||
    url =~ /mailto:/i ||
    url =~ /[\[\]]/ ||
    url =~ /^#/
  end
end

#for_comments ⇒ Object

scan the page for html comments



# File 'lib/wwmd/page/scrape.rb', line 148

def for_comments
  @page.scan(/\<!\s*--(.*?)--\s*\>/m).map { |x| x.to_s }
end
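
A hedged usage sketch (the sample markup is illustrative):

# illustrative only: pull HTML comment bodies out of a response
html   = '<html><!-- TODO: remove debug param --><body></body></html>'
scrape = WWMD::Scrape.new(html)
scrape.for_comments.each { |c| puts c }   # prints each captured comment body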

#for_forms ⇒ Object

return an array of Form objects for forms on page



# File 'lib/wwmd/page/scrape.rb', line 85

def for_forms
  ret = []
  @hdoc.search("//form").each { |f| ret << Form.new(f) }
  ret
end
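
A hedged usage sketch (page_body stands in for a previously fetched response body; the accessors of the returned objects are documented on WWMD::Form):

# illustrative only: enumerate the forms found on a page
scrape = WWMD::Scrape.new(page_body)
scrape.for_forms.each { |form| p form }   # each entry wraps one <form> node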

#for_javascript_links ⇒ Object

scrape the page for <script src=""> tags



# File 'lib/wwmd/page/scrape.rb', line 140

def for_javascript_links
  urls = []
  @hdoc.search("//script[@src]").each { |tag| urls << tag['src'] }
  urls.reject! { |url| File.extname(url).clip != ".js" }
  return urls
end
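
A hedged usage sketch (the sample markup is illustrative; src values that do not look like .js files are rejected):

# illustrative only: collect external javascript includes
html   = '<html><head><script src="/js/app.js"></script></head></html>'
scrape = WWMD::Scrape.new(html)
js     = scrape.for_javascript_links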

#for_javascript_redirect ⇒ Object

scrape the page for a script tag that contains a bare location.href assignment (used to redirect the page)



# File 'lib/wwmd/page/scrape.rb', line 173

def for_javascript_redirect
  redirs = []
  @hdoc.search("//script").each do |scr|
    scr.inner_html.scan(/.*location.href\s*=\s*['"]([^'"]+)['"]/i).each { |x| redirs += x }
  end
  if redirs.size > 1
    STDERR.puts "PARSE ERROR: more than one javascript redirect"
    return "ERR"
  end
  return redirs.first if not redirs.empty?
  return nil
end
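
A hedged usage sketch of handling the three possible return values (the sample markup is illustrative):

# illustrative only: detect a script-driven redirect
html   = %q{<script>location.href = '/login';</script>}
scrape = WWMD::Scrape.new(html)
case (target = scrape.for_javascript_redirect)
when nil   then puts 'no javascript redirect'
when 'ERR' then puts 'more than one redirect on the page'
else            puts "redirects to #{target}"
end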

#for_links(reject = true) ⇒ Object

use xpath searches to get:

  • //a href

  • //area href

  • //frame src

  • //iframe src

  • //form action

  • //meta refresh content urls

then get //script tags and regexp out links in javascript function calls from elem.inner_html



# File 'lib/wwmd/page/scrape.rb', line 100

def for_links(reject=true)
  self.urls_from_xpath("//a","href").each { |url| @links << url };      # get <a href=""> elements
  self.urls_from_xpath("//area","href").each { |url| @links << url };   # get <area href=""> elements
  self.urls_from_xpath("//frame","src").each { |url| @links << url };   # get <frame src=""> elements
  self.urls_from_xpath("//iframe","src").each { |url| @links << url };  # get <iframe src=""> elements
  self.urls_from_xpath("//form","action").each { |url| @links << url }; # get <form action=""> elements

  # <meta> refresh
  @hdoc.search("//meta").each do |meta|
    next if meta['http-equiv'] != "refresh"
    next if not (content = meta['content'].split(/=/)[1])
    @links << content.strip
  end

  # add urls from onclick handlers
  @hdoc.search("*[@onclick]").each do |onclick|
    LINKS_REGEXP.each do |re|
      self.urls_from_regexp(onclick['onclick'],re).each do |url|
        @links << url
      end
    end
  end

  # add urls_from_regexp (limit to <script> tags (elem.inner_html))
  @hdoc.search("//script").each do |scr|
    LINKS_REGEXP.each do |re|
      self.urls_from_regexp(scr.inner_html,re).each { |url| @links << url }
    end
  end

  # re-define urls_from_helper in what you mix in
  begin
    self.urls_from_helper
  end

  self.reject_links; # reject links we don't care about
  return @links
end
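
A hedged usage sketch (html stands in for a fetched page body):

# illustrative only: gather every link the scraper recognizes
scrape = WWMD::Scrape.new(html)
scrape.for_links.each { |url| puts url }   # results are also kept in scrape.links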

#for_meta_refresh ⇒ Object

scrape the page for a meta refresh tag and return the url from the content attribute, or nil if none is present



# File 'lib/wwmd/page/scrape.rb', line 153

def for_meta_refresh
  has_mr = @hdoc.search("//meta").map { |x| x['http-equiv'] }.include?('Refresh')
  if has_mr
    urls = @hdoc.search("//meta[@content]").map { |x| x['content'].split(";",2)[1] }
    if urls.size > 1
      STDERR.puts "PARSE ERROR: more than one meta refresh tag"
      return "ERR"
    end
    k,v = urls.first.split("=",2)
    if k.upcase.strip != "URL"
      STDERR.puts "PARSE ERROR: content attribute of meta refresh does not contain url"
      return "ERR"
    end
    return v.strip
  else
    return nil
  end
end
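
A hedged usage sketch (the sample markup is illustrative; note that the http-equiv check in the source is case-sensitive):

# illustrative only: follow a meta refresh if one is present
html   = '<meta http-equiv="Refresh" content="0;URL=/next.php">'
scrape = WWMD::Scrape.new(html)
target = scrape.for_meta_refresh   # url string, "ERR" on multiple tags, or nil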

#reject_links ⇒ Object

NEED to move this to external configuration

list of urls we don't care to store in our links list



# File 'lib/wwmd/page/scrape.rb', line 66

def reject_links
  putw "WARN: override reject_links in helper script" if @warn
  default_reject_links
end
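
A hypothetical helper-script override (the logout filter is an example, not part of the library):

# reopen the class in your helper script to change the filtering;
# this keeps the stock rules and additionally drops logout links
class WWMD::Scrape
  def reject_links
    default_reject_links
    @links.reject! { |url| url.to_s =~ /logout/i }
  end
end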

#reset(page) ⇒ Object

reset this scrape object (called by WWMD::Page)



# File 'lib/wwmd/page/scrape.rb', line 31

def reset(page)
  @page = page
  @hdoc = HDOC.parse(@page)
  @links = Array.new
end
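
A hedged usage sketch (first_html and second_html stand in for two fetched responses):

# illustrative only: reuse one Scrape instance across two pages
scrape = WWMD::Scrape.new(first_html)
first  = scrape.for_links.dup
scrape.reset(second_html)   # re-parses and starts a fresh links array
second = scrape.for_links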

#urls_from_helper ⇒ Object

define an urls_from_helper method in your task-specific script



# File 'lib/wwmd/page/scrape.rb', line 192

def urls_from_helper
  putw "WARN: Please set an urls_from_helper override in your helper script" if @warn
  return nil
end
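
A hypothetical helper-script override (the data-url attribute is an example of site-specific markup, not something the library knows about):

# reopen the class in your helper script to feed extra urls into @links
class WWMD::Scrape
  def urls_from_helper
    @hdoc.search("*[@data-url]").each { |el| @links << el['data-url'] }
  end
end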

#urls_from_regexp(content, re, split = 0) ⇒ Object

scan the passed string for the passed regular expression and return the extracted urls as an array



# File 'lib/wwmd/page/scrape.rb', line 39

def urls_from_regexp(content,re,split=0)
  ret = []
  scrape = content.scan(re)
  scrape.each do |url|
    # cheat and take split string(,)[split]
    add = url.to_s.split(',')[split].gsub(/['"]/, '')
    next if (add == '' || add.nil?)
    ret << add
  end
  return ret
end

#urls_from_xpath(xpath, attr) ⇒ Object

xpath search for tags and return the values of the passed attribute

urls_from_xpath("//a","href")


# File 'lib/wwmd/page/scrape.rb', line 53

def urls_from_xpath(xpath,attr)
  ret = []
  @hdoc.search(xpath).each do |elem|
    url = elem[attr]
    next if url.empty?
    ret << url.strip
  end
  return ret
end
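
A hedged usage sketch (the img/src pair is an arbitrary choice; any tag and attribute work):

# illustrative only: grab the src of every <img> on the page
scrape     = WWMD::Scrape.new(html)
image_urls = scrape.urls_from_xpath("//img[@src]", "src")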

#warnings ⇒ Object

renamed class variable (for backward compat)



# File 'lib/wwmd/page/scrape.rb', line 187

def warnings#:nodoc:
  return @warn
end