Class: WWMD::Scrape

Inherits: Object
Defined in:
lib/wwmd/page/scrape.rb

Instance Attribute Summary

Instance Method Summary

Constructor Details

#initialize(page = '<>') ⇒ Scrape

create a new scrape object using passed HTML



# File 'lib/wwmd/page/scrape.rb', line 22

def initialize(page='<>')
  @page = page
  @hdoc = HDOC.parse(@page)
  @links = Array.new
  @debug = false
  @warn = false
end
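
A minimal usage sketch (the require path and the sample markup are assumptions for illustration, not part of this class's documentation):

# illustrative only: build a Scrape from a raw HTML string
require 'wwmd'   # assumed gem entry point

html   = '<html><body><a href="/index.php">home</a></body></html>'
scrape = WWMD::Scrape.new(html)
links  = scrape.for_links   # scrapes anchors, frames, forms, scripts, etc.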

Instance Attribute Details

#debug ⇒ Object

Returns the value of attribute debug.



# File 'lib/wwmd/page/scrape.rb', line 15

def debug
  @debug
end

#hdoc ⇒ Object (readonly)

Returns the value of attribute hdoc.



# File 'lib/wwmd/page/scrape.rb', line 19

def hdoc
  @hdoc
end

#jlinks ⇒ Object

links to javascript includes



# File 'lib/wwmd/page/scrape.rb', line 18

def jlinks
  @jlinks
end

#links ⇒ Object

links found on page



# File 'lib/wwmd/page/scrape.rb', line 17

def links
  @links
end

#warn ⇒ Object

Returns the value of attribute warn.



# File 'lib/wwmd/page/scrape.rb', line 16

def warn
  @warn
end

Instance Method Details

#default_reject_links ⇒ Object

default link rejection rules (override by redefining reject_links in your helper script)



# File 'lib/wwmd/page/scrape.rb', line 72

def default_reject_links
  @links.reject! do |url|
    url.nil? ||
    url.extname == ".css" ||
    url.extname == ".pdf" ||
    url =~ /javascript:/i ||
    url =~ /mailto:/i ||
    url =~ /[\[\]]/ ||
    url =~ /^#/
  end
end

#for_comments ⇒ Object

scan the page for html comments



# File 'lib/wwmd/page/scrape.rb', line 148

def for_comments
  @page.scan(/\<!\s*--(.*?)--\s*\>/m).map { |x| x.to_s }
end
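
A hedged usage sketch (the sample markup is illustrative):

# illustrative only: pull HTML comment bodies out of a response
html   = '<html><!-- TODO: remove debug param --><body></body></html>'
scrape = WWMD::Scrape.new(html)
scrape.for_comments.each { |c| puts c }   # prints each captured comment body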

#for_forms ⇒ Object

return an array of Form objects for forms on page



# File 'lib/wwmd/page/scrape.rb', line 85

def for_forms
  ret = []
  @hdoc.search("//form").each { |f| ret << Form.new(f) }
  ret
end
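
A hedged usage sketch (page_body stands in for a previously fetched response body; the accessors of the returned objects are documented on WWMD::Form):

# illustrative only: enumerate the forms found on a page
scrape = WWMD::Scrape.new(page_body)
scrape.for_forms.each { |form| p form }   # each entry wraps one <form> node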

#for_javascript_links ⇒ Object

scrape the page for <script src=""> tags



# File 'lib/wwmd/page/scrape.rb', line 140

def for_javascript_links
  urls = []
  @hdoc.search("//script[@src]").each { |tag| urls << tag['src'] }
  urls.reject! { |url| File.extname(url).clip != ".js" }
  return urls
end
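
A hedged usage sketch (the sample markup is illustrative; src values that do not look like .js files are rejected):

# illustrative only: collect external javascript includes
html   = '<html><head><script src="/js/app.js"></script></head></html>'
scrape = WWMD::Scrape.new(html)
js     = scrape.for_javascript_links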

#for_javascript_redirect ⇒ Object

scrape the page for a script tag that contains a bare location.href assignment (used to redirect the page)



# File 'lib/wwmd/page/scrape.rb', line 173

def for_javascript_redirect
  redirs = []
  @hdoc.search("//script").each do |scr|
    scr.inner_html.scan(/.*location.href\s*=\s*['"]([^'"]+)['"]/i).each { |x| redirs += x }
  end
  if redirs.size > 1
    STDERR.puts "PARSE ERROR: more than one javascript redirect"
    return "ERR"
  end
  return redirs.first if not redirs.empty?
  return nil
end
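
A hedged usage sketch of handling the three possible return values (the sample markup is illustrative):

# illustrative only: detect a script-driven redirect
html   = %q{<script>location.href = '/login';</script>}
scrape = WWMD::Scrape.new(html)
case (target = scrape.for_javascript_redirect)
when nil   then puts 'no javascript redirect'
when 'ERR' then puts 'more than one redirect on the page'
else            puts "redirects to #{target}"
end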

#for_links(reject = true) ⇒ Object

use xpath searches to get:

  • //a href

  • //area href

  • //frame src

  • //iframe src

  • //form action

  • //meta refresh content urls

then get //script tags and regexp out links in javascript function calls from elem.inner_html



# File 'lib/wwmd/page/scrape.rb', line 100

def for_links(reject=true)
  self.urls_from_xpath("//a","href").each { |url| @links << url };      # get <a href=""> elements
  self.urls_from_xpath("//area","href").each { |url| @links << url };   # get <area href=""> elements
  self.urls_from_xpath("//frame","src").each { |url| @links << url };   # get <frame src=""> elements
  self.urls_from_xpath("//iframe","src").each { |url| @links << url };  # get <iframe src=""> elements
  self.urls_from_xpath("//form","action").each { |url| @links << url }; # get <form action=""> elements

  # <meta> refresh
  @hdoc.search("//meta").each do |meta|
    next if meta['http-equiv'] != "refresh"
    next if not (content = meta['content'].split(/=/)[1])
    @links << content.strip
  end

  # add urls from onclick handlers
  @hdoc.search("*[@onclick]").each do |onclick|
    LINKS_REGEXP.each do |re|
      self.urls_from_regexp(onclick['onclick'],re).each do |url|
        @links << url
      end
    end
  end

  # add urls_from_regexp (limit to <script> tags (elem.inner_html))
  @hdoc.search("//script").each do |scr|
    LINKS_REGEXP.each do |re|
      self.urls_from_regexp(scr.inner_html,re).each { |url| @links << url }
    end
  end

  # re-define urls_from_helper in what you mix in
  begin
    self.urls_from_helper
  end

  self.reject_links; # reject links we don't care about
  return @links
end
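
A hedged usage sketch (html stands in for a fetched page body):

# illustrative only: gather every link the scraper recognizes
scrape = WWMD::Scrape.new(html)
scrape.for_links.each { |url| puts url }   # results are also kept in scrape.links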

#for_meta_refresh ⇒ Object

scrape the page for a meta refresh tag and return the url from the content attribute, or nil if none is present



# File 'lib/wwmd/page/scrape.rb', line 153

def for_meta_refresh
  has_mr = @hdoc.search("//meta").map { |x| x['http-equiv'] }.include?('Refresh')
  if has_mr
    urls = @hdoc.search("//meta[@content]").map { |x| x['content'].split(";",2)[1] }
    if urls.size > 1
      STDERR.puts "PARSE ERROR: more than one meta refresh tag"
      return "ERR"
    end
    k,v = urls.first.split("=",2)
    if k.upcase.strip != "URL"
      STDERR.puts "PARSE ERROR: content attribute of meta refresh does not contain url"
      return "ERR"
    end
    return v.strip
  else
    return nil
  end
end
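
A hedged usage sketch (the sample markup is illustrative; note that the http-equiv check in the source is case-sensitive):

# illustrative only: follow a meta refresh if one is present
html   = '<meta http-equiv="Refresh" content="0;URL=/next.php">'
scrape = WWMD::Scrape.new(html)
target = scrape.for_meta_refresh   # url string, "ERR" on multiple tags, or nil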

#reject_links ⇒ Object

NEED to move this to external configuration

list of urls we don't care to store in our links list



# File 'lib/wwmd/page/scrape.rb', line 66

def reject_links
  putw "WARN: override reject_links in helper script" if @warn
  default_reject_links
end
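
A hypothetical helper-script override (the logout filter is an example, not part of the library):

# reopen the class in your helper script to change the filtering;
# this keeps the stock rules and additionally drops logout links
class WWMD::Scrape
  def reject_links
    default_reject_links
    @links.reject! { |url| url.to_s =~ /logout/i }
  end
end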

#reset(page) ⇒ Object

reset this scrape object (called by WWMD::Page)



# File 'lib/wwmd/page/scrape.rb', line 31

def reset(page)
  @page = page
  @hdoc = HDOC.parse(@page)
  @links = Array.new
end
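
A hedged usage sketch (first_html and second_html stand in for two fetched responses):

# illustrative only: reuse one Scrape instance across two pages
scrape = WWMD::Scrape.new(first_html)
first  = scrape.for_links.dup
scrape.reset(second_html)   # re-parses and starts a fresh links array
second = scrape.for_links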

#urls_from_helper ⇒ Object

define an urls_from_helper method in your task-specific script



# File 'lib/wwmd/page/scrape.rb', line 192

def urls_from_helper
  putw "WARN: Please set an urls_from_helper override in your helper script" if @warn
  return nil
end
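
A hypothetical helper-script override (the data-url attribute is an example of site-specific markup, not something the library knows about):

# reopen the class in your helper script to feed extra urls into @links
class WWMD::Scrape
  def urls_from_helper
    @hdoc.search("*[@data-url]").each { |el| @links << el['data-url'] }
  end
end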

#urls_from_regexp(content, re, split = 0) ⇒ Object

scan the passed string for the passed regular expression and return the extracted urls as an array



# File 'lib/wwmd/page/scrape.rb', line 39

def urls_from_regexp(content,re,split=0)
  ret = []
  scrape = content.scan(re)
  scrape.each do |url|
    # cheat and take split string(,)[split]
    add = url.to_s.split(',')[split].gsub(/['"]/, '')
    next if (add == '' || add.nil?)
    ret << add
  end
  return ret
end

#urls_from_xpath(xpath, attr) ⇒ Object

xpath search for tags and return the values of the passed attribute

urls_from_xpath("//a","href")


# File 'lib/wwmd/page/scrape.rb', line 53

def urls_from_xpath(xpath,attr)
  ret = []
  @hdoc.search(xpath).each do |elem|
    url = elem[attr]
    next if url.empty?
    ret << url.strip
  end
  return ret
end
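
A hedged usage sketch (the img/src pair is an arbitrary choice; any tag and attribute work):

# illustrative only: grab the src of every <img> on the page
scrape     = WWMD::Scrape.new(html)
image_urls = scrape.urls_from_xpath("//img[@src]", "src")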

#warnings ⇒ Object

renamed class variable (for backward compat)



# File 'lib/wwmd/page/scrape.rb', line 187

def warnings#:nodoc:
  return @warn
end