Class: WWMD::Scrape
- Inherits:
-
Object
- Object
- WWMD::Scrape
- Defined in:
- lib/wwmd/page/scrape.rb
Instance Attribute Summary collapse
-
#debug ⇒ Object
Returns the value of attribute debug.
-
#hdoc ⇒ Object
readonly
Returns the value of attribute hdoc.
-
#jlinks ⇒ Object
links to javascript includes.
-
#links ⇒ Object
links found on page.
-
#warn ⇒ Object
Returns the value of attribute warn.
Instance Method Summary collapse
-
#default_reject_links ⇒ Object
default reject links (override using reject_links in helper script).
-
#for_comments ⇒ Object
scan page for comment fields.
-
#for_forms ⇒ Object
return an array of Form objects for forms on page.
-
#for_javascript_links ⇒ Object
scrape the page for <script src=""> tags.
-
#for_javascript_redirect ⇒ Object
scrape the page for a script tag that contains a bare location.href tag (to redirect the page).
-
#for_links(reject = true) ⇒ Object
use xpath searches to get: //a href, //area href, //frame src, //iframe src, //form action, and //meta refresh content urls; then get //script tags and regexp out links in javascript function calls from elem.inner_html.
-
#for_meta_refresh ⇒ Object
scrape the page for a meta refresh tag and return the url from the contents attribute or nil.
-
#initialize(page = '<>') ⇒ Scrape
constructor
create a new scrape object using passed HTML.
-
#reject_links ⇒ Object
NEED to move this to external configuration.
-
#reset(page) ⇒ Object
reset this scrape object (called by WWMD::Page).
-
#urls_from_helper ⇒ Object
define an urls_from_helper method in your task specific script.
-
#urls_from_regexp(content, re, split = 0) ⇒ Object
scan the passed string for the configured regular expressions and return them as an array.
-
#urls_from_xpath(xpath, attr) ⇒ Object
xpath search for tags and return the passed attribute, e.g. urls_from_xpath("//a","href").
-
#warnings ⇒ Object
renamed class variable (for backward compat).
Constructor Details
Instance Attribute Details
#debug ⇒ Object
Returns the value of attribute debug.
15 16 17 |
# File 'lib/wwmd/page/scrape.rb', line 15 def debug @debug end |
#hdoc ⇒ Object (readonly)
Returns the value of attribute hdoc.
19 20 21 |
# File 'lib/wwmd/page/scrape.rb', line 19 def hdoc @hdoc end |
#jlinks ⇒ Object
links to javascript includes
18 19 20 |
# File 'lib/wwmd/page/scrape.rb', line 18 def jlinks @jlinks end |
#links ⇒ Object
links found on page
17 18 19 |
# File 'lib/wwmd/page/scrape.rb', line 17 def links @links end |
#warn ⇒ Object
Returns the value of attribute warn.
16 17 18 |
# File 'lib/wwmd/page/scrape.rb', line 16 def warn @warn end |
Instance Method Details
#default_reject_links ⇒ Object
default reject links (override using reject_links in helper script)
72 73 74 75 76 77 78 79 80 81 82 |
# File 'lib/wwmd/page/scrape.rb', line 72
# Default link filter (override using reject_links in a helper script).
#
# Removes, in place, every entry of @links that is nil, has a ".css" or
# ".pdf" extension, contains "javascript:" or "mailto:" (any case),
# contains a square bracket, or has a line starting with "#".
#
# Returns whatever Array#reject! returns: @links when something was
# removed, nil when nothing was.
def default_reject_links
  @links.reject! do |link|
    next true if link.nil?
    next true if [".css", ".pdf"].include?(link.extname)

    [/javascript:/i, /mailto:/i, /[\[\]]/, /^#/].any? { |pattern| link =~ pattern }
  end
end
#for_comments ⇒ Object
scan page for comment fields
148 149 150 |
# File 'lib/wwmd/page/scrape.rb', line 148
# Scan the raw page text for HTML comments.
#
# Returns an Array of Strings, one per comment, holding the text between
# the opening "<!--" and closing "-->" markers (whitespace preserved).
# Returns an empty Array when the page contains no comments.
def for_comments
  # String#scan with a capture group yields one-element Arrays. Take the
  # captured text itself; Array#to_s would give the inspect form
  # ('["text"]') on Ruby 1.9 and later.
  @page.scan(/\<!\s*--(.*?)--\s*\>/m).map { |captures| captures.first }
end
#for_forms ⇒ Object
return an array of Form objects for forms on page
85 86 87 88 89 |
# File 'lib/wwmd/page/scrape.rb', line 85
# Build one Form object per <form> element found in the parsed document.
#
# Returns an Array of Form instances (empty when the page has no forms).
def for_forms
  @hdoc.search("//form").map { |form_node| Form.new(form_node) }
end
#for_javascript_links ⇒ Object
scrape the page for <script src=“”> tags
140 141 142 143 144 145 |
# File 'lib/wwmd/page/scrape.rb', line 140
# Collect the src attribute of every <script src=""> tag whose extension,
# after String#clip is applied to it, is exactly ".js".
#
# Returns an Array of src strings.
def for_javascript_links
  sources = @hdoc.search("//script[@src]").map { |script| script['src'] }
  sources.select { |src| File.extname(src).clip == ".js" }
end
#for_javascript_redirect ⇒ Object
scrape the page for a script tag that contains a bare location.href tag (to redirect the page)
173 174 175 176 177 178 179 180 181 182 183 184 |
# File 'lib/wwmd/page/scrape.rb', line 173
# Look through every <script> body for a quoted assignment to
# location.href (a javascript redirect).
#
# Returns:
# * the target URL (String) when exactly one assignment is found
# * nil when none is found
# * "ERR" (after printing a message to STDERR) when more than one is found
def for_javascript_redirect
  targets = []
  @hdoc.search("//script").each do |script|
    # The dot is escaped so only a literal "location.href" matches; an
    # unescaped dot also matched identifiers such as "location_href".
    script.inner_html.scan(/.*location\.href\s*=\s*['"]([^'"]+)['"]/i).each do |captures|
      targets.concat(captures)
    end
  end

  if targets.size > 1
    STDERR.puts "PARSE ERROR: more than one javascript redirect"
    return "ERR"
  end

  # Array#first is nil for an empty array, which covers "no redirect".
  targets.first
end
#for_links(reject = true) ⇒ Object
use xpath searches to get
-
//a href
-
//area href
-
//frame src
-
//iframe src
-
//form action
-
//meta refresh content urls
then get //script tags and regexp out links in javascript function calls from elem.inner_html
100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
# File 'lib/wwmd/page/scrape.rb', line 100
# Gather every link-like URL on the page into @links and return @links.
#
# Sources, in order:
# * href/src/action attributes of <a>, <area>, <frame>, <iframe>, <form>
# * the URL part of <meta http-equiv="refresh" content="...">
# * LINKS_REGEXP matches inside onclick attributes
# * LINKS_REGEXP matches inside <script> bodies
# * whatever urls_from_helper adds (re-define it in what you mix in)
#
# reject:: when true (the default) reject_links is run afterwards to drop
#          links we don't care about; pass false to keep the raw list.
def for_links(reject=true)
  # elements whose attribute value is the URL itself
  [
    ["//a",      "href"],
    ["//area",   "href"],
    ["//frame",  "src"],
    ["//iframe", "src"],
    ["//form",   "action"]
  ].each do |xpath, attr|
    @links.concat(urls_from_xpath(xpath, attr))
  end

  # <meta> refresh: content looks like "0;url=/target"
  @hdoc.search("//meta").each do |meta|
    # http-equiv values are case-insensitive; the attribute may be absent
    next unless meta['http-equiv'].to_s.strip.downcase == "refresh"
    # split once only, so a URL with its own "=" (query string) stays whole
    target = meta['content'].to_s.split("=", 2)[1]
    next if target.nil?
    @links << target.strip
  end

  # add urls from onclick handlers
  @hdoc.search("*[@onclick]").each do |elem|
    LINKS_REGEXP.each do |re|
      @links.concat(urls_from_regexp(elem['onclick'], re))
    end
  end

  # add urls_from_regexp (limit to <script> tags (elem.inner_html))
  @hdoc.search("//script").each do |scr|
    LINKS_REGEXP.each do |re|
      @links.concat(urls_from_regexp(scr.inner_html, re))
    end
  end

  # re-define urls_from_helper in what you mix in
  urls_from_helper

  # honour the reject flag (it used to be accepted but ignored)
  reject_links if reject
  @links
end
#for_meta_refresh ⇒ Object
scrape the page for a meta refresh tag and return the url from the contents attribute or nil
153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 |
# File 'lib/wwmd/page/scrape.rb', line 153
# Scrape the page for a meta refresh tag and return the URL from its
# content attribute.
#
# Returns:
# * nil when the page has no <meta http-equiv="refresh"> tag
# * the URL (String, stripped) when exactly one such tag carries
#   content of the form "<seconds>;url=<target>"
# * "ERR" (after printing a message to STDERR) when there is more than
#   one refresh tag, or when the content has no "url=" part
def for_meta_refresh
  # Only refresh tags are considered; other <meta content="..."> tags
  # (description, keywords, ...) must not count. http-equiv is compared
  # case-insensitively.
  refresh_tags = @hdoc.search("//meta").select do |tag|
    tag['http-equiv'].to_s.strip.downcase == "refresh"
  end
  return nil if refresh_tags.empty?

  if refresh_tags.size > 1
    STDERR.puts "PARSE ERROR: more than one meta refresh tag"
    return "ERR"
  end

  # content is "<delay>;<key>=<url>"; to_s guards a missing attribute
  directive = refresh_tags.first['content'].to_s.split(";", 2)[1]
  key, target = directive.to_s.split("=", 2)
  if key.nil? || target.nil? || key.upcase.strip != "URL"
    STDERR.puts "PARSE ERROR: content attribute of meta refresh does not contain url"
    return "ERR"
  end

  target.strip
end
#reject_links ⇒ Object
NEED to move this to external configuration
list of urls we don’t care to store in our links list
66 67 68 69 |
# File 'lib/wwmd/page/scrape.rb', line 66
# Drop the urls we don't care to store in our links list.
#
# This default implementation prints a warning through putw when @warn is
# set, then delegates to default_reject_links and returns its result.
# Helper scripts are expected to override it.
def reject_links
  if @warn
    putw "WARN: override reject_links in helper script"
  end
  default_reject_links
end
#reset(page) ⇒ Object
reset this scrape object (called by WWMD::Page)
31 32 33 34 35 |
# File 'lib/wwmd/page/scrape.rb', line 31
# Point this scrape object at a new page (called by WWMD::Page).
#
# page:: the raw page text; stored in @page and parsed with HDOC.parse
#        into @hdoc.
#
# Also replaces @links with a fresh empty Array, which is the return
# value.
def reset(page)
  @page  = page
  @hdoc  = HDOC.parse(page)
  @links = []
end
#urls_from_helper ⇒ Object
define an urls_from_helper method in your task specific script
192 193 194 195 |
# File 'lib/wwmd/page/scrape.rb', line 192
# Hook for task specific scripts: define an urls_from_helper method there
# to add extra urls.
#
# This default implementation adds nothing. It prints a warning through
# putw when @warn is set and always returns nil.
def urls_from_helper
  if @warn
    putw "WARN: Please set an urls_from_helper override in your helper script"
  end
  nil
end
#urls_from_regexp(content, re, split = 0) ⇒ Object
scan the passed string for the configured regular expressions and return them as an array
39 40 41 42 43 44 45 46 47 48 49 |
# File 'lib/wwmd/page/scrape.rb', line 39
# Scan the passed string with a regular expression and return the urls
# found as an Array of Strings.
#
# content:: the String to scan
# re::      the Regexp; when it has capture groups the captured text of
#           each match is joined, otherwise the whole match is used
# split::   each match is split on "," and the field at this index is
#           kept (default 0, the first field)
#
# Single and double quotes are removed from the kept field. Matches with
# no field at that index, or an empty field, are skipped.
def urls_from_regexp(content,re,split=0)
  found = []
  content.scan(re).each do |match|
    # With capture groups scan yields Arrays. Join them explicitly,
    # because Array#to_s is the inspect form on Ruby 1.9 and later and
    # would leak brackets and quotes into the url.
    text = match.is_a?(Array) ? match.join : match.to_s

    # cheat and take split string(,)[split]
    field = text.split(',')[split]
    next if field.nil? # no such field: nothing to add

    url = field.gsub(/['"]/, '')
    found << url unless url.empty?
  end
  found
end
#urls_from_xpath(xpath, attr) ⇒ Object
xpath search for tags and return the passed attribute
urls_from_xpath("//a","href")
53 54 55 56 57 58 59 60 61 |
# File 'lib/wwmd/page/scrape.rb', line 53
# xpath search for tags and return the values of the passed attribute
#   urls_from_xpath("//a","href")
#
# xpath:: the search expression handed to @hdoc.search
# attr::  the attribute name to read from each matching element
#
# Returns an Array of stripped attribute values. Elements that lack the
# attribute, or whose value is empty or only whitespace, are skipped.
def urls_from_xpath(xpath,attr)
  found = []
  @hdoc.search(xpath).each do |elem|
    # elem[attr] is nil when the tag has no such attribute (for example
    # <a name="top">); to_s keeps that from raising.
    url = elem[attr].to_s.strip
    found << url unless url.empty?
  end
  found
end
#warnings ⇒ Object
renamed class variable (for backward compat)
187 188 189 |
# File 'lib/wwmd/page/scrape.rb', line 187
# Backward-compatible alias for the warn reader (the variable was
# renamed): returns @warn.
def warnings#:nodoc:
  @warn
end