Method: Websitary::Configuration#push_hrefs
Defined in: lib/websitary/configuration.rb
#push_hrefs(url, hpricot, &condition) ⇒ Object
Scan the hpricot document for hrefs and push them onto @todo if they are not already included.
# File 'lib/websitary/configuration.rb', line 867

def push_hrefs(url, hpricot, &condition)
  begin
    $logger.debug "push_refs: #{url}"
    return if robots?(hpricot, 'nofollow') or is_excluded?(url)
    depth = url_get(url, :depth)
    return if depth and depth <= 0
    uri0 = URI.parse(url)
    # pn0 = Pathname.new(guess_dir(File.expand_path(uri0.path)))
    pn0 = Pathname.new(guess_dir(uri0.path))
    (hpricot / 'a').each do |a|
      next if a['rel'] == 'nofollow'
      href = clean_url(a['href'])
      next if href.nil? or href == url or is_excluded?(href)
      uri = URI.parse(href)
      pn = guess_dir(uri.path)
      href = rewrite_href(href, url, uri0, pn0, true)
      curl = canonic_url(href)
      next if !href or href.nil? or @done.include?(curl) or @todo.include?(curl)
      # pn = Pathname.new(guess_dir(File.expand_path(uri.path)))
      uri = URI.parse(href)
      pn = Pathname.new(guess_dir(uri.path))
      next unless condition.call(uri0, pn0, uri, pn)
      next unless robots_allowed?(curl, uri)
      opts = @urls[url].dup
      # opts[:title] = File.basename(curl)
      opts[:title] = [opts[:title], File.basename(curl)].join(' - ')
      opts[:depth] = depth - 1 if depth and depth >= 0
      # opts[:sleep] = delay if delay
      url_set(curl, opts)
      to_do curl
    end
  rescue Exception => e
    # $logger.error e #DBG#
    $logger.error e.message
    $logger.debug e.backtrace
  end
end
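
The condition block is how callers restrict which discovered links actually get queued: it is invoked as condition.call(uri0, pn0, uri, pn), where uri0 and pn0 are the referring page's URI and directory path and uri and pn are the candidate link's URI and directory path, and the link is pushed only when the block returns true. The following is a minimal sketch, not taken from the websitary sources, of what such a predicate could look like; the names same_site_below, cfg, and doc are placeholders introduced for illustration.

require 'uri'
require 'pathname'

# Hypothetical predicate: accept a link only if it stays on the same host
# and its directory lies at or below the referring page's directory.
same_site_below = lambda do |uri0, pn0, uri, pn|
  same_host = uri.host.nil? || uri.host == uri0.host
  below     = pn.to_s.start_with?(pn0.to_s)
  same_host && below
end

# Hypothetical call site: cfg stands for a Websitary::Configuration instance
# and doc for the Hpricot document already fetched and parsed for url.
# cfg.push_hrefs(url, doc, &same_site_below)

Note that pn0 and pn arrive as Pathname objects built by guess_dir, so the string prefix comparison above is only an approximation of a "below this directory" test; any predicates the library itself passes to push_hrefs are defined elsewhere in the source and may differ.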