Method: Websitary::Configuration#push_hrefs

Defined in:
lib/websitary/configuration.rb

#push_hrefs(url, hpricot, &condition) ⇒ Object

Scan the hpricot document for hrefs and push them onto @todo if not already included.



867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
# File 'lib/websitary/configuration.rb', line 867

def push_hrefs(url, hpricot, &condition)
    begin
        $logger.debug "push_refs: #{url}"
        return if robots?(hpricot, 'nofollow') or is_excluded?(url)
        depth = url_get(url, :depth)
        return if depth and depth <= 0
        uri0  = URI.parse(url)
        # pn0   = Pathname.new(guess_dir(File.expand_path(uri0.path)))
        pn0   = Pathname.new(guess_dir(uri0.path))
        (hpricot / 'a').each do |a|
            next if a['rel'] == 'nofollow'
            href = clean_url(a['href'])
            next if href.nil? or href == url or is_excluded?(href)
            uri  = URI.parse(href)
            pn   = guess_dir(uri.path)
            href = rewrite_href(href, url, uri0, pn0, true)
            curl = canonic_url(href)
            next if !href or href.nil? or @done.include?(curl) or @todo.include?(curl)
            # pn   = Pathname.new(guess_dir(File.expand_path(uri.path)))
            uri  = URI.parse(href)
            pn   = Pathname.new(guess_dir(uri.path))
            next unless condition.call(uri0, pn0, uri, pn)
            next unless robots_allowed?(curl, uri)
            opts = @urls[url].dup
            # opts[:title] = File.basename(curl)
            opts[:title] = [opts[:title], File.basename(curl)].join(' - ')
            opts[:depth] = depth - 1 if depth and depth >= 0
            # opts[:sleep] = delay if delay
            url_set(curl, opts)
            to_do curl
        end
    rescue Exception => e
        # $logger.error e  #DBG#
        $logger.error e.message
        $logger.debug e.backtrace
    end
end