Class: Birdwatcher::Modules::Urls::Crawl
- Inherits: Birdwatcher::Module
  - Object
  - Birdwatcher::Module
  - Birdwatcher::Modules::Urls::Crawl
- Defined in:
- lib/birdwatcher/modules/urls/crawl.rb
Constant Summary
- PAGE_TITLE_REGEX = /<title>(.*?)<\/title>/i
Constants inherited from Birdwatcher::Module
Birdwatcher::Module::MODULE_PATH
Constants included from Concerns::Concurrency
Concerns::Concurrency::DEFAULT_THREAD_POOL_SIZE
Constants included from Concerns::Core
Concerns::Core::DATA_DIRECTORY
Class Method Summary collapse
Instance Method Summary collapse
Methods inherited from Birdwatcher::Module
_file_path, _file_path=, descendants, #execute, inherited, meta, meta=, module_by_path, module_paths, modules, path
Methods included from Concerns::Concurrency
Methods included from Concerns::Persistence
included, #save_status, #save_user
Methods included from Concerns::Presentation
included, #make_status_summary_output, #make_url_summary_output, #make_user_details_output, #make_user_summary_output, #output_status_summary, #output_user_details, #output_user_summary, #page_text
Methods included from Concerns::Outputting
#confirm, #error, #fatal, included, #info, #line_separator, #newline, #output, #output_formatted, #task, #warn
Methods included from Concerns::Util
#escape_html, #excerpt, included, #parse_time, #pluralize, #strip_control_characters, #strip_html, #suppress_output, #suppress_warnings, #time_ago_in_words, #unescape_html
Methods included from Concerns::Core
#console, #current_workspace, #current_workspace=, #database, included, #klout_client, #read_data_file, #twitter_client
Class Method Details
.info ⇒ Object
61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
# File 'lib/birdwatcher/modules/urls/crawl.rb', line 61
#
# Help text describing what the URL Crawler module does.
#
# NOTE(review): this listing was whitespace-collapsed; the line breaks and
# bullet indentation inside the heredoc are reconstructed — confirm against
# the original file before relying on the exact layout.
#
# @return [String] the module description, with some terms passed through
#   String#bold
def self.info
  <<-INFO
The URL Crawler module crawls shared URLs and enriches them with additional
information:

  * HTTP status code (200, 404, 500, etc.)
  * Content type (application/html, application/pdf, etc)
  * Page title (if HTML document)

Page titles can be included in the Word Cloud generated with the
#{'statuses/word_cloud'.bold} module.

#{'CAUTION:'.bold} Depending on the users in the workspace, it might not be safe
to blindly request shared URLs. Consider using the #{'PROXY_ADDR'.bold}
and #{'PROXY_PORT'.bold} module options.
  INFO
end
Instance Method Details
#run ⇒ Object
79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
# File 'lib/birdwatcher/modules/urls/crawl.rb', line 79
#
# Crawls the workspace's pending URLs on a thread pool and stores what was
# learned about each one.
#
# A URL is pending when its +crawled_at+ is NULL. With the RETRY_FAILED option
# set, URLs that were crawled but have no +http_status+ are included as well.
# Pending URLs are processed newest first (+created_at+ descending).
#
# For every URL a HEAD request is issued and +final_url+, +http_status+ and
# +content_type+ are recorded. When the content type contains "text/html" the
# final URL is fetched with GET and the page title is extracted. Any
# StandardError raised while crawling (including Timeout::Error) still stamps
# +crawled_at+ and saves the record, then reports the failure.
#
# @return [false] when there is nothing to crawl
# @return [Object] otherwise, whatever the thread pool's +shutdown+ returns
def run
  pending = current_workspace.urls_dataset
  pending =
    if option_setting("RETRY_FAILED")
      pending.where("crawled_at IS NULL or (crawled_at IS NOT NULL AND http_status IS NULL)")
    else
      pending.where(:crawled_at => nil)
    end
  urls = pending.order(Sequel.desc(:created_at))

  if urls.empty?
    error("There are currently no URLs in this workspace")
    return false
  end

  # Read the options once up front instead of once per URL inside the workers.
  threads    = thread_pool(option_setting("THREADS").to_i)
  timeout    = option_setting("TIMEOUT").to_i
  proxy_port = option_setting("PROXY_PORT")

  http_client = Birdwatcher::HttpClient.new(
    :timeout        => timeout,
    :retries        => option_setting("RETRIES").to_i,
    :user_agent     => option_setting("USER_AGENT"),
    :http_proxyaddr => option_setting("PROXY_ADDR"),
    :http_proxyport => (proxy_port ? proxy_port.to_i : nil),
    :http_proxyuser => option_setting("PROXY_USER"),
    :http_proxypass => option_setting("PROXY_PASS")
  )

  urls.each do |url|
    threads.process do
      begin
        # Overall guard is twice the per-request timeout — presumably to leave
        # room for the HEAD plus the optional GET; confirm with the author.
        Timeout.timeout(timeout * 2) do
          response = http_client.do_head(url.url)
          url.final_url   = response.url
          url.http_status = response.status

          # Read the header once; it is nil when the server sent none.
          content_type = response.headers["content-type"]
          url.content_type = content_type

          # Only HTML documents are fetched in full to extract a title.
          if content_type && content_type.include?("text/html")
            url.title = extract_page_title(http_client.do_get(response.url).body)
          end

          url.crawled_at = Time.now
          url.save
          info("Crawled #{url.url.bold} (#{response.status} - #{content_type})")
        end
      rescue => e
        # Stamp the attempt even on failure so the URL is not picked up again
        # unless RETRY_FAILED is set and no HTTP status was recorded.
        url.crawled_at = Time.now
        url.save
        error("Crawling failed for #{url.url.bold} (#{e.class})")
      end
    end
  end

  threads.shutdown
end