Class: Birdwatcher::Modules::Urls::Crawl

Inherits:
Birdwatcher::Module
Defined in:
lib/birdwatcher/modules/urls/crawl.rb

Constant Summary

PAGE_TITLE_REGEX =
/<title>(.*?)<\/title>/i
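
As a standalone illustration (not part of the source), the constant captures the
first <title> element, case-insensitively and non-greedily:

# Hypothetical one-off use of PAGE_TITLE_REGEX; the html string below is
# example data, not anything produced by Birdwatcher.
html  = %(<html><head><title>Example Domain</title></head></html>)
title = html[Birdwatcher::Modules::Urls::Crawl::PAGE_TITLE_REGEX, 1]
# => "Example Domain"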

Constants inherited from Birdwatcher::Module

Birdwatcher::Module::MODULE_PATH

Constants included from Concerns::Concurrency

Concerns::Concurrency::DEFAULT_THREAD_POOL_SIZE

Constants included from Concerns::Core

Concerns::Core::DATA_DIRECTORY

Class Method Summary

Instance Method Summary

Methods inherited from Birdwatcher::Module

_file_path, _file_path=, descendants, #execute, inherited, meta, meta=, module_by_path, module_paths, modules, path

Methods included from Concerns::Concurrency

included, #thread_pool

Methods included from Concerns::Persistence

included, #save_status, #save_user

Methods included from Concerns::Presentation

included, #make_status_summary_output, #make_url_summary_output, #make_user_details_output, #make_user_summary_output, #output_status_summary, #output_user_details, #output_user_summary, #page_text

Methods included from Concerns::Outputting

#confirm, #error, #fatal, included, #info, #line_separator, #newline, #output, #output_formatted, #task, #warn

Methods included from Concerns::Util

#escape_html, #excerpt, included, #parse_time, #pluralize, #strip_control_characters, #strip_html, #suppress_output, #suppress_warnings, #time_ago_in_words, #unescape_html

Methods included from Concerns::Core

#console, #current_workspace, #current_workspace=, #database, included, #klout_client, #read_data_file, #twitter_client

Class Method Details

.info ⇒ Object



# File 'lib/birdwatcher/modules/urls/crawl.rb', line 61

def self.info
<<-INFO
The URL Crawler module crawls shared URLs and enriches them with additional
information:

  * HTTP status code (200, 404, 500, etc.)
  * Content type (text/html, application/pdf, etc.)
  * Page title (if HTML document)

Page titles can be included in the Word Cloud generated with the
#{'statuses/word_cloud'.bold} module.

#{'CAUTION:'.bold} Depending on the users in the workspace, it might not be safe
to blindly request shared URLs. Consider using the #{'PROXY_ADDR'.bold} and #{'PROXY_PORT'.bold}
module options.
INFO
end
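
As a usage sketch, assuming Birdwatcher's Metasploit-style console commands
(use, set, run); the prompt and proxy values below are illustrative, not
defaults shipped with the module:

birdwatcher> use urls/crawl
birdwatcher[urls/crawl]> set PROXY_ADDR 127.0.0.1
birdwatcher[urls/crawl]> set PROXY_PORT 8080
birdwatcher[urls/crawl]> run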

Instance Method Details

#run ⇒ Object



# File 'lib/birdwatcher/modules/urls/crawl.rb', line 79

def run
  if option_setting("RETRY_FAILED")
    urls = current_workspace.urls_dataset
      .where("crawled_at IS NULL or (crawled_at IS NOT NULL AND http_status IS NULL)")
      .order(Sequel.desc(:created_at))
  else
    urls = current_workspace.urls_dataset
      .where(:crawled_at => nil)
      .order(Sequel.desc(:created_at))
  end
  if urls.empty?
    error("There are currently no URLs in this workspace")
    return false
  end
  threads     = thread_pool(option_setting("THREADS").to_i)
  http_client = Birdwatcher::HttpClient.new(
    :timeout        => option_setting("TIMEOUT").to_i,
    :retries        => option_setting("RETRIES").to_i,
    :user_agent     => option_setting("USER_AGENT"),
    :http_proxyaddr => option_setting("PROXY_ADDR"),
    :http_proxyport => (option_setting("PROXY_PORT") ? option_setting("PROXY_PORT").to_i : nil),
    :http_proxyuser => option_setting("PROXY_USER"),
    :http_proxypass => option_setting("PROXY_PASS")
  )
  urls.each do |url|
    threads.process do
      begin
        Timeout::timeout(option_setting("TIMEOUT").to_i * 2) do
          response = http_client.do_head(url.url)
          url.final_url    = response.url
          url.http_status  = response.status
          url.content_type = response.headers["content-type"]
          if response.headers.key?("content-type") && response.headers["content-type"].include?("text/html")
            url.title = extract_page_title(http_client.do_get(response.url).body)
          end
          url.crawled_at = Time.now
          url.save
          info("Crawled #{url.url.bold} (#{response.status} - #{response.headers["content-type"]})")
        end
      rescue => e
        url.crawled_at = Time.now
        url.save
        error("Crawling failed for #{url.url.bold} (#{e.class})")
      end
    end
  end
  threads.shutdown
end
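
The call to extract_page_title above is defined elsewhere in crawl.rb and is not
shown in this excerpt. A minimal sketch, assuming it simply applies
PAGE_TITLE_REGEX together with the unescape_html helper from Concerns::Util,
might look like this (the actual implementation may differ):

# Hypothetical reconstruction of extract_page_title; the real method in
# lib/birdwatcher/modules/urls/crawl.rb may normalize whitespace or handle
# encodings differently.
def extract_page_title(page_contents)
  if match = page_contents.match(PAGE_TITLE_REGEX)
    unescape_html(match[1].strip)
  end
end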