Class: BaseService
- Inherits:
-
Object
- Object
- BaseService
- Defined in:
- lib/services.rb
Direct Known Subclasses
Constant Summary collapse
- @@service_classes =
{}
Class Method Summary collapse
- .add_service_class(name, description, service_class, needs_auth = true, url_regex = nil) ⇒ Object
- .get_service(name, conf, auth_conf = {}, log_level = Logger::WARN, url_type_cache = {}) ⇒ Object
- .get_service_for_url(url) ⇒ Object
- .get_services ⇒ Object
Instance Method Summary collapse
- #access_pause_sleep ⇒ Object
- #authenticate ⇒ Object
- #execute ⇒ Object
-
#fetch_url(url, output_file = nil, curl_params = "") ⇒ Object
Executes curl to fetch the requested URL and returns curl's output.
- #get_dest_path(url) ⇒ Object
- #get_field_value(html, field) ⇒ Object
- #get_file_header(url) ⇒ Object
- #get_path_url(url) ⇒ Object
- #get_type(url) ⇒ Object
- #get_uri(url) ⇒ Object
-
#initialize(name, conf, auth_conf = {}, log_level = Logger::WARN, url_type_cache = {}) ⇒ BaseService
constructor
A new instance of BaseService.
- #is_excluded(url) ⇒ Object
- #is_html_url(url) ⇒ Object
- #is_pdf_url(url) ⇒ Object
- #parse_html(url, html) ⇒ Object
- #parse_html_page(url, url_cache = Set.new) ⇒ Object
- #post(url, params, output_file = nil, curl_params = "") ⇒ Object
- #process_pdf(url) ⇒ Object
- #url_chomp(url) ⇒ Object
Constructor Details
#initialize(name, conf, auth_conf = {}, log_level = Logger::WARN, url_type_cache = {}) ⇒ BaseService
Returns a new instance of BaseService.
23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
# File 'lib/services.rb', line 23
#
# Creates a service instance: sets up the caches and the logger, overlays
# +conf+ on the built-in defaults, resolves the auth settings, derives the PDF
# source host/path from the source folder URL and finally calls #authenticate.
#
# @param name [String] service name; also used as the logger's progname
# @param conf [Hash] settings overriding the defaults defined below
# @param auth_conf [Hash] named auth entries; consulted when conf["auth"] is a String
# @param log_level [Integer] Logger severity threshold
# @param url_type_cache [Hash] url => type cache, stored as @type_cache
def initialize(name, conf, auth_conf = {}, log_level = Logger::WARN, url_type_cache = {})
  # Resolve the home directory without spawning a shell (was `echo ~`).
  @base_dir = Dir.home
  @uri_cache = {}
  @file_header_cache = {}
  @type_cache = url_type_cache
  @processed_pdfs = {}
  @name = name
  @log = Logger.new(STDOUT)
  @log.progname = name
  @log.level = log_level

  defaults = {
    "type" => "base",
    "exclude_file_endings" => [".css", ".js", ".txt", ".rss", ".atom"],
    "access_pause" => { # in seconds
      "min" => 0.1,
      "max" => 0.3
    },
    "pdfs" => {
      "src_folder" => "abc.de/a", # is relative to entry_url base dir if starts with dot
      "dest_folder" => "abcd",
      "download_once" => true
    },
    "cookie_jar" => "cookies.txt",
    "user_agent" => "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:30.0) Gecko/20100101 Firefox/30.0",
    "entry_url" => "",
    "auth" => "base" # references auth conf or {"user" => "", "pass" => ""}
  }

  merged = defaults.merge conf
  # Nested sections are merged key by key so a partial override (e.g. only
  # "min" of "access_pause") keeps the remaining default values.
  ["pdfs", "access_pause"].each do |section|
    override = conf[section]
    merged[section] = defaults[section].merge(override) unless override.nil?
  end
  @conf = merged

  if @conf["auth"].is_a? String
    auth_name = @conf["auth"]
    @conf["auth"] = auth_conf[auth_name]
    # Log only the entry name: auth_conf itself may hold credentials.
    @log.debug "Load auth '#{auth_name}' from auth config"
  end

  if @conf["pdfs"]["src_folder"].start_with? "."
    entry_uri = get_uri @conf["entry_url"]
    entry_path_url = entry_uri.scheme + "://" + entry_uri.host + File.dirname(entry_uri.path)
    # NOTE(review): the "." segment is kept verbatim in the resulting URL;
    # confirm that later path prefix checks against src_path expect that.
    @conf["pdfs"]["src_folder"] = "#{entry_path_url}/#{@conf["pdfs"]["src_folder"]}"
    @log.info "Source folder is #{@conf["pdfs"]["src_folder"]}"
  end

  src_url_parsed = URI.parse(@conf["pdfs"]["src_folder"])
  @conf["pdfs"]["src_path"] = src_url_parsed.path
  @conf["pdfs"]["src_host"] = src_url_parsed.host

  @log.info "Start authentication"
  authenticate
  @log.info "Authentication completed"
end
Class Method Details
.add_service_class(name, description, service_class, needs_auth = true, url_regex = nil) ⇒ Object
285 286 287 288 289 290 291 292 |
# File 'lib/services.rb', line 285
#
# Registers a service class in the class-level registry under +name+.
#
# @param name [String] registry key
# @param description [String] stored under "description"
# @param service_class [Class] stored under "class"
# @param needs_auth [Boolean] stored under "needs_auth"
# @param url_regex [Regexp, nil] stored under "url_regex"
# @return [Hash] the registry entry that was stored
def self.add_service_class(name, description, service_class, needs_auth = true, url_regex = nil)
  entry = {}
  entry["class"] = service_class
  entry["url_regex"] = url_regex
  entry["description"] = description
  entry["needs_auth"] = needs_auth
  @@service_classes[name] = entry
end
.get_service(name, conf, auth_conf = {}, log_level = Logger::WARN, url_type_cache = {}) ⇒ Object
76 77 78 79 80 81 82 83 |
# File 'lib/services.rb', line 76
#
# Instantiates the service class registered for conf["type"].
#
# @param name [String] passed to the service constructor
# @param conf [Hash] service configuration; its "type" selects the class
# @param auth_conf [Hash] passed to the service constructor
# @param log_level [Integer] passed to the service constructor
# @param url_type_cache [Hash] passed to the service constructor
# @return [Object] a new instance of the registered class
# @raise [RuntimeError] when no class is registered for conf["type"]
def self.get_service(name, conf, auth_conf = {}, log_level = Logger::WARN, url_type_cache = {})
  entry = @@service_classes[conf["type"]]
  raise "Unknown service #{conf["type"]}" if entry == nil

  entry["class"].new name, conf, auth_conf, log_level, url_type_cache
end
.get_service_for_url(url) ⇒ Object
310 311 312 313 314 315 316 317 |
# File 'lib/services.rb', line 310
#
# Finds the name of the first registered service whose "url_regex" matches
# +url+. Entries without a regex are skipped.
#
# The previous condition (`unless regex == nil && regex =~ url`) could never
# be true, so the first registered service was returned for every url.
#
# @param url [String] url to classify
# @return [String] the matching service name, or "base" when nothing matches
def self.get_service_for_url(url)
  @@service_classes.each do |name, service|
    regex = service["url_regex"]
    return name if regex && regex =~ url
  end
  "base"
end
.get_services ⇒ Object
306 307 308 |
# File 'lib/services.rb', line 306
#
# Returns a shallow copy of the service registry, so adding or removing keys
# on the result does not touch the registry itself.
#
# @return [Hash] name => registry entry
def self.get_services
  registry = @@service_classes
  registry.clone
end
Instance Method Details
#access_pause_sleep ⇒ Object
221 222 223 224 225 226 227 |
# File 'lib/services.rb', line 221
#
# Sleeps for a random duration between the configured "access_pause" "min"
# and "max" values (in seconds).
#
# @return [Integer] the value returned by Kernel#sleep
def access_pause_sleep
  pause = @conf["access_pause"]
  lower = pause["min"]
  span = pause["max"] - lower
  duration = Random.rand * span + lower
  @log.debug "Sleep #{duration} seconds to behave a bit more human"
  sleep duration
end
#authenticate ⇒ Object
85 86 87 |
# File 'lib/services.rb', line 85
#
# Authentication hook called at the end of #initialize. The base
# implementation does nothing.
#
# @return [String] always the empty string
def authenticate
  ''
end
#execute ⇒ Object
89 90 91 92 93 |
# File 'lib/services.rb', line 89
#
# Runs the service: parses the configured "entry_url" page via
# #parse_html_page and logs start and completion.
def execute
  entry = @conf["entry_url"]
  @log.info "Start grawling #{entry}"
  parse_html_page entry
  @log.info "Completed grawling #{entry}"
end
#fetch_url(url, output_file = nil, curl_params = "") ⇒ Object
Executes curl to fetch the requested URL and returns curl's output.
123 124 125 126 127 128 129 |
# File 'lib/services.rb', line 123
require "shellwords"

# Executes curl (through the shell, from within @base_dir) to fetch +url+.
#
# The url, output file, cookie jar, user agent and base directory are
# shell-escaped, so characters such as "&", "?", spaces or "$(...)" in a url
# reach curl literally instead of being interpreted by the shell.
# @auth_app and +curl_params+ are ready-made shell fragments and are passed
# through unchanged.
#
# @param url [String] url to fetch
# @param output_file [String, nil] file curl writes to; nil or "-" means stdout
# @param curl_params [String] additional raw curl command line options
# @return [String] standard output of the command
def fetch_url(url, output_file = nil, curl_params = "")
  cookie_jar = Shellwords.escape(@conf["cookie_jar"])
  curl_params = "#{@auth_app} #{curl_params} --silent --user-agent #{Shellwords.escape(@conf["user_agent"])}"
  curl_params += " -b #{cookie_jar} -c #{cookie_jar} -L -o #{Shellwords.escape(output_file || "-")} #{Shellwords.escape(url)}"
  @log.debug "Call curl on #{url}"
  @log.debug "Curl parameters '#{curl_params}'"
  `cd #{Shellwords.escape(@base_dir)}; curl #{curl_params}`
end
#get_dest_path(url) ⇒ Object
278 279 280 281 282 283 |
# File 'lib/services.rb', line 278
#
# Maps +url+ to its destination path: the url path with the first
# src_path.length characters removed, appended to the configured
# "dest_folder" with a "/" in between.
#
# @param url [String] url of the file
# @return [String] destination path
def get_dest_path(url)
  remote_path = get_uri(url).path
  pdf_conf = @conf["pdfs"]
  prefix_length = pdf_conf["src_path"].length
  relative = remote_path[prefix_length, remote_path.length - prefix_length]
  pdf_conf["dest_folder"] + "/" + relative
end
#get_field_value(html, field) ⇒ Object
173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 |
# File 'lib/services.rb', line 173
#
# Extracts the "value" attribute of the element whose id is +field+.
# When several elements match, the last one wins.
#
# @param html [String] html document
# @param field [String] element id to look up
# @return [String] the attribute value; "" when parsing fails or nothing matches
def get_field_value(html, field)
  begin
    document = Nokogiri::HTML html
  rescue => ex
    @log.error "Parsing html failed"
    @log.error ex
    return ""
  end
  # Fold over all matches, keeping only the most recent value.
  document.css("##{field}").inject("") do |_previous, node|
    node.attributes["value"].to_s
  end
end
#get_file_header(url) ⇒ Object
229 230 231 232 233 234 235 236 237 238 239 240 241 |
# File 'lib/services.rb', line 229
#
# Fetches the response headers of +url+ (curl -I) and returns them as a hash,
# cached per normalized url.
#
# Each line is split at the first ": " only, so header values that themselves
# contain ": " are kept intact instead of being truncated. Lines without
# ": " (status line, blank line) end up as keys with a nil value.
#
# @param url [String] url to inspect
# @return [Hash] header name => header value
def get_file_header(url)
  url = url_chomp url
  if @file_header_cache[url].nil?
    header = fetch_url url, "-", "-I"
    response = {}
    header.split("\r\n").each do |line|
      field_name, field_value = line.split(": ", 2)
      response[field_name] = field_value
    end
    @file_header_cache[url] = response
    @log.info "Fetch header of #{url}"
    access_pause_sleep
  end
  @file_header_cache[url]
end
#get_path_url(url) ⇒ Object
243 244 245 246 |
# File 'lib/services.rb', line 243
#
# Returns the path of +url+ followed by "?query" when the url has a
# non-empty query string.
#
# URI#query is nil (not "") when the url has no query, so the query is
# checked for both nil and empty; previously a dangling "?" was appended to
# urls without a query.
#
# @param url [String] url to reduce
# @return [String] path plus optional query
def get_path_url(url)
  parsed = get_uri url
  query = parsed.query
  suffix = query.nil? || query.empty? ? "" : "?#{query}"
  parsed.path + suffix
end
#get_type(url) ⇒ Object
189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 |
# File 'lib/services.rb', line 189
#
# Classifies +url+ as "pdf", "html" or "" (anything else, or excluded).
# Results for non-excluded urls are stored in @type_cache.
#
# A url ending in ".pdf" (any case) is a pdf without a header request.
# Otherwise the Content-Type response header decides. The header name and the
# media type are matched case-insensitively. When the header is missing the
# result is "" and nothing is cached, where previously a NoMethodError on nil
# was raised.
#
# @param url [String] url to classify
# @return [String] "pdf", "html" or ""
def get_type(url)
  return "" if is_excluded url

  cached = @type_cache[url]
  return cached unless cached.nil?

  return @type_cache[url] = "pdf" if url.upcase.end_with?(".PDF")

  header_entry = get_file_header(url).find { |key, _value| key.to_s.casecmp("content-type") == 0 }
  content_type = header_entry && header_entry[1]
  # Unknown type (e.g. failed request): leave the cache untouched so a later
  # call can try again.
  return "" if content_type.nil?

  content_type = content_type.to_s.downcase
  @type_cache[url] =
    if content_type.start_with?("application/pdf", "application/x-pdf")
      "pdf"
    elsif content_type.start_with?("text/html")
      "html"
    else
      ""
    end
end
#get_uri(url) ⇒ Object
294 295 296 297 298 299 |
# File 'lib/services.rb', line 294
#
# Parses +url+ with URI.parse, memoizing the result in @uri_cache.
#
# @param url [String] url to parse
# @return [URI::Generic] the parsed (possibly cached) uri
def get_uri(url)
  known = @uri_cache[url]
  return known unless known == nil

  @uri_cache[url] = URI.parse url
end
#is_excluded(url) ⇒ Object
214 215 216 217 218 219 |
# File 'lib/services.rb', line 214
#
# Tells whether +url+ must be ignored. A url is excluded when
# * its path ends with one of the configured "exclude_file_endings", or
# * its host differs from the pdf source host, or
# * its path does not start with the pdf source path.
#
# The file-ending check previously called start_with? with the whole array
# as a single argument, which never matched an ending and raises a TypeError
# on current Ruby versions.
#
# @param url [String] url to check
# @return [Boolean] true when the url is excluded
def is_excluded(url)
  parsed_url = get_uri url
  path = parsed_url.path
  path.end_with?(*@conf["exclude_file_endings"]) ||
    parsed_url.host != @conf["pdfs"]["src_host"] ||
    !path.start_with?(@conf["pdfs"]["src_path"])
end
#is_html_url(url) ⇒ Object
210 211 212 |
# File 'lib/services.rb', line 210
#
# @param url [String] url to check
# @return [Boolean] true when #get_type classifies the url as "html"
def is_html_url(url)
  kind = get_type(url)
  kind == "html"
end
#is_pdf_url(url) ⇒ Object
206 207 208 |
# File 'lib/services.rb', line 206
#
# @param url [String] url to check
# @return [Boolean] true when #get_type classifies the url as "pdf"
def is_pdf_url(url)
  kind = get_type(url)
  kind == "pdf"
end
#parse_html(url, html) ⇒ Object
146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 |
# File 'lib/services.rb', line 146
#
# Collects the links of an html page, grouped by type.
#
# Every a[href] element is resolved against +url+, normalized with
# #url_chomp and classified with #is_pdf_url / #is_html_url. Links whose
# processing raises are skipped. The exception is now logged as well, so the
# reason a link was omitted is visible at debug level.
#
# @param url [String] url the html was fetched from (base for relative links)
# @param html [String] page content
# @return [Hash] {'pdf' => [urls], 'html' => [urls]}; both lists are empty
#   when the html cannot be parsed
def parse_html(url, html)
  links = {'pdf' => [], 'html' => []}
  begin
    doc = Nokogiri::HTML html
  rescue => ex
    @log.error "Parsing html from url #{url} failed"
    @log.error ex
    return links
  end
  doc.css('a[href]').each do |link|
    begin
      link_url = url_chomp(URI.join(url, link.attributes["href"]).to_s).to_s
      @log.debug "Process link #{link_url}"
      if is_pdf_url link_url
        links['pdf'] << link_url
        @log.debug "#{link_url} is pdf"
      elsif is_html_url link_url
        links['html'] << link_url
        @log.debug "#{link_url} is html"
      end
    rescue => ex
      @log.debug "Omit #{link}"
      @log.debug ex
    end
  end
  links
end
#parse_html_page(url, url_cache = Set.new) ⇒ Object
95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 |
# File 'lib/services.rb', line 95
#
# Fetches the page at +url+, follows its html links recursively and hands
# its pdf links to #process_pdf. Urls already contained in +url_cache+ are
# skipped, which prevents endless loops on cyclic links.
#
# @param url [String] page to crawl
# @param url_cache [Set] urls visited so far; shared across the recursion
def parse_html_page(url, url_cache = Set.new)
  url = url_chomp url
  # Set#add? returns nil when the url was already present.
  return unless url_cache.add?(url)

  @log.info "Fetch and parse #{url}"
  html = begin
    page = fetch_url url
    access_pause_sleep
    page
  rescue => ex
    @log.error "Cannot fetch #{url}"
    @log.error ex
    return
  end

  links = parse_html url, html
  links["html"].each { |html_link| parse_html_page html_link, url_cache }
  links["pdf"].each { |pdf_link| process_pdf pdf_link }
end
#post(url, params, output_file = nil, curl_params = "") ⇒ Object
131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
# File 'lib/services.rb', line 131
#
# Sends +params+ as form data to +url+ via #fetch_url (curl --data).
#
# Keys and values are converted with to_s before escaping, so symbols and
# numbers are accepted. On failure only the field names are logged: the
# values may contain credentials.
#
# @param url [String] target url
# @param params [Hash] form fields
# @param output_file [String, nil] passed to #fetch_url
# @param curl_params [String] additional raw curl options
# @return [String] the result of #fetch_url, or "" when it raises
def post(url, params, output_file = nil, curl_params = "")
  param = params.map do |key, value|
    "#{CGI.escape(key.to_s)}=#{CGI.escape(value.to_s)}"
  end.join("&")
  begin
    fetch_url url, output_file, "#{curl_params} --data \"#{param}\""
  rescue => ex
    @log.error "Failed to POST #{url} with data fields #{params.keys}"
    @log.error ex
    ""
  end
end
#process_pdf(url) ⇒ Object
248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 |
# File 'lib/services.rb', line 248
require "fileutils"

# Downloads the pdf at +url+ to its destination path, at most once per run.
#
# With "download_once" enabled an existing destination file is never
# downloaded again. Otherwise the Last-Modified header (current time when
# the header is absent) is compared with the file's mtime and the download
# is skipped when the local file is not older.
#
# Uses File.exist? (File.exists? was removed in Ruby 3.2) and creates the
# destination directory with FileUtils.mkdir_p instead of interpolating the
# url-derived path into a shell command.
#
# NOTE(review): the existence checks and mkdir_p resolve a relative
# destination against the process working directory, while #fetch_url runs
# curl from @base_dir. Confirm both are meant to be the same directory.
#
# @param url [String] url of the pdf
def process_pdf(url)
  url = url_chomp url
  return unless @processed_pdfs[url].nil?

  @log.info "Process pdf #{url}"
  dest = get_dest_path url
  if !@conf["pdfs"]["download_once"]
    header_date = get_file_header(url)["Last-Modified"]
    header_time = header_date.nil? ? Time.now.to_i : Time.parse(header_date).to_i
    file_time = File.exist?(dest) ? File.mtime(dest).to_i : 0
    @log.info "Process pdf #{url} with mtime #{header_time}, file mtime #{file_time}"
    if file_time >= header_time
      @log.info "Destination file #{dest} isn't younger => no download"
      return
    end
  elsif File.exist? dest
    @log.info "Destination file exists => no download"
    return
  end
  @log.info "Destination file #{dest} is older => download"
  begin
    # Inside the rescue so a directory that cannot be created is logged
    # rather than aborting the whole crawl.
    dest_dir = File.dirname(dest)
    FileUtils.mkdir_p(dest_dir) unless File.exist?(dest_dir)
    @log.debug(fetch_url(url, dest))
  rescue => ex
    @log.error "Downloading #{url} failed"
    @log.error ex
  end
  @processed_pdfs[url] = dest
  access_pause_sleep
end
#url_chomp(url) ⇒ Object
301 302 303 304 |
# File 'lib/services.rb', line 301
#
# Normalizes +url+ to scheme://host[:port]/path[?query], dropping the
# fragment and any user info.
#
# An explicit non-default port is kept; previously it was dropped, so
# "http://host:8080/a" was rewritten to "http://host/a".
#
# The parts are joined with String#+ on purpose: a url without scheme or
# host (e.g. a mailto: link) raises here, and #parse_html rescues that to
# skip the link.
#
# @param url [String] url to normalize
# @return [String] the normalized url
def url_chomp(url)
  uri = get_uri url
  port = uri.port
  authority = port.nil? || port == uri.default_port ? uri.host : "#{uri.host}:#{port}"
  query = uri.query.nil? ? "" : "?#{uri.query}"
  uri.scheme + "://" + authority + uri.path + query
end