Class: BaseService

Inherits:
Object
  • Object
show all
Defined in:
lib/services.rb

Direct Known Subclasses

SecuredService

Constant Summary collapse

@@service_classes =
{}

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(name, conf, auth_conf = {}, log_level = Logger::WARN, url_type_cache = {}) ⇒ BaseService

Returns a new instance of BaseService.



23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# File 'lib/services.rb', line 23

# Builds a service instance: sets up caches and the logger, merges +conf+
# over the built-in defaults, resolves the auth settings, derives the PDF
# source host/path and finally calls #authenticate.
#
# @param name [String] service name, used as the logger's progname
# @param conf [Hash] service configuration, overrides the defaults below
#   (the nested "pdfs" hash is merged key by key)
# @param auth_conf [Hash] named auth configurations; consulted when
#   conf["auth"] is a String
# @param log_level [Integer] severity threshold for the logger
# @param url_type_cache [Hash] url => type cache, stored as given (not copied)
def initialize name, conf, auth_conf = {}, log_level = Logger::WARN, url_type_cache = {}
  # Ask Ruby for the home directory instead of spawning a shell for `echo ~`.
  @base_dir = Dir.home
  @uri_cache = {}
  @file_header_cache = {}
  @type_cache = url_type_cache
  @processed_pdfs = {}
  @name = name
  @log = Logger.new(STDOUT)
  @log.progname = name
  @log.level = log_level

  defaults = {
    "type" => "base",
    "exclude_file_endings" => [".css", ".js", ".txt", ".rss", ".atom"],
    "access_pause" => { #in seconds
      "min" => 0.1,
      "max" => 0.3
    },
    "pdfs" => {
      "src_folder" => "abc.de/a", #is relative to entry_url base dir if starts with dot
      "dest_folder" => "abcd",
      "download_once" => true
    },
    "cookie_jar" => "cookies.txt",
    "user_agent" => "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:30.0) Gecko/20100101 Firefox/30.0",
    "entry_url" => "",
    "auth" => "base" #references auth conf or {"user" => "", "pass" => ""}
  }
  @conf = defaults.merge conf
  # Hash#merge is shallow, so the nested "pdfs" settings are merged separately.
  @conf["pdfs"] = defaults["pdfs"].merge(conf["pdfs"]) unless conf["pdfs"].nil?

  if @conf["auth"].is_a? String
    auth_name = @conf["auth"]
    @conf["auth"] = auth_conf[auth_name]
    # Log only the referenced key: the auth config itself holds credentials.
    @log.debug "Load auth '#{auth_name}' from auth config"
  end

  # A source folder starting with a dot is relative to the entry url's directory.
  if @conf["pdfs"]["src_folder"].start_with? "."
    entry_uri = get_uri @conf["entry_url"]
    entry_path_url = entry_uri.scheme + "://" + entry_uri.host + File.dirname(entry_uri.path)
    @conf["pdfs"]["src_folder"] = "#{entry_path_url}/#{@conf["pdfs"]["src_folder"]}"
    @log.info "Source folder is #{@conf["pdfs"]["src_folder"]}"
  end

  # Cache host and path of the source folder; #is_excluded and #get_dest_path read them.
  src_url_parsed = URI.parse(@conf["pdfs"]["src_folder"])
  @conf["pdfs"]["src_path"] = src_url_parsed.path
  @conf["pdfs"]["src_host"] = src_url_parsed.host
  @log.info "Start authentication"
  authenticate
  @log.info "Authentication completed"
end

Class Method Details

.add_service_class(name, description, service_class, needs_auth = true, url_regex = nil) ⇒ Object



285
286
287
288
289
290
291
292
# File 'lib/services.rb', line 285

# Registers a service implementation under +name+ in the class-level registry.
#
# @param name [String] registry key
# @param description [String] text describing the service
# @param service_class [Class] class instantiated by .get_service
# @param needs_auth [Boolean] stored as-is under "needs_auth"
# @param url_regex [Regexp, nil] pattern consulted by .get_service_for_url
# @return [Hash] the entry that was stored
def self.add_service_class(name, description, service_class, needs_auth = true, url_regex = nil)
  entry = {}
  entry["class"] = service_class
  entry["url_regex"] = url_regex
  entry["description"] = description
  entry["needs_auth"] = needs_auth
  @@service_classes[name] = entry
end

.get_service(name, conf, auth_conf = {}, log_level = Logger::WARN, url_type_cache = {}) ⇒ Object



76
77
78
79
80
81
82
83
# File 'lib/services.rb', line 76

# Instantiates the service class registered for conf["type"].
#
# All arguments are forwarded unchanged to the registered class's constructor.
#
# @return [Object] a new instance of the registered service class
# @raise [RuntimeError] when no service is registered under conf["type"]
def self.get_service(name, conf, auth_conf = {}, log_level = Logger::WARN, url_type_cache = {})
  registered = @@service_classes[conf["type"]]
  raise "Unknown service #{conf["type"]}" if registered.nil?

  registered["class"].new(name, conf, auth_conf, log_level, url_type_cache)
end

.get_service_for_url(url) ⇒ Object



310
311
312
313
314
315
316
317
# File 'lib/services.rb', line 310

# Finds the name of the first registered service whose "url_regex" matches +url+.
#
# Services registered without a regex are skipped. The previous condition
# (`unless regex == nil && regex =~ url`) could never be truthy, so the first
# registered service was returned for every url.
#
# @param url [String] url to classify
# @return [String] the matching service name, or "base" when nothing matches
def self.get_service_for_url url
  @@service_classes.each do |name, service|
    pattern = service["url_regex"]
    return name if pattern && pattern =~ url
  end
  "base"
end

.get_services ⇒ Object



306
307
308
# File 'lib/services.rb', line 306

# Returns a shallow copy of the service registry, so adding or removing keys
# on the result does not touch the registry itself (the entry hashes are shared).
#
# @return [Hash] name => registration entry
def self.get_services
  registry = @@service_classes
  registry.clone
end

Instance Method Details

#access_pause_sleep ⇒ Object



221
222
223
224
225
226
227
# File 'lib/services.rb', line 221

# Sleeps for a random duration between the configured "access_pause" "min"
# and "max" values (seconds) and logs the chosen duration at debug level.
#
# @return whatever +sleep+ returns
def access_pause_sleep
  pause = @conf["access_pause"]
  lower = pause["min"]
  upper = pause["max"]
  duration = Random.rand * (upper - lower) + lower
  @log.debug("Sleep #{duration} seconds to behave a bit more human")
  sleep(duration)
end

#authenticate ⇒ Object



85
86
87
# File 'lib/services.rb', line 85

# Authentication hook called at the end of #initialize.
# This implementation does nothing.
#
# @return [String] always an empty string
def authenticate
  ''
end

#execute ⇒ Object



89
90
91
92
93
# File 'lib/services.rb', line 89

# Runs the service: parses the page at the configured "entry_url" via
# #parse_html_page, logging before and after.
#
# @return whatever the final log call returns
def execute
  start_url = @conf["entry_url"]
  @log.info("Start grawling #{start_url}")
  parse_html_page(start_url)
  @log.info("Completed grawling #{start_url}")
end

#fetch_url(url, output_file = nil, curl_params = "") ⇒ Object

Executes curl to fetch the requested url.

Parameters:

  • url

    requested url

  • output_file (defaults to: nil)

    output destination; if nil, the output gets returned by this method



123
124
125
126
127
128
129
# File 'lib/services.rb', line 123

# Executes curl to fetch the requested url. curl is run from @base_dir, so a
# relative cookie jar or output path is resolved against that directory.
#
# @param url [String] requested url
# @param output_file [String, nil] output destination; if nil, curl writes to
#   stdout and the output is returned by this method
# @param curl_params [String] extra curl command line arguments, passed to the
#   shell as written
# @return [String] standard output of the curl command
def fetch_url url, output_file=nil, curl_params=""
  require 'shellwords'
  # Escape url and output path: an unquoted url whose query string contains
  # "&" would otherwise be split into several shell commands.
  target = Shellwords.escape(url.to_s)
  destination = Shellwords.escape((output_file || "-").to_s)
  curl_params = "#{@auth_app} #{curl_params} --silent --user-agent \"#{@conf["user_agent"]}\""
  curl_params += " -b #{@conf["cookie_jar"]} -c #{@conf["cookie_jar"]} -L -o #{destination} #{target}"
  @log.debug "Call curl on #{url}"
  @log.debug "Curl parameters '#{curl_params}'"
  `cd #{@base_dir}; curl #{curl_params}`
end

#get_dest_path(url) ⇒ Object



278
279
280
281
282
283
# File 'lib/services.rb', line 278

# Maps a url to its destination path: the configured "src_path" prefix is cut
# off the url's path (by length) and the remainder is appended to the
# configured "dest_folder", separated by "/".
#
# @param url [String] url of the file
# @return [String] destination path
def get_dest_path(url)
  remote_path = get_uri(url).path
  pdf_conf = @conf["pdfs"]
  prefix_length = pdf_conf["src_path"].length
  remainder = remote_path[prefix_length, remote_path.length - prefix_length]
  pdf_conf["dest_folder"] + "/" + remainder
end

#get_field_value(html, field) ⇒ Object



173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
# File 'lib/services.rb', line 173

# Reads the "value" attribute of the element(s) whose id is +field+.
# When several elements match, the value of the last one wins.
#
# @param html [String] html document
# @param field [String] element id to look up
# @return [String] the attribute value; "" when nothing matches or when
#   parsing raises (the error is logged)
def get_field_value(html, field)
  document = begin
    Nokogiri::HTML html
  rescue => ex
    @log.error "Parsing html failed"
    @log.error ex
    return ""
  end
  document.css("##{field}").reduce("") do |_previous, node|
    node.attributes["value"].to_s
  end
end

#get_file_header(url) ⇒ Object



229
230
231
232
233
234
235
236
237
238
239
240
241
# File 'lib/services.rb', line 229

# Fetches the response headers of +url+ (curl -I) and caches them per
# normalised url. After a real fetch the configured access pause is applied.
#
# @param url [String] url to inspect
# @return [Hash] header name => value. Lines without ": " (such as the status
#   line) map to nil. With several header blocks in the output, later values
#   overwrite earlier ones.
def get_file_header url
  url = url_chomp url
  if @file_header_cache[url].nil?
    raw = fetch_url url, "-", "-I"
    response = {}
    raw.split("\r\n").each do |line|
      # Split on the first ": " only, so values that contain ": " stay intact.
      name, value = line.split(": ", 2)
      next if name.nil? # blank separator line between header blocks
      response[name] = value
    end
    @file_header_cache[url] = response
    @log.info "Fetch header of #{url}"
    access_pause_sleep
  end
  @file_header_cache[url]
end

#get_path_url(url) ⇒ Object



243
244
245
246
# File 'lib/services.rb', line 243

# Returns the path of +url+ plus its query string, without scheme and host.
#
# URI#query is nil (not "") when the url has no query, so the previous
# comparison against "" appended a dangling "?" to every query-less url.
#
# @param url [String] absolute url
# @return [String] "/path" or "/path?query"
def get_path_url url
  parsed = get_uri url
  query = parsed.query
  return parsed.path if query.nil? || query.empty?

  "#{parsed.path}?#{query}"
end

#get_type(url) ⇒ Object



189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
# File 'lib/services.rb', line 189

# Classifies +url+ as "pdf", "html" or "" (anything else, including excluded
# urls). Results for non-excluded urls are stored in @type_cache.
#
# A url ending in ".pdf" (any case) counts as pdf without a header request;
# otherwise the Content-Type response header decides. A missing Content-Type
# now yields "" instead of raising NoMethodError, and the header name is
# matched case-insensitively so lower-case header names are recognised too.
#
# @param url [String] url to classify
# @return [String] "pdf", "html" or ""
def get_type url
  return "" if is_excluded url
  if @type_cache[url].nil?
    @type_cache[url] =
      if url.upcase.end_with?(".PDF")
        "pdf"
      else
        headers = get_file_header(url) || {}
        entry = headers.find { |key, _value| key.to_s.casecmp("content-type").zero? }
        content_type = entry ? entry.last.to_s : ""
        if content_type.start_with?("application/pdf", "application/x-pdf")
          "pdf"
        elsif content_type.start_with?("text/html")
          "html"
        else
          ""
        end
      end
  end
  @type_cache[url]
end

#get_uri(url) ⇒ Object



294
295
296
297
298
299
# File 'lib/services.rb', line 294

# Parses +url+ into a URI object, memoising the result in @uri_cache so each
# distinct url string is parsed only once.
#
# @param url [String] url to parse
# @return [URI::Generic] the cached URI object
def get_uri(url)
  @uri_cache[url] ||= URI.parse(url)
end

#is_excluded(url) ⇒ Object



214
215
216
217
218
219
# File 'lib/services.rb', line 214

# Tells whether +url+ must be ignored: its path ends with one of the
# configured "exclude_file_endings", it is on a different host than the pdf
# source folder, or its path lies outside the source folder's path.
#
# The previous version handed the endings Array to start_with?, which raises
# TypeError on every call, and it tested the prefix rather than the ending.
#
# @param url [String] absolute url
# @return [Boolean] true when the url should be skipped
def is_excluded url
  parsed_url = get_uri url
  path = parsed_url.path.to_s # path is nil for opaque uris such as mailto:
  pdf_conf = @conf["pdfs"]
  path.end_with?(*@conf["exclude_file_endings"]) ||
    parsed_url.host != pdf_conf["src_host"] ||
    !path.start_with?(pdf_conf["src_path"])
end

#is_html_url(url) ⇒ Object



210
211
212
# File 'lib/services.rb', line 210

# Tells whether #get_type classifies +url+ as "html".
#
# @param url [String] url to check
# @return [Boolean]
def is_html_url(url)
  detected = get_type(url)
  detected == "html"
end

#is_pdf_url(url) ⇒ Object



206
207
208
# File 'lib/services.rb', line 206

# Tells whether #get_type classifies +url+ as "pdf".
#
# @param url [String] url to check
# @return [Boolean]
def is_pdf_url(url)
  detected = get_type(url)
  detected == "pdf"
end

#parse_html(url, html) ⇒ Object



146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# File 'lib/services.rb', line 146

# Collects the pdf and html links found in a page.
#
# Every <a href> is resolved against +url+, normalised with #url_chomp and
# classified with #is_pdf_url / #is_html_url; links that are neither are
# dropped. A link whose processing raises is skipped and logged at debug level.
#
# @param url [String] url of the page, used as base for relative links
# @param html [String] page content
# @return [Hash] {'pdf' => [urls], 'html' => [urls]}; both lists are empty
#   when parsing the document raises
def parse_html(url, html)
  found = {'pdf' => [], 'html' => []}
  document = begin
    Nokogiri::HTML html
  rescue => ex
    @log.error "Parsing html from url #{url} failed"
    return found
  end
  document.css('a[href]').each do |anchor|
    begin
      target = url_chomp(URI.join(url, anchor.attributes["href"]).to_s).to_s
      @log.debug "Process link #{target}"
      kind = if is_pdf_url(target)
               'pdf'
             elsif is_html_url(target)
               'html'
             end
      next if kind.nil?
      found[kind] << target
      @log.debug "#{target} is #{kind}"
    rescue => ex
      @log.debug "Omit #{anchor}"
    end
  end
  found
end

#parse_html_page(url, url_cache = Set.new) ⇒ Object



95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# File 'lib/services.rb', line 95

# Fetches the page at +url+, then follows its html links recursively and
# afterwards hands its pdf links to #process_pdf.
#
# @param url [String] page to crawl
# @param url_cache [Set] normalised urls already visited; shared across the
#   recursion so each page is fetched at most once
# @return [Array, nil] the page's pdf links, or nil when the page was already
#   visited or fetching it raised
def parse_html_page(url, url_cache = Set.new)
  url = url_chomp(url)
  # Set#add? returns nil when the url is already known.
  return unless url_cache.add?(url)

  @log.info "Fetch and parse #{url}"
  page = begin
    body = fetch_url(url)
    access_pause_sleep
    body
  rescue => ex
    @log.error "Cannot fetch #{url}"
    @log.error ex
    return
  end
  found = parse_html(url, page)
  found["html"].each { |html_link| parse_html_page(html_link, url_cache) }
  found["pdf"].each { |pdf_link| process_pdf(pdf_link) }
end

#post(url, params, output_file = nil, curl_params = "") ⇒ Object



131
132
133
134
135
136
137
138
139
140
141
142
143
144
# File 'lib/services.rb', line 131

# Sends +params+ as form-encoded data to +url+ through #fetch_url
# (curl --data).
#
# @param url [String] target url
# @param params [Hash] field name => value; both are CGI-escaped
# @param output_file [String, nil] forwarded to #fetch_url
# @param curl_params [String] extra curl arguments, placed before --data
# @return [String] result of #fetch_url, or "" when it raises (the error is logged)
def post(url, params, output_file = nil, curl_params = "")
  encoded = params.map { |key, value| "#{CGI::escape(key)}=#{CGI::escape(value)}" }.join("&")
  begin
    fetch_url(url, output_file, "#{curl_params} --data \"#{encoded}\"")
  rescue => ex
    @log.error "Failed to POST #{url} with data #{params}"
    @log.error ex
    ""
  end
end

#process_pdf(url) ⇒ Object



248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
# File 'lib/services.rb', line 248

# Downloads the pdf at +url+ to its destination path unless that is unnecessary.
#
# With "download_once" set, an existing destination file is never downloaded
# again. Otherwise the file is downloaded when the Last-Modified header is
# newer than the local file's mtime; a missing header counts as "now".
#
# Existence checks and directory creation are resolved against @base_dir,
# because #fetch_url runs curl from that directory. Previously they were
# resolved against the process working directory. File.exists? is replaced by
# File.exist?, since the former was removed in Ruby 3.2.
#
# @param url [String] url of the pdf
# @return [void]
def process_pdf url
  require 'fileutils'
  url = url_chomp url
  return unless @processed_pdfs[url].nil?
  @log.info "Process pdf #{url}"
  dest = get_dest_path url
  # Absolute destinations stay untouched; relative ones live below @base_dir.
  local_dest = File.expand_path(dest, @base_dir)
  if not @conf["pdfs"]["download_once"]
    header_date = get_file_header(url)["Last-Modified"]
    header_time = header_date.nil? ? Time.now.to_i : Time.parse(header_date).to_i
    file_time = File.exist?(local_dest) ? File.mtime(local_dest).to_i : 0
    @log.info "Process pdf #{url} with mtime #{header_time}, file mtime #{file_time}"
    if file_time >= header_time
      @log.info "Destination file #{dest} isn't younger => no download"
      return
    end
  elsif File.exist? local_dest
    @log.info "Destination file exists => no download"
    return
  end
  FileUtils.mkdir_p File.dirname(local_dest)
  @log.info "Destination file #{dest} is older => download"
  begin
    @log.debug(fetch_url url, dest)
  rescue => ex
    @log.error "Downloading #{url} failed"
    @log.error ex
  end
  @processed_pdfs[url] = dest
  access_pause_sleep
end

#url_chomp(url) ⇒ Object



301
302
303
304
# File 'lib/services.rb', line 301

# Normalises +url+ to scheme://host[:port]/path[?query], dropping the
# fragment and any user info.
#
# A port that differs from the scheme's default is kept; it used to be
# dropped, which made the normalised url point at a different server.
#
# @param url [String] absolute url
# @return [String] normalised url
def url_chomp url
  uri = get_uri url
  port_suffix = uri.port && uri.port != uri.default_port ? ":#{uri.port}" : ""
  uri.scheme + "://" + uri.host + port_suffix + uri.path + (uri.query.nil? ? "" : "?#{uri.query}")
end