Class: Fetcher::Worker

Inherits:
Object
  • Object
show all
Includes:
LogUtils::Logging
Defined in:
lib/fetcher/worker.rb

Instance Method Summary collapse

Constructor Details

#initialize(old_logger_do_not_use = nil) ⇒ Worker

todo/fix: remove the logger argument from the constructor

use logutils instead


13
14
15
16
17
18
19
20
21
# File 'lib/fetcher/worker.rb', line 13

# Sets up a worker with an empty cache and caching switched off.
#
# old_logger_do_not_use - deprecated and ignored; passing anything but nil
#                         only prints a deprecation notice to stdout
#                         (logging is handled via logutils instead).
def initialize( old_logger_do_not_use=nil )
  unless old_logger_do_not_use.nil?
    puts "*** deprecated API call [Fetcher.initialize] - do NOT pass in logger; no longer required/needed; logger arg will get removed"
  end

  ### cache for conditional get (e.g. etags and last-modified headers/checks)
  @cache     = {}
  @use_cache = false
end

Instance Method Details

#cacheObject



26
# File 'lib/fetcher/worker.rb', line 26

# Returns the cache store (the hash kept in @cache) used for conditional GETs.
def cache
  @cache
end

#clear_cacheObject

note: use cache[ uri ] = hash holding the headers plus the body plus the status code (410, etc.)

cache[ uri ]


25
# File 'lib/fetcher/worker.rb', line 25

# Drops all cache entries by replacing @cache with a brand-new empty hash.
# Returns the new (empty) hash.
def clear_cache
  @cache = Hash.new
end

#copy(src, dest) ⇒ Object



52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# File 'lib/fetcher/worker.rb', line 52

# Downloads +src+ (via get_response) and writes the response body to the
# file +dest+.
#
# Nothing is written unless the response code is '200'.
#
# The file is opened in text mode ('w') only for text/* content types;
# every other (or missing) content type is written in binary mode ('wb')
# so that no newline conversion can alter the bytes (matters on Windows
# for pdf, zip, audio, etc. - not just for images).
#
# Returns nil if the response code is not '200'; otherwise the value of
# the File.open block (that is, the result of f.write).
#
# todo/fix: return true - success or false - error!!!
# todo: add file protocol - why? why not??
def copy( src, dest )
  logger.debug "fetch - copy src: #{src} to dest: #{dest}"

  response = get_response( src )

  # on error return; do NOT copy file; sorry
  return  if response.code != '200'

  # check for content type; only text/* is safe to write in text mode
  if response.content_type.to_s =~ %r{\Atext/}i
    flags = 'w'
  else
    logger.debug '  switching to binary'
    flags = 'wb'
  end

  File.open( dest, flags ) do |f|
    f.write( response.body )
  end
end

#get(src) ⇒ Object



31
32
33
34
35
36
# File 'lib/fetcher/worker.rb', line 31

# Fetches +src+ and hands back whatever get_response returns
# (the HTTP response with code, message, body, etc.).
def get( src )
  logger.debug( "fetch - get(_response) src: #{src}" )

  response = get_response( src )
  response
end

#get_response(src) ⇒ Object

todo: add file protocol



81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
# File 'lib/fetcher/worker.rb', line 81

# Issues an HTTP(S) GET for +src+, follows redirects and returns the
# response of the last request made.
#
# - honors the proxy configured in the env variable HTTP_PROXY
#   (or lower-case http_proxy)
# - if use_cache? is true and cache[ uri.to_s ] holds an entry, its 'etag'
#   and 'last-modified' values are sent as If-None-Match / If-Modified-Since
#   headers (conditional GET); the cache is only read here, never written
# - follows 301, 302, 303, 307 and 308 redirects; a relative location is
#   resolved against the current uri; at most 4 requests get issued
# - a redirect that carries no location header is reported as an error and
#   its response is returned (instead of crashing while parsing nil)
#
# Returns the response object of the last request (200, 304 or any other
# status - check response.code).
# Raises ArgumentError ('HTTP redirect too deep') if a 5th request would be needed.
#
# todo: add file protocol
def get_response( src )
  uri = URI.parse( src )

  # honor proxy env variable HTTP_PROXY; also try the lower-case variant (for *nix systems)
  proxy_url = ENV['HTTP_PROXY'] || ENV['http_proxy']

  if proxy_url
    proxy = URI.parse( proxy_url )
    logger.debug "using net http proxy: proxy.host=#{proxy.host}, proxy.port=#{proxy.port}"
    if proxy.user && proxy.password
      logger.debug "  using credentials: proxy.user=#{proxy.user}, proxy.password=****"
    else
      logger.debug "  using no credentials"
    end
    http_proxy = Net::HTTP::Proxy( proxy.host, proxy.port, proxy.user, proxy.password )
  else
    logger.debug "using direct net http access; no proxy configured"
    http_proxy = Net::HTTP::Proxy( nil, nil, nil, nil )   # no proxy settings to pass along
  end

  redirect_limit = 4
  response = nil

  loop do
    raise ArgumentError, 'HTTP redirect too deep' if redirect_limit == 0
    redirect_limit -= 1

    http = http_proxy.new( uri.host, uri.port )

    logger.debug "GET #{uri.request_uri} uri=#{uri}, redirect_limit=#{redirect_limit}"

    headers = { 'User-Agent' => "fetcher gem v#{VERSION}" }

    if use_cache?
      ## check for existing cache entry in cache store (lookup by uri)
      ## todo/fix: normalize uri!!!! - how?
      ##  - remove query_string ?? fragment ?? why? why not??

      ## note:  using uri.to_s  should return full uri e.g. http://example.com/page.html
      cache_entry = cache[ uri.to_s ]
      if cache_entry
        logger.info "found cache entry for >#{uri.to_s}<"
        if cache_entry['etag']
          logger.info "adding header If-None-Match (etag) >#{cache_entry['etag']}< for conditional GET"
          headers['If-None-Match'] = cache_entry['etag']
        end
        if cache_entry['last-modified']
          logger.info "adding header If-Modified-Since (last-modified) >#{cache_entry['last-modified']}< for conditional GET"
          headers['If-Modified-Since'] = cache_entry['last-modified']
        end
      end
    end

    request = Net::HTTP::Get.new( uri.request_uri, headers )
    if uri.instance_of? URI::HTTPS
      http.use_ssl = true
      ## NOTE(review): VERIFY_NONE switches off certificate verification, that is,
      ##   the server's identity is NOT checked; kept as-is for backward
      ##   compatibility - consider VERIFY_PEER
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
    end

    response = http.request( request )

    case response.code
    when '200'
      logger.debug "#{response.code} #{response.message}"
      logger.debug "  content_type: #{response.content_type}, content_length: #{response.content_length}"
      break  # will return response
    when '304'   # -- Not Modified - for conditional GETs (using etag,last-modified)
      logger.debug "#{response.code} #{response.message}"
      break  # will return response
    when '301', '302', '303', '307', '308'
      # 301 = moved permanently
      # 302 = found
      # 303 = see other
      # 307 = temporary redirect
      # 308 = permanent redirect
      location = response['location']
      logger.debug "#{response.code} #{response.message} location=#{location}"

      if location.nil? || location.empty?
        # nowhere to go; do NOT try to parse a missing location
        puts "*** error - fetch HTTP - #{response.code} #{response.message} - redirect without location header"
        break  # will return response
      end

      newuri = URI.parse( location )
      if newuri.relative?
        logger.debug "url relative; try to make it absolute"
        newuri = uri + location
      end
      uri = newuri
    else
      puts "*** error - fetch HTTP - #{response.code} #{response.message}"
      break  # will return response
    end
  end

  response
end

#read(src) ⇒ Object



39
40
41
42
43
44
45
46
47
48
49
# File 'lib/fetcher/worker.rb', line 39

# Fetches +src+ and returns the response body as a string.
#
# Returns a copy (dup) of the body if the response code is '200';
# otherwise an empty string.
# check: better return nil on error - why? why not??
def read( src )
  logger.debug "fetch - copy src: #{src} into string"

  response = get_response( src )

  if response.code == '200'
    response.body.dup   # hand out a copy, not the response's own string
  else
    ''
  end
end

#use_cache=(true_or_false) ⇒ Object

true|false



27
# File 'lib/fetcher/worker.rb', line 27

# Switches the use of the cache (conditional GETs) on or off.
#
# true_or_false - stored as-is in @use_cache
def use_cache=( true_or_false )
  @use_cache = true_or_false
end

#use_cache?Boolean

Returns:

  • (Boolean)


28
# File 'lib/fetcher/worker.rb', line 28

# Tells if the cache is in use; returns the value stored in @use_cache.
def use_cache?
  @use_cache
end