Class: UrlProcessor::Base
- Inherits:
-
Object
- Object
- UrlProcessor::Base
- Defined in:
- lib/url_processor/base.rb
Instance Attribute Summary collapse
-
#config ⇒ Object
readonly
Returns the value of attribute config.
Instance Method Summary collapse
- #find_in_batches(collection, batch_size) ⇒ Object
-
#initialize(c) ⇒ Base
constructor
A new instance of Base.
- #new_broken_link(params = {}) ⇒ Object
- #new_link_request(url, params = {}) ⇒ Object
- #pre_process_link(link) ⇒ Object
- #process_response(response) ⇒ Object
- #report_broken_link(link_id, params = {}) ⇒ Object
- #run ⇒ Object
Constructor Details
#initialize(c) ⇒ Base
Returns a new instance of Base.
23 24 25 26 27 28 29 |
# File 'lib/url_processor/base.rb', line 23

# Builds a processor around a validated configuration object.
#
# @param c [UrlProcessor::Config] configuration driving this run
# @raise [ArgumentError] when +c+ is not a UrlProcessor::Config
def initialize(c)
  unless c.is_a?(UrlProcessor::Config)
    raise ArgumentError
  end
  @config = c
  # connect to the db (left disabled, as in the original source)
  # OnlinesearchesModels::connect
end
Instance Attribute Details
#config ⇒ Object (readonly)
Returns the value of attribute config.
21 22 23 |
# File 'lib/url_processor/base.rb', line 21

# Read-only accessor for the configuration supplied at construction time.
#
# @return [Object] the UrlProcessor::Config given to #initialize
def config
  @config
end
Instance Method Details
#find_in_batches(collection, batch_size) ⇒ Object
66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
# File 'lib/url_processor/base.rb', line 66

# Yields +collection+ to the block in groups of at most +batch_size+.
#
# ActiveRecord-style collections (anything responding to #find_in_batches)
# delegate to their own batching; plain enumerables are chunked manually.
#
# @param collection [#find_in_batches, #each] records to iterate
# @param batch_size [Integer] maximum number of elements per yielded group
# @return [void]
def find_in_batches(collection, batch_size)
  if collection.respond_to? :find_in_batches
    collection.find_in_batches(batch_size: batch_size) do |group|
      # Output progress information
      # NOTE(review): processed_links is not defined in this scope (it is a
      # local variable of #run), so this branch raises NameError unless a
      # processed_links method exists elsewhere -- TODO confirm.
      config.logger.info "PROCESSED: #{processed_links}, NEXT GROUP SIZE: #{group.size}".yellow
      yield group
      # for debugging purposes we do not want to process everything
      if config.debug && processed_links >= config.batch_size
        # FIX: message previously interpolated @batch_size, an unset instance
        # variable that always rendered blank; use config.batch_size.
        config.logger.debug "FINISHED first batch (#{config.batch_size} records), exiting".yellow
        return
      end
    end
  else
    elements = []
    collection.each do |element|
      elements << element
      if elements.size % batch_size == 0
        yield elements
        # FIX: was `elements = elements.clear` -- Array#clear mutates and
        # returns self, so the array just handed to the block was emptied
        # out from under the caller. Allocate a fresh array instead.
        elements = []
      end
    end
    # done iterating, yield whatever else we have left, if we have stuff left
    yield elements unless elements.empty?
  end
end
#new_broken_link(params = {}) ⇒ Object
31 32 33 |
# File 'lib/url_processor/base.rb', line 31

# Abstract hook: subclasses must build a persistable broken-link record.
#
# @param params [Hash] attributes for the broken-link record
# @raise [NotImplementedError] always, until overridden by a subclass
def new_broken_link(params = {})
  raise NotImplementedError, "new_broken_link not implemented"
end
#new_link_request(url, params = {}) ⇒ Object
62 63 64 |
# File 'lib/url_processor/base.rb', line 62

# Abstract hook: subclasses must build a request object for +url+.
#
# @param url [String] the url to request
# @param params [Hash] extra request options
# @raise [NotImplementedError] always, until overridden by a subclass
def new_link_request(url, params = {})
  # FIX: message previously said "link_request", not the method's actual
  # name, which made the failure harder to trace back to this hook.
  raise NotImplementedError, "new_link_request is not implemented"
end
#pre_process_link(link) ⇒ Object
54 55 56 |
# File 'lib/url_processor/base.rb', line 54

# Optional hook run on each link before its urls are requested.
# Intentionally a no-op; subclasses override to inspect or mutate the link.
#
# @param link [Object] the record about to be processed
# @return [nil]
def pre_process_link(link)
  # do nothing by default
  nil
end
#process_response(response) ⇒ Object
58 59 60 |
# File 'lib/url_processor/base.rb', line 58

# Abstract hook: subclasses must handle a completed request's response.
#
# @param response [Object] the completed response to examine
# @raise [NotImplementedError] always, until overridden by a subclass
def process_response(response)
  # FIX: corrected typo "process_reponse" in the error message.
  raise NotImplementedError, "process_response is not implemented"
end
#report_broken_link(link_id, params = {}) ⇒ Object
35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
# File 'lib/url_processor/base.rb', line 35

# Looks up the link for +link_id+ and persists a broken-link record for it.
# A missing record is logged as a warning rather than raised.
#
# @param link_id [Object] id handed to config.get_link_by_id
# @param params [Hash] :link_data and :response_code to store on the record
# @return [void]
def report_broken_link(link_id, params = {})
  link = config.get_link_by_id.call(link_id)
  broken_link = new_broken_link(
    link_id: link.id,
    fips_code: link.fips_code,
    link_data: params[:link_data],
    response_code: params[:response_code],
    reported_by: 'QC Report'
  )
  broken_link.save
  config.logger.debug "broken link created (#{broken_link.id}): #{broken_link.serializable_hash}".red
rescue ActiveRecord::RecordNotFound => e
  config.logger.warn "#{e}".red
end
#run ⇒ Object
97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 |
# File 'lib/url_processor/base.rb', line 97

# Entry point: streams the configured links in batches through a Typhoeus
# hydra, retrying timed-out requests and falling back from HEAD to GET when
# a server returns an empty response.
#
# @return [void]
def run
  processed_links = 0

  # use an in-memory cache of responses (per run)
  cache = Cache.new
  Typhoeus::Config.cache = cache

  hydra = Typhoeus::Hydra.new(
    max_concurrency: config.max_concurrency,
    max_total_connections: config.max_total_connections
  )

  find_in_batches(config.links.call, config.batch_size) do |group|
    group.each do |link|
      # any custom pre-processing
      pre_process_link(link)

      if link.urls.empty?
        # In the event that we have a link that actually has no urls associated with it
        report_broken_link link.id, :response_code => :has_no_urls if config.report_records_without_urls
      else
        # Each record has 2 urls associated with it, process each separately
        link.urls.each do |url|
          config.logger.debug "link: #{link.serializable_hash}, url: #{url}".yellow

          link_request = config.new_link_request.call(
            url[:url],
            followlocation: true,
            method: :head,
            ssl_verifypeer: false,
            ssl_verifyhost: 2,
            # NOTE(review): the extracted source read "config." with the
            # attribute name missing; "cookies" is reconstructed here --
            # confirm against the real lib/url_processor/base.rb.
            cookiefile: config.cookies,
            cookiejar: config.cookies,
            link_id: link.id,
            link_data: url[:link_data],
            timeout: config.max_timeout,
            connecttimeout: config.max_timeout,
            max_retries: config.max_retries,
            forbid_reuse: 1,
            nosignal: 1
          )

          link_request.on_complete do |response|
            processed_links += 1

            if ([:operation_timedout, :couldnt_resolve_host].include? response.return_code) && response.request.retry_request?
              config.logger.info "#{response.return_code} - #{response.effective_url} timed out, retrying".yellow
              hydra.queue response.request
            # NOTE(review): "response.request.[:method]" in the extract is
            # reconstructed as Typhoeus's request.options[:method] -- confirm.
            elsif response.return_code == :got_nothing && response.request.options[:method] != :get
              config.logger.info "#{response.return_code} - #{response.effective_url} empty response, attempting GET request instead".yellow
              # set to GET request since HEAD may fail in some cases
              response.request.options[:method] = :get
              hydra.queue response.request
            else
              config.process_response.call response
            end
          end

          hydra.queue link_request
        end
      end
    end

    hydra.run
  end

  cache.empty!
end