Class: UrlProcessor::Base

Inherits:
Object
  • Object
show all
Defined in:
lib/url_processor/base.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(c) ⇒ Base

Returns a new instance of Base.

Raises:

  • (ArgumentError)


5
6
7
8
9
10
11
# File 'lib/url_processor/base.rb', line 5

# Builds a processor bound to the given configuration.
#
# @param c [UrlProcessor::Config] configuration object; stored in @config
# @raise [ArgumentError] if +c+ is not a UrlProcessor::Config
def initialize(c)
  unless c.is_a?(UrlProcessor::Config)
    # Name both the expected and the actual class so a mis-wired caller can
    # be diagnosed from the message alone.
    raise ArgumentError, "expected a UrlProcessor::Config, got #{c.class}"
  end

  @config = c

  # connect to the db (call is commented out in the source)
  #OnlinesearchesModels::connect
end

Instance Attribute Details

#config ⇒ Object (readonly)

Returns the value of attribute config.



3
4
5
# File 'lib/url_processor/base.rb', line 3

# Reader for the configuration stored by #initialize.
#
# @return [UrlProcessor::Config] the object held in @config
def config
  @config
end

Instance Method Details

#find_in_batches(collection, batch_size) ⇒ Object



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# File 'lib/url_processor/base.rb', line 48

# Yields every element of +collection+ to the given block, one at a time.
#
# Arrays are walked directly and +batch_size+ is ignored. Any other collection
# must respond to +find_in_batches(batch_size:)+ and yield groups of elements
# (presumably an ActiveRecord relation -- TODO confirm); each group is logged
# and then unpacked element by element.
#
# When +config.debug+ is set, iteration stops at the first group boundary
# reached after at least +config.batch_size+ elements have been yielded.
#
# @param collection [Array, #find_in_batches] source of elements
# @param batch_size [Integer] group size forwarded to +collection.find_in_batches+
# @yield [element] each element, in order
# @return [Object] whatever the underlying iteration returns; nil on the
#   debug early exit
def find_in_batches(collection, batch_size)
  if collection.is_a? Array
    collection.each do |element|
      yield element
    end
  else
    # Progress is counted locally. The previous code read a `processed_links`
    # name that is neither a local of this method nor a method shown on this
    # class, so the first group raised NameError.
    yielded = 0

    collection.find_in_batches(batch_size: batch_size) do |group|
      # Output progress information
      config.logger.info "PROCESSED: #{yielded}, NEXT GROUP SIZE: #{group.size}".yellow

      # for debugging purposes we do not want to process everything
      if config.debug && yielded >= config.batch_size
        # Report the configured limit; @batch_size was never assigned.
        config.logger.debug "FINISHED first batch (#{config.batch_size} records), exiting".yellow
        return
      end

      group.each do |element|
        yield element
        yielded += 1
      end
    end
  end
end


13
14
15
# File 'lib/url_processor/base.rb', line 13

# Hook for building a broken-link record; this base implementation only raises.
#
# @param params [Hash] attributes for the record (unused here)
# @raise [NotImplementedError] always
def new_broken_link(params={})
  raise NotImplementedError, "new_broken_link not implemented"
end


44
45
46
# File 'lib/url_processor/base.rb', line 44

# Hook for building the request object for a URL; this base implementation
# only raises.
#
# @param url [Object] the URL to request (unused here)
# @param params [Hash] request options (unused here)
# @raise [NotImplementedError] always
def new_link_request(url, params={})
  # The message now names this method; it previously said "link_request".
  raise NotImplementedError, "new_link_request is not implemented"
end


36
37
38
# File 'lib/url_processor/base.rb', line 36

# No-op hook that #run calls with each link before looking at its URLs.
#
# @param link [Object] the link about to be processed (ignored here)
# @return [nil]
def pre_process_link(link)
  nil
end

#process_response(response) ⇒ Object



40
41
42
# File 'lib/url_processor/base.rb', line 40

# Hook for handling a completed response; this base implementation only raises.
#
# @param response [Object] the completed response (unused here)
# @raise [NotImplementedError] always
def process_response(response)
  # Message typo fixed: it previously read "process_reponse".
  raise NotImplementedError, "process_response is not implemented"
end


17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/url_processor/base.rb', line 17

# Looks up the link with +link_id+, builds a broken-link record for it via
# #new_broken_link, saves it and logs the result at debug level.
#
# If the lookup raises ActiveRecord::RecordNotFound, the error is logged as a
# warning and nothing is created.
#
# NOTE(review): the return value of +save+ is not inspected, so the
# "created" line is logged regardless of what +save+ returns.
#
# @param link_id [Object] id handed to +config.get_link_by_id+
# @param params [Hash] reads +:url_type_code+ and +:response_code+
# @return [Object] whatever the logger call returns
def report_broken_link(link_id, params={})
  type_code = params[:url_type_code]
  status = params[:response_code]

  link = config.get_link_by_id.call(link_id)
  record = new_broken_link(
    link_id: link.id,
    fips_code: link.fips_code,
    url_type_code: type_code,
    response_code: status,
    reported_by: 'QC Report'
  )
  record.save

  config.logger.debug "broken link created (#{record.id}): #{record.serializable_hash}".red
rescue ActiveRecord::RecordNotFound => e
  config.logger.warn "#{e}".red
end

#run ⇒ Object



71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# File 'lib/url_processor/base.rb', line 71

# Walks every link returned by +config.links+ and issues a HEAD request for
# each of its URLs through a Typhoeus hydra.
#
# For each link:
# * #pre_process_link is called first;
# * a link with no URLs is passed to #report_broken_link (only when
#   +config.report_records_without_urls+ is set);
# * otherwise one request per URL is built with +config.new_link_request+
#   and queued;
# * the hydra is then run, so queued requests are executed link by link.
#
# Completed responses are re-queued when the return code is
# +:operation_timedout+ / +:couldnt_resolve_host+ and the request reports it
# may be retried, re-queued as GET when a non-GET request returned
# +:got_nothing+, and handed to +config.process_response+ otherwise.
#
# @return [Object] whatever #find_in_batches returns
def run
  completed_responses = 0
  hydra = Typhoeus::Hydra.new(
    max_concurrency: config.max_concurrency,
    max_total_connections: config.max_total_connections
  )

  find_in_batches(config.links.call, config.batch_size) do |link|
    # any custom pre-processing
    pre_process_link(link)

    if link.urls.empty?
      # A link with no URLs cannot be requested; optionally flag it instead.
      if config.report_records_without_urls
        report_broken_link(link.id, :response_code => :has_no_urls)
      end
    else
      # Original note: each record has 2 urls associated with it; every URL
      # gets its own request.
      link.urls.each do |url|
        config.logger.debug "link: #{link.serializable_hash}, url: #{url}".yellow

        request = config.new_link_request.call(
          url[:url],
          followlocation: true,
          method: :head,
          ssl_verifypeer: false,
          ssl_verifyhost: 2,
          cookiefile: config.cookies_file,
          cookiejar: config.cookies_file,
          link_id: link.id,
          url_type_code: url[:url_type_code],
          timeout: config.max_timeout,
          connecttimeout: config.max_timeout,
          max_retries: config.max_retries,
          forbid_reuse: 1,
          nosignal: 1
        )

        request.on_complete do |response|
          # Counts every completion, including those that get re-queued below.
          completed_responses += 1

          transient = [:operation_timedout, :couldnt_resolve_host].include?(response.return_code)

          # retry_request? is consulted only for transient failures.
          if transient && response.request.retry_request?
            config.logger.info "#{response.return_code} - #{response.effective_url} timed out, retrying".yellow
            hydra.queue response.request
          elsif response.return_code == :got_nothing && response.request.options[:method] != :get
            config.logger.info "#{response.return_code} - #{response.effective_url} empty response, attempting GET request instead".yellow

            # set to GET request since HEAD may fail in some cases
            response.request.options[:method] = :get
            hydra.queue response.request
          else
            config.process_response.call response
          end
        end

        hydra.queue request
      end
    end

    # Run whatever was queued for this link before moving on to the next one.
    hydra.run
  end
end