Class: ConnectorsSdk::Base::Extractor

Inherits:
Object
  • Object
show all
Defined in:
lib/connectors_sdk/base/extractor.rb

Constant Summary collapse

# Number of failed attempts after which with_auth_tokens_and_retry stops
# retrying and re-raises the last error.
MAX_CONNECTION_ATTEMPTS =
3
# NOTE(review): not referenced by any method shown on this page; presumably
# the key used when a single cursor covers all content — confirm against callers.
DEFAULT_CURSOR_KEY =
'all'.freeze
# Exception classes that transient_error? checks an error against (using
# kind_of?, so subclasses match as well).
TRANSIENT_SERVER_ERROR_CLASSES =
Set.new(
  [
    Faraday::ConnectionFailed,
    Faraday::SSLError,
    Faraday::TimeoutError,
    HTTPClient::ConnectTimeoutError,
    Net::OpenTimeout
  ]
)

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(content_source_id:, service_type:, config:, features:, client_proc:, authorization_data_proc:, monitor: ConnectorsShared::Monitor.new(:connector => self)) ⇒ Extractor

Returns a new instance of Extractor.



37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# File 'lib/connectors_sdk/base/extractor.rb', line 37

# Builds an extractor and stores its collaborators.
#
# Every argument is kept in the instance variable of the same name. In
# addition the extractor starts out as not completed, and a deep copy of the
# cursors currently held by +config+ is recorded so that
# cursors_modified_since_start? can later detect changes.
#
# @param content_source_id [Object] stored as-is
# @param service_type [Object] stored as-is
# @param config [#cursors] configuration object; +cursors+ must respond to +deep_dup+
# @param features [Object] stored as-is
# @param client_proc [#call] invoked lazily by #client
# @param authorization_data_proc [#call] invoked lazily by #authorization_data
# @param monitor [Object] defaults to a new ConnectorsShared::Monitor for this extractor
def initialize(content_source_id:,
               service_type:,
               config:,
               features:,
               client_proc:,
               authorization_data_proc:,
               monitor: ConnectorsShared::Monitor.new(:connector => self))
  # Progress flag.
  @completed = false

  # Plain descriptive attributes.
  @content_source_id = content_source_id
  @service_type = service_type
  @features = features
  @monitor = monitor

  # Lazily evaluated factories.
  @client_proc = client_proc
  @authorization_data_proc = authorization_data_proc

  # Configuration plus an independent snapshot of its cursors.
  @config = config
  @original_cursors = @config.cursors.deep_dup
end

Instance Attribute Details

#client_proc ⇒ Object

Returns the value of attribute client_proc.



35
36
37
# File 'lib/connectors_sdk/base/extractor.rb', line 35

# Reader for the +client_proc+ passed to the constructor.
#
# @return [Object] the stored callable; #client invokes it with +call+
def client_proc
  @client_proc
end

#completed ⇒ Object (readonly)

Returns the value of attribute completed.



34
35
36
# File 'lib/connectors_sdk/base/extractor.rb', line 34

# Reader for the completion flag.
#
# The constructor initialises the flag to +false+.
# NOTE(review): nothing on this page sets it to true — presumably subclasses
# do; confirm.
#
# @return [Object] current value of +@completed+
def completed
  @completed
end

#config ⇒ Object (readonly)

Returns the value of attribute config.



34
35
36
# File 'lib/connectors_sdk/base/extractor.rb', line 34

# Reader for the +config+ object passed to the constructor.
#
# Methods of this class read +cursors+ from it.
#
# @return [Object] the stored configuration
def config
  @config
end

#content_source_id ⇒ Object (readonly)

Returns the value of attribute content_source_id.



34
35
36
# File 'lib/connectors_sdk/base/extractor.rb', line 34

# Reader for the +content_source_id+ passed to the constructor.
#
# @return [Object] the stored content source id
def content_source_id
  @content_source_id
end

#features ⇒ Object (readonly)

Returns the value of attribute features.



34
35
36
# File 'lib/connectors_sdk/base/extractor.rb', line 34

# Reader for the +features+ value passed to the constructor.
#
# @return [Object] the stored features
def features
  @features
end

#monitor ⇒ Object

Returns the value of attribute monitor.



35
36
37
# File 'lib/connectors_sdk/base/extractor.rb', line 35

# Reader for the monitor given to the constructor (or the default
# ConnectorsShared::Monitor created there).
#
# yield_single_document_change reports successes and errors to it.
#
# @return [Object] the stored monitor
def monitor
  @monitor
end

#original_cursors ⇒ Object (readonly)

Returns the value of attribute original_cursors.



34
35
36
# File 'lib/connectors_sdk/base/extractor.rb', line 34

# Reader for the cursor snapshot taken by the constructor.
#
# The snapshot is +config.cursors.deep_dup+ as of construction time and is
# what cursors_modified_since_start? compares the current cursors against.
#
# @return [Object] the stored snapshot
def original_cursors
  @original_cursors
end

#service_type ⇒ Object (readonly)

Returns the value of attribute service_type.



34
35
36
# File 'lib/connectors_sdk/base/extractor.rb', line 34

# Reader for the +service_type+ passed to the constructor.
#
# @return [Object] the stored service type
def service_type
  @service_type
end

Instance Method Details

#authorization_data ⇒ Object



60
61
62
# File 'lib/connectors_sdk/base/extractor.rb', line 60

# Lazily obtains the authorization data and caches it.
#
# The first call invokes the authorization proc given to the constructor and
# remembers its result in +@authorization_data+; later calls return the cached
# value. A nil or false result is not cached, so the proc is invoked again on
# the next call.
#
# @return [Object] the value produced by the authorization proc
def authorization_data
  return @authorization_data if @authorization_data

  @authorization_data = @authorization_data_proc.call
end

#authorization_data! ⇒ Object



55
56
57
58
# File 'lib/connectors_sdk/base/extractor.rb', line 55

# Discards the cached authorization data and fetches it again.
#
# Resets +@authorization_data+ to nil so that the following call to
# authorization_data has to invoke the authorization proc once more.
#
# @return [Object] the freshly fetched authorization data
def authorization_data!
  @authorization_data = nil # drop the memoized value
  authorization_data
end

#client ⇒ Object



69
70
71
# File 'lib/connectors_sdk/base/extractor.rb', line 69

# Lazily builds the client and caches it.
#
# The first call invokes +client_proc+ and remembers the result in +@client+;
# later calls return the cached object. A nil or false result is not cached,
# so +client_proc+ is invoked again on the next call.
#
# @return [Object] the value produced by +client_proc+
def client
  return @client if @client

  @client = client_proc.call
end

#client! ⇒ Object



64
65
66
67
# File 'lib/connectors_sdk/base/extractor.rb', line 64

# Discards the cached client and builds a new one.
#
# Resets +@client+ to nil so that the following call to client has to invoke
# +client_proc+ again.
#
# @return [Object] the newly built client
def client!
  @client = nil # drop the memoized client
  client
end

#convert_transient_server_errors ⇒ Object



215
216
217
218
219
220
221
222
223
224
225
# File 'lib/connectors_sdk/base/extractor.rb', line 215

# Runs the given block, translating transient failures into
# ConnectorsShared::TransientServerError.
#
# Errors for which transient_error? is false propagate unchanged. For the
# others a TransientServerError is raised that carries the original class and
# message in its text, a +:suspend_until+ time computed from the
# 'transient_server_error_retry_delay_minutes' setting, and the current
# +config.cursors+.
#
# @yield the work to protect
# @return [Object] the block's value when no error is raised
# @raise [ConnectorsShared::TransientServerError] for transient errors
# @raise [StandardError] any non-transient error raised by the block
def convert_transient_server_errors
  yield
rescue StandardError => e
  # Not one of the known transient classes: let it through untouched.
  raise unless transient_error?(e)

  description = "Transient error #{e.class}: #{e.message}"
  retry_delay = Connectors.config.fetch('transient_server_error_retry_delay_minutes').minutes
  raise ConnectorsShared::TransientServerError.new(
    description,
    :suspend_until => retry_delay.from_now,
    :cursors => config.cursors
  )
end

#cursors_modified_since_start? ⇒ Boolean

Returns:

  • (Boolean)


235
236
237
# File 'lib/connectors_sdk/base/extractor.rb', line 235

# Tells whether the cursors differ from the snapshot taken at construction.
#
# Compares the current +config.cursors+ with original_cursors (the deep copy
# recorded by the constructor) using +!=+.
#
# @return [Boolean] true when the two are not equal
def cursors_modified_since_start?
  config.cursors != original_cursors
end

#deleted_ids(ids, &block) ⇒ Object



166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# File 'lib/connectors_sdk/base/extractor.rb', line 166

# Enumerates the ids that yield_deleted_ids reports for +ids+.
#
# An Enumerator wrapping yield_deleted_ids is built inside the stats and
# with_auth_tokens_and_retry wrappers. When a block is given the enumerator is
# consumed immediately (inside those wrappers) and each id is passed to the
# block; without a block nothing is enumerated until the caller iterates the
# returned enumerator.
#
# A progress line is logged after every 100 ids. The counter lives inside the
# enumerator body so that each enumeration starts counting from zero; keeping
# it outside made a second pass over the returned enumerator continue from the
# previous total and log inflated numbers.
#
# @param ids [Object] passed through to yield_deleted_ids
# @yield [id] each id produced by yield_deleted_ids
# @return [Enumerator] the enumerator over the produced ids
def deleted_ids(ids, &block)
  enum = nil
  Connectors::Stats.measure("extractor.#{Connectors::Stats.class_key(self.class)}.deleted_ids") do
    with_auth_tokens_and_retry do
      Connectors::Stats.measure("extractor.#{Connectors::Stats.class_key(self.class)}.yield_deleted_ids") do
        enum = Enumerator.new do |yielder|
          # Per-enumeration progress counter (reset on every pass and retry).
          counter = 0
          yield_deleted_ids(ids) do |id|
            yielder.yield id
            counter += 1
            log_info("Deleted #{counter} documents so far") if counter % 100 == 0
          end
        end
        enum.each(&block) if block_given?
      end
    end
  end
  enum
end

#document_changes(modified_since: nil, &block) ⇒ Object



122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# File 'lib/connectors_sdk/base/extractor.rb', line 122

# Enumerates the document changes that yield_document_changes reports.
#
# An Enumerator wrapping yield_document_changes is built inside the stats and
# with_auth_tokens_and_retry wrappers. When a block is given the enumerator is
# consumed immediately (inside those wrappers); without a block nothing is
# enumerated until the caller iterates the returned enumerator.
#
# A progress line is logged after every 100 changes. The counter lives inside
# the enumerator body so that each enumeration starts counting from zero;
# keeping it outside made a second pass over the returned enumerator continue
# from the previous total and log inflated numbers.
#
# @param modified_since [Object, nil] forwarded to yield_document_changes
# @yield [action, change, subextractors] each change produced
# @return [Enumerator] the enumerator over the produced changes
def document_changes(modified_since: nil, &block)
  enum = nil
  Connectors::Stats.measure("extractor.#{Connectors::Stats.class_key(self.class)}.documents") do
    with_auth_tokens_and_retry do
      Connectors::Stats.measure("extractor.#{Connectors::Stats.class_key(self.class)}.yield_documents") do
        enum = Enumerator.new do |yielder|
          # Per-enumeration progress counter (reset on every pass and retry).
          counter = 0
          yield_document_changes(:modified_since => modified_since) do |action, change, subextractors|
            yielder.yield action, change, subextractors
            counter += 1
            log_info("Extracted #{counter} documents so far") if counter % 100 == 0
          end
        end
        enum.each(&block) if block_given?
      end
    end
  end
  enum
end

#download_args_and_proc(id:, name:, size:, download_args:, &block) ⇒ Object



239
240
241
# File 'lib/connectors_sdk/base/extractor.rb', line 239

# Packs the download description and its block into one positional array.
#
# @param id [Object] first element of the result
# @param name [Object] second element of the result
# @param size [Object] third element of the result
# @param download_args [Object] fourth element of the result
# @param block [Proc, nil] fifth element of the result; nil when no block is given
# @return [Array] +[id, name, size, download_args, block]+
def download_args_and_proc(id:, name:, size:, download_args:, &block)
  descriptor = [id, name, size, download_args]
  descriptor << block
end

#evictable? ⇒ Boolean

Returns:

  • (Boolean)


231
232
233
# File 'lib/connectors_sdk/base/extractor.rb', line 231

# Always +false+ in this base class.
#
# NOTE(review): presumably overridden by extractors that can be evicted; what
# eviction entails is not shown on this page — confirm.
#
# @return [Boolean] false
def evictable?
  false
end

#identifying_error_message(identifier) ⇒ Object



158
159
160
# File 'lib/connectors_sdk/base/extractor.rb', line 158

# Builds the optional " of '<identifier>'" suffix used in extraction log
# messages.
#
# @param identifier [Object, nil] value identifying what was being extracted
# @return [String] +" of '<identifier>'"+ when +identifier.present?+ is true,
#   otherwise an empty string
def identifying_error_message(identifier)
  return '' unless identifier.present?

  " of '#{identifier}'"
end

#permissions(source_user_id, &block) ⇒ Object



190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
# File 'lib/connectors_sdk/base/extractor.rb', line 190

# Fetches the permissions that yield_permissions reports for a source user.
#
# The call is wrapped in the stats and with_auth_tokens_and_retry helpers.
# Every collection yielded by yield_permissions is logged, remembered as the
# latest result and, when a block was given, handed to that block. If nothing
# is yielded the result stays an empty array.
#
# @param source_user_id [Object] forwarded to yield_permissions
# @yield [permissions] each collection produced by yield_permissions
# @return [Enumerator] +each+ enumerator over the last yielded collection
def permissions(source_user_id, &block)
  latest = []
  Connectors::Stats.measure("extractor.#{Connectors::Stats.class_key(self.class)}.permissions") do
    with_auth_tokens_and_retry do
      Connectors::Stats.measure("extractor.#{Connectors::Stats.class_key(self.class)}.yield_permissions") do
        yield_permissions(source_user_id) do |extracted|
          log_info("Extracted #{extracted.size} permissions for source user #{source_user_id}")
          latest = extracted
          block.call(extracted) unless block.nil?
        end
      end
    end
  end
  latest.each
end

#retrieve_latest_cursors ⇒ Object



73
74
75
# File 'lib/connectors_sdk/base/extractor.rb', line 73

# Always +nil+ in this base class.
#
# NOTE(review): presumably a hook for subclasses that can look up newer
# cursors; no caller is shown on this page — confirm.
#
# @return [nil]
def retrieve_latest_cursors
  nil
end

#transient_error?(error) ⇒ Boolean

Returns:

  • (Boolean)


227
228
229
# File 'lib/connectors_sdk/base/extractor.rb', line 227

# Tells whether +error+ belongs to one of the classes listed in
# TRANSIENT_SERVER_ERROR_CLASSES (subclasses included, via +kind_of?+).
#
# @param error [Object] the error to classify
# @return [Boolean] true on the first matching class, false when none matches
def transient_error?(error)
  TRANSIENT_SERVER_ERROR_CLASSES.each do |transient_class|
    return true if error.kind_of?(transient_class)
  end
  false
end

#with_auth_tokens_and_retry(&block) ⇒ Object



77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# File 'lib/connectors_sdk/base/extractor.rb', line 77

# Runs the block through convert_rate_limit_errors and
# convert_transient_server_errors, retrying on unexpected errors.
#
# Error handling, in the order the rescue clauses are tried:
# * TokenRefreshFailedError    - logged, re-raised, never retried.
# * PublishingFailedError      - logged, then its +reason+ is raised instead.
# * EvictionWithNoProgressError - logged, re-raised.
# * Eviction, throttling, job document limit, monitoring, job interrupted,
#   invalid secret and invalid indexing configuration errors - re-raised
#   silently, never retried.
# * any other StandardError    - augmented for exception tracking and retried
#   until MAX_CONNECTION_ATTEMPTS failures have happened; the last failure is
#   logged via ExceptionTracking and re-raised.
#
# @yield the work to run
# @return [Object] the block's value
def with_auth_tokens_and_retry(&block)
  attempts_made = 0

  # An explicit begin block is required: +retry+ must restart only this
  # section, not the counter initialisation above.
  begin
    convert_transient_server_errors do
      convert_rate_limit_errors(&block)
    end
  rescue ConnectorsShared::TokenRefreshFailedError => e
    log_error('Could not refresh token, aborting')
    raise e
  rescue ConnectorsShared::PublishingFailedError => e
    log_error('Could not publish, aborting')
    raise e.reason
  rescue ConnectorsShared::EvictionWithNoProgressError
    log_error('Aborting job because it did not make any progress and cannot be evicted')
    raise
  rescue ConnectorsShared::EvictionError,
         ConnectorsShared::ThrottlingError,
         ConnectorsShared::JobDocumentLimitError,
         ConnectorsShared::MonitoringError,
         ConnectorsShared::JobInterruptedError,
         ConnectorsShared::SecretInvalidError,
         ConnectorsShared::InvalidIndexingConfigurationError
    # These are never retried; they bubble out untouched.
    raise
  rescue StandardError => e
    ConnectorsShared::ExceptionTracking.augment_exception(e)
    attempts_made += 1

    unless attempts_made >= MAX_CONNECTION_ATTEMPTS
      log_warn("Failed to connect in with_auth_tokens_and_retry. Reason: #{e.class}: #{e.message} {:message_id => #{e.id}}")
      log_warn("Retries: #{attempts_made}/#{MAX_CONNECTION_ATTEMPTS}, trying again.")
      retry
    end

    # Out of attempts: record the failure and surface it to the caller.
    log_warn("Failed to connect in with_auth_tokens_and_retry Reason: #{e.class}: #{e.message} {:message_id => #{e.id}}")
    log_warn("Retries: #{attempts_made}/#{MAX_CONNECTION_ATTEMPTS}, giving up.")
    ConnectorsShared::ExceptionTracking.log_exception(e)
    raise e
  end
end

#yield_deleted_ids(_ids) ⇒ Object

Raises:

  • (NotImplementedError)


162
163
164
# File 'lib/connectors_sdk/base/extractor.rb', line 162

# Hook that deleted_ids calls with the ids to check and a block that receives
# one id per invocation. This base implementation is a placeholder that always
# raises.
#
# @param _ids [Object] ids passed in by deleted_ids (unused here)
# @raise [NotImplementedError] always
def yield_deleted_ids(_ids)
  raise NotImplementedError
end

#yield_document_changes(modified_since: nil) ⇒ Object

Raises:

  • (NotImplementedError)


118
119
120
# File 'lib/connectors_sdk/base/extractor.rb', line 118

# Hook that document_changes calls with +modified_since+ and a block that
# receives +(action, change, subextractors)+ per invocation. This base
# implementation is a placeholder that always raises.
#
# @param modified_since [Object, nil] value forwarded by document_changes (unused here)
# @raise [NotImplementedError] always
def yield_document_changes(modified_since: nil)
  raise NotImplementedError
end

#yield_permissions(source_user_id) ⇒ Object



186
187
188
# File 'lib/connectors_sdk/base/extractor.rb', line 186

# Hook that permissions calls with the source user id and a block that
# receives a collection of permissions. This base implementation never yields
# and returns nil, so permissions ends up with an empty result.
#
# @param source_user_id [Object] user id passed in by permissions (unused here)
# @return [nil]
def yield_permissions(source_user_id)
  # no-op for content source without DLP support
end

#yield_single_document_change(identifier: nil, &block) ⇒ Object



142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# File 'lib/connectors_sdk/base/extractor.rb', line 142

# Runs the extraction block for one document and records the outcome on the
# monitor.
#
# The block is executed through convert_rate_limit_errors and
# convert_transient_server_errors. On success +monitor.note_success+ is
# called. Errors matching +fatal_exception_classes+ are logged at error level
# and re-raised; any other StandardError is logged at warn level, reported via
# +monitor.note_error+ and NOT re-raised, so the method returns normally.
#
# NOTE(review): +fatal_exception_classes+ and +convert_rate_limit_errors+ are
# not shown on this page; their contents are assumed from usage — confirm.
#
# @param identifier [Object, nil] optional label used only in log messages
# @yield the extraction work for a single document
# @return [Object] the result of +monitor.note_success+ or +monitor.note_error+
def yield_single_document_change(identifier: nil, &block)
  log_debug("Extracting single document for #{identifier}") if identifier
  convert_transient_server_errors do
    convert_rate_limit_errors(&block)
  end
  monitor.note_success
# The fatal clause must stay ahead of the StandardError clause so that fatal
# errors are re-raised instead of being recorded and swallowed.
rescue *fatal_exception_classes => e
  # augment_exception is called before +e.id+ is read for the log message.
  ConnectorsShared::ExceptionTracking.augment_exception(e)
  log_error("Encountered a fall-through error during extraction#{identifying_error_message(identifier)}: #{e.class}: #{e.message} {:message_id => #{e.id}}")
  raise
rescue StandardError => e
  ConnectorsShared::ExceptionTracking.augment_exception(e)
  log_warn("Encountered error during extraction#{identifying_error_message(identifier)}: #{e.class}: #{e.message} {:message_id => #{e.id}}")
  # Recorded on the monitor rather than propagated.
  monitor.note_error(e, :id => e.id)
end