Class: ConnectorsSdk::Base::Extractor

Inherits:
Object
  • Object
show all
Defined in:
lib/connectors_sdk/base/extractor.rb

Direct Known Subclasses

Office365::Extractor

Constant Summary collapse

# Number of attempts #with_auth_tokens_and_retry makes before it stops
# retrying and re-raises the last error.
MAX_CONNECTION_ATTEMPTS =
3
# NOTE(review): not referenced anywhere in the code shown on this page;
# presumably the key used when a single cursor covers everything - confirm.
DEFAULT_CURSOR_KEY =
'all'.freeze
# Exception classes that #transient_error? treats as transient; errors of
# these kinds are re-raised by #convert_transient_server_errors as
# ConnectorsShared::TransientServerError.
TRANSIENT_SERVER_ERROR_CLASSES =
Set.new(
  [
    Faraday::ConnectionFailed,
    Faraday::SSLError,
    Faraday::TimeoutError,
    HTTPClient::ConnectTimeoutError,
    Net::OpenTimeout
  ]
)

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(content_source_id:, service_type:, config:, features:, client_proc:, authorization_data_proc:, monitor: ConnectorsShared::Monitor.new(:connector => self)) ⇒ Extractor

Returns a new instance of Extractor.



36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/connectors_sdk/base/extractor.rb', line 36

# Stores the collaborators an extractor needs and snapshots the cursors it
# started with.
#
# @param content_source_id [Object] identifier of the content source
# @param service_type [Object] service type of the content source
# @param config [#cursors] configuration; its cursors are deep-copied here
# @param features [Object] feature set handed to the extractor
# @param client_proc [#call] builds the client on demand (see #client)
# @param authorization_data_proc [#call] builds the authorization data on
#   demand (see #authorization_data)
# @param monitor [Object] error/success monitor; defaults to a fresh
#   ConnectorsShared::Monitor bound to this extractor
def initialize(content_source_id:,
               service_type:,
               config:,
               features:,
               client_proc:,
               authorization_data_proc:,
               monitor: ConnectorsShared::Monitor.new(:connector => self))
  # What is being extracted.
  @content_source_id = content_source_id
  @service_type = service_type
  @features = features

  # Lazily-invoked factories.
  @client_proc = client_proc
  @authorization_data_proc = authorization_data_proc

  # Run state.
  @monitor = monitor
  @completed = false

  # Keep the live config and a deep copy of its cursors, so later changes to
  # config.cursors can be detected by #cursors_modified_since_start?.
  @config = config
  @original_cursors = config.cursors.deep_dup
end

Instance Attribute Details

#client_proc ⇒ Object

Returns the value of attribute client_proc.



34
35
36
# File 'lib/connectors_sdk/base/extractor.rb', line 34

# Returns the callable given to the constructor as `client_proc:`.
# #client invokes it to build the client.
def client_proc
  @client_proc
end

#completed ⇒ Object (readonly)

Returns the value of attribute completed.



33
34
35
# File 'lib/connectors_sdk/base/extractor.rb', line 33

# Returns the completion flag, which the constructor initialises to false.
# NOTE(review): nothing on this page sets it afterwards - presumably
# subclasses do; confirm.
def completed
  @completed
end

#config ⇒ Object (readonly)

Returns the value of attribute config.



33
34
35
# File 'lib/connectors_sdk/base/extractor.rb', line 33

# Returns the configuration object given to the constructor as `config:`.
# Within this class it is read for its `cursors`.
def config
  @config
end

#content_source_id ⇒ Object (readonly)

Returns the value of attribute content_source_id.



33
34
35
# File 'lib/connectors_sdk/base/extractor.rb', line 33

# Returns the value given to the constructor as `content_source_id:`.
def content_source_id
  @content_source_id
end

#features ⇒ Object (readonly)

Returns the value of attribute features.



33
34
35
# File 'lib/connectors_sdk/base/extractor.rb', line 33

# Returns the value given to the constructor as `features:`.
def features
  @features
end

#monitor ⇒ Object

Returns the value of attribute monitor.



34
35
36
# File 'lib/connectors_sdk/base/extractor.rb', line 34

# Returns the monitor given to the constructor (by default a
# ConnectorsShared::Monitor built for this extractor).
# #yield_single_document_change reports successes and errors to it.
def monitor
  @monitor
end

#original_cursors ⇒ Object (readonly)

Returns the value of attribute original_cursors.



33
34
35
# File 'lib/connectors_sdk/base/extractor.rb', line 33

# Returns the deep copy of `config.cursors` taken by the constructor.
# #cursors_modified_since_start? compares the live cursors against it.
def original_cursors
  @original_cursors
end

#service_type ⇒ Object (readonly)

Returns the value of attribute service_type.



33
34
35
# File 'lib/connectors_sdk/base/extractor.rb', line 33

# Returns the value given to the constructor as `service_type:`.
def service_type
  @service_type
end

Instance Method Details

#authorization_data ⇒ Object



59
60
61
# File 'lib/connectors_sdk/base/extractor.rb', line 59

# Returns the authorization data, invoking the authorization data proc on
# first use and reusing its result afterwards. A nil/false result is not
# cached, so the proc is invoked again on the next call.
#
# @return [Object] whatever the authorization data proc returned
def authorization_data
  return @authorization_data if @authorization_data

  @authorization_data = @authorization_data_proc.call
end

#authorization_data! ⇒ Object



54
55
56
57
# File 'lib/connectors_sdk/base/extractor.rb', line 54

# Discards the memoized authorization data and fetches it again.
#
# @return [Object] the value #authorization_data returns once the cache is cleared
def authorization_data!
  # Clearing the cache makes the memoizing reader call the proc again.
  @authorization_data = nil
  authorization_data
end

#client ⇒ Object



68
69
70
# File 'lib/connectors_sdk/base/extractor.rb', line 68

# Returns the client, building it through #client_proc on first use and
# reusing it afterwards. A nil/false result is not cached.
#
# @return [Object] whatever the client proc returned
def client
  return @client if @client

  @client = client_proc.call
end

#client! ⇒ Object



63
64
65
66
# File 'lib/connectors_sdk/base/extractor.rb', line 63

# Discards the memoized client and builds a new one.
#
# @return [Object] the value #client returns once the cache is cleared
def client!
  # Clearing the cache makes the memoizing reader call the client proc again.
  @client = nil
  client
end

#convert_transient_server_errors ⇒ Object



211
212
213
214
215
216
217
218
219
220
221
# File 'lib/connectors_sdk/base/extractor.rb', line 211

# Runs the given block and translates transient failures into
# ConnectorsShared::TransientServerError.
#
# Errors for which #transient_error? is false are re-raised untouched.
# Transient ones are replaced by a TransientServerError that carries the
# current cursors and a `suspend_until` time derived from the
# 'transient_server_error_retry_delay_minutes' setting.
#
# @yield the work to protect
# @return [Object] the block's result when it does not raise
# @raise [ConnectorsShared::TransientServerError] for transient errors
def convert_transient_server_errors
  yield
rescue StandardError => error
  if transient_error?(error)
    description = "Transient error #{error.class}: #{error.message}"
    resume_at = Connectors.config.fetch('transient_server_error_retry_delay_minutes').minutes.from_now
    raise ConnectorsShared::TransientServerError.new(
      description,
      :suspend_until => resume_at,
      :cursors => config.cursors
    )
  end

  # Not transient: propagate the original exception.
  raise
end

#cursors_modified_since_start? ⇒ Boolean

Returns:

  • (Boolean)


231
232
233
# File 'lib/connectors_sdk/base/extractor.rb', line 231

# Tells whether the live cursors differ from the snapshot taken when the
# extractor was constructed.
#
# @return [Boolean] true when config.cursors no longer equals #original_cursors
def cursors_modified_since_start?
  current_cursors = config.cursors
  starting_cursors = original_cursors
  current_cursors != starting_cursors
end

#deleted_ids(ids, &block) ⇒ Object



165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
# File 'lib/connectors_sdk/base/extractor.rb', line 165

# Streams the ids that #yield_deleted_ids reports for the given candidates.
#
# The work runs inside two Connectors::Stats.measure scopes and
# #with_auth_tokens_and_retry. When a block is given, every id is passed to
# it immediately; either way the underlying Enumerator is returned, so a
# caller that passes no block can iterate it later. A progress line is
# logged after every 100th id.
#
# @param ids [Object] candidate ids, forwarded to #yield_deleted_ids
# @yieldparam id [Object] one id reported by #yield_deleted_ids
# @return [Enumerator] enumerator over the reported ids
def deleted_ids(ids, &block)
  deleted = nil
  Connectors::Stats.measure("extractor.#{Connectors::Stats.class_key(self.class)}.deleted_ids") do
    with_auth_tokens_and_retry do
      Connectors::Stats.measure("extractor.#{Connectors::Stats.class_key(self.class)}.yield_deleted_ids") do
        reported = 0
        deleted = Enumerator.new do |sink|
          yield_deleted_ids(ids) do |deleted_id|
            sink.yield deleted_id
            reported += 1
            log_info("Deleted #{reported} documents so far") if (reported % 100).zero?
          end
        end
        # Only drain the enumerator here when the caller supplied a consumer.
        deleted.each(&block) unless block.nil?
      end
    end
  end
  deleted
end

#document_changes(modified_since: nil, &block) ⇒ Object



121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# File 'lib/connectors_sdk/base/extractor.rb', line 121

# Streams the document changes that #yield_document_changes reports.
#
# The work runs inside two Connectors::Stats.measure scopes and
# #with_auth_tokens_and_retry. When a block is given, every change is passed
# to it immediately; either way the underlying Enumerator is returned, so a
# caller that passes no block can iterate it later. A progress line is
# logged after every 100th change.
#
# @param modified_since [Object, nil] forwarded to #yield_document_changes
# @yieldparam action [Object] first value yielded by #yield_document_changes
# @yieldparam change [Object] second value yielded by #yield_document_changes
# @yieldparam subextractors [Object] third value yielded by #yield_document_changes
# @return [Enumerator] enumerator over the reported changes
def document_changes(modified_since: nil, &block)
  changes = nil
  Connectors::Stats.measure("extractor.#{Connectors::Stats.class_key(self.class)}.documents") do
    with_auth_tokens_and_retry do
      Connectors::Stats.measure("extractor.#{Connectors::Stats.class_key(self.class)}.yield_documents") do
        extracted = 0
        changes = Enumerator.new do |sink|
          yield_document_changes(:modified_since => modified_since) do |action, change, subextractors|
            sink.yield action, change, subextractors
            extracted += 1
            log_info("Extracted #{extracted} documents so far") if (extracted % 100).zero?
          end
        end
        # Only drain the enumerator here when the caller supplied a consumer.
        changes.each(&block) unless block.nil?
      end
    end
  end
  changes
end

#download_args_and_proc(id:, name:, size:, download_args:, &block) ⇒ Object



235
236
237
# File 'lib/connectors_sdk/base/extractor.rb', line 235

# Packs the download description and its block into one array.
#
# @param id [Object] first element of the result
# @param name [Object] second element of the result
# @param size [Object] third element of the result
# @param download_args [Object] fourth element of the result
# @return [Array] `[id, name, size, download_args, block]`; the last element
#   is nil when no block is given
def download_args_and_proc(id:, name:, size:, download_args:, &block)
  [id, name, size, download_args].push(block)
end

#evictable? ⇒ Boolean

Returns:

  • (Boolean)


227
228
229
# File 'lib/connectors_sdk/base/extractor.rb', line 227

# Always false in this base class.
# NOTE(review): presumably subclasses override this to opt in to eviction - confirm.
#
# @return [Boolean] false
def evictable?
  false
end

#identifying_error_message(identifier) ⇒ Object



157
158
159
# File 'lib/connectors_sdk/base/extractor.rb', line 157

# Builds the optional " of '<identifier>'" suffix used in extraction log
# messages.
#
# @param identifier [#present?] identifier of the item being extracted
# @return [String] the suffix, or an empty string when the identifier is not present
def identifying_error_message(identifier)
  return '' unless identifier.present?

  " of '#{identifier}'"
end

#permissions(source_user_id, &block) ⇒ Object



189
190
191
192
193
194
195
196
197
198
199
200
# File 'lib/connectors_sdk/base/extractor.rb', line 189

# Extracts permissions for one source user.
#
# The work runs inside two Connectors::Stats.measure scopes and
# #with_auth_tokens_and_retry. Every batch that #yield_permissions produces
# is logged and, when a block was given, passed on to it.
#
# @param source_user_id [Object] user id forwarded to #yield_permissions
# @yieldparam permissions [#size] one batch produced by #yield_permissions
# @return [Object] whatever the outer Connectors::Stats.measure call returns
def permissions(source_user_id, &block)
  Connectors::Stats.measure("extractor.#{Connectors::Stats.class_key(self.class)}.permissions") do
    with_auth_tokens_and_retry do
      Connectors::Stats.measure("extractor.#{Connectors::Stats.class_key(self.class)}.yield_permissions") do
        yield_permissions(source_user_id) do |user_permissions|
          log_info("Extracted #{user_permissions.size} permissions for source user #{source_user_id}")
          block.call(user_permissions) unless block.nil?
        end
      end
    end
  end
end

#retrieve_latest_cursors ⇒ Object



72
73
74
# File 'lib/connectors_sdk/base/extractor.rb', line 72

# Returns nil in this base class.
# NOTE(review): presumably a hook subclasses override to report fresh cursors - confirm.
#
# @return [nil]
def retrieve_latest_cursors
  nil
end

#transient_error?(error) ⇒ Boolean

Returns:

  • (Boolean)


223
224
225
# File 'lib/connectors_sdk/base/extractor.rb', line 223

# Tells whether the error is an instance of one of the classes listed in
# TRANSIENT_SERVER_ERROR_CLASSES, or of a subclass of one of them.
#
# @param error [Object] the error to classify
# @return [Boolean]
def transient_error?(error)
  TRANSIENT_SERVER_ERROR_CLASSES.any? do |transient_class|
    error.is_a?(transient_class)
  end
end

#with_auth_tokens_and_retry(&block) ⇒ Object



76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# File 'lib/connectors_sdk/base/extractor.rb', line 76

# Runs the block with transient-error and rate-limit conversion applied,
# retrying generic failures up to MAX_CONNECTION_ATTEMPTS times.
#
# Error handling, in order:
# * TokenRefreshFailedError and EvictionWithNoProgressError are logged and
#   re-raised.
# * PublishingFailedError is logged and its `reason` is raised instead.
# * Eviction, throttling, document-limit, monitoring, job-interrupted,
#   invalid-secret and invalid-indexing-configuration errors are re-raised
#   without logging or retrying.
# * Any other StandardError is augmented, logged and retried; once the
#   attempt limit is reached it is reported and re-raised.
#
# @yield the work to run
# @return [Object] the block's result
# @raise [StandardError] the last error once retries are exhausted, or any
#   of the non-retried errors listed above
def with_auth_tokens_and_retry(&block)
  connection_attempts = 0

  begin
    convert_transient_server_errors do
      convert_rate_limit_errors(&block)
    end
  rescue ConnectorsShared::TokenRefreshFailedError => e
    log_error('Could not refresh token, aborting')
    raise e
  rescue ConnectorsShared::PublishingFailedError => e
    log_error('Could not publish, aborting')
    raise e.reason
  rescue ConnectorsShared::EvictionWithNoProgressError
    log_error('Aborting job because it did not make any progress and cannot be evicted')
    raise
  rescue ConnectorsShared::EvictionError,
         ConnectorsShared::ThrottlingError,
         ConnectorsShared::JobDocumentLimitError,
         ConnectorsShared::MonitoringError,
         ConnectorsShared::JobInterruptedError,
         ConnectorsShared::SecretInvalidError,
         ConnectorsShared::InvalidIndexingConfigurationError
    # Don't retry eviction, throttling, document limit, monitoring, job
    # interruption, invalid secret or invalid indexing configuration errors;
    # let them bubble out.
    raise
  rescue StandardError => e
    ConnectorsShared::ExceptionTracking.augment_exception(e)
    connection_attempts += 1
    giving_up = connection_attempts >= MAX_CONNECTION_ATTEMPTS

    # Same wording on both paths so the two log lines can be matched together.
    log_warn("Failed to connect in with_auth_tokens_and_retry. Reason: #{e.class}: #{e.message} {:message_id => #{e.id}}")
    log_warn("Retries: #{connection_attempts}/#{MAX_CONNECTION_ATTEMPTS}, #{giving_up ? 'giving up' : 'trying again'}.")
    retry unless giving_up

    ConnectorsShared::ExceptionTracking.log_exception(e)
    raise e
  end
end

#yield_deleted_ids(_ids) ⇒ Object

Raises:

  • (NotImplementedError)


161
162
163
# File 'lib/connectors_sdk/base/extractor.rb', line 161

# Not implemented in this base class. #deleted_ids calls it with the
# candidate ids and a block that receives one id per call.
#
# @param _ids [Object] candidate ids (unused here)
# @raise [NotImplementedError] always
def yield_deleted_ids(_ids)
  raise NotImplementedError
end

#yield_document_changes(modified_since: nil) ⇒ Object

Raises:

  • (NotImplementedError)


117
118
119
# File 'lib/connectors_sdk/base/extractor.rb', line 117

# Not implemented in this base class. #document_changes calls it with a
# block that receives `action, change, subextractors` per change.
#
# @param modified_since [Object, nil] value forwarded by #document_changes (unused here)
# @raise [NotImplementedError] always
def yield_document_changes(modified_since: nil)
  raise NotImplementedError
end

#yield_permissions(source_user_id) ⇒ Object



185
186
187
# File 'lib/connectors_sdk/base/extractor.rb', line 185

# Does nothing and returns nil in this base class; the block that
# #permissions passes in is never called.
#
# @param source_user_id [Object] user id forwarded by #permissions (unused here)
# @return [nil]
def yield_permissions(source_user_id)
  # no-op for content source without DLP support
end

#yield_single_document_change(identifier: nil, &block) ⇒ Object



141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# File 'lib/connectors_sdk/base/extractor.rb', line 141

# Runs the extraction of a single document and reports the outcome to the
# monitor.
#
# On success the monitor is notified through `note_success`. Exceptions
# matching #fatal_exception_classes are logged as errors and re-raised. Any
# other StandardError is logged as a warning and handed to the monitor's
# `note_error` instead of being raised here.
#
# @param identifier [Object, nil] identifies the document in log messages
# @yield the extraction work for one document
# @return [Object] the result of `monitor.note_success` or `monitor.note_error`
def yield_single_document_change(identifier: nil, &block)
  log_debug("Extracting single document for #{identifier}") if identifier
  convert_transient_server_errors { convert_rate_limit_errors(&block) }
  monitor.note_success
rescue *fatal_exception_classes => fatal
  ConnectorsShared::ExceptionTracking.augment_exception(fatal)
  subject = identifying_error_message(identifier)
  log_error("Encountered a fall-through error during extraction#{subject}: #{fatal.class}: #{fatal.message} {:message_id => #{fatal.id}}")
  raise
rescue StandardError => recoverable
  ConnectorsShared::ExceptionTracking.augment_exception(recoverable)
  subject = identifying_error_message(identifier)
  log_warn("Encountered error during extraction#{subject}: #{recoverable.class}: #{recoverable.message} {:message_id => #{recoverable.id}}")
  monitor.note_error(recoverable, :id => recoverable.id)
end