Class: Webhookdb::Replicator::TransistorEpisodeV1
- Includes:
- Appydays::Loggable, TransistorV1Mixin
- Defined in:
- lib/webhookdb/replicator/transistor_episode_v1.rb
Constant Summary collapse
- BLOCK_ELEMENT_TAGS =
["p", "div"].freeze
Constants included from DBAdapter::ColumnTypes
DBAdapter::ColumnTypes::BIGINT, DBAdapter::ColumnTypes::BIGINT_ARRAY, DBAdapter::ColumnTypes::BOOLEAN, DBAdapter::ColumnTypes::COLUMN_TYPES, DBAdapter::ColumnTypes::DATE, DBAdapter::ColumnTypes::DECIMAL, DBAdapter::ColumnTypes::DOUBLE, DBAdapter::ColumnTypes::FLOAT, DBAdapter::ColumnTypes::INTEGER, DBAdapter::ColumnTypes::INTEGER_ARRAY, DBAdapter::ColumnTypes::OBJECT, DBAdapter::ColumnTypes::TEXT, DBAdapter::ColumnTypes::TEXT_ARRAY, DBAdapter::ColumnTypes::TIMESTAMP, DBAdapter::ColumnTypes::UUID
Constants inherited from Base
Instance Attribute Summary
Attributes inherited from Base
Class Method Summary collapse
Instance Method Summary collapse
- #_denormalized_columns ⇒ Object
-
#_extract_first_html_line_as_text(element) ⇒ Object
Usually the Transistor HTML looks like <div>foo<br><br>hello</div>.
- #_fetch_backfill_page(pagination_token, last_backfilled:) ⇒ Object
- #_fetch_enrichment(resource) ⇒ Object
- #_prepare_for_insert(resource, event, request, enrichment) ⇒ Object
- #parse_date_from_api(date_string) ⇒ Object
- #upsert_has_deps? ⇒ Boolean
Methods included from TransistorV1Mixin
#_remote_key_column, #_resource_and_event, #_timestamp_column_name, #_update_where_expr, #_verify_backfill_401_err_msg, #_verify_backfill_err_msg, #_webhook_response, #calculate_backfill_state_machine
Methods inherited from Base
#_any_subscriptions_to_notify?, #_backfill_state_change_fields, #_backfillers, #_clear_backfill_information, #_clear_webook_information, #_coalesce_excluded_on_update, #_enqueue_backfill_jobs, #_extra_index_specs, #_find_dependency_candidate, #_notify_dependents, #_parallel_backfill, #_publish_rowupsert, #_remote_key_column, #_resource_and_event, #_resource_to_data, #_store_enrichment_body?, #_timestamp_column_name, #_to_json, #_update_where_expr, #_upsert_conflict_target, #_upsert_update_expr, #_upsert_webhook, #_upsert_webhook_single_resource, #_verify_backfill_err_msg, #_webhook_response, #_webhook_state_change_fields, #admin_dataset, #avoid_writes?, #backfill, #backfill_not_supported_message, #calculate_and_backfill_state_machine, #calculate_backfill_state_machine, #calculate_dependency_state_machine_step, #calculate_preferred_create_state_machine, #calculate_webhook_state_machine, chunked_row_update_bounds, #clear_backfill_information, #clear_webhook_information, #create_table, #create_table_modification, #create_table_partitions, #data_column, #dbadapter_table, #denormalized_columns, #descriptor, #dispatch_request_to, #documentation_url, #enqueue_sync_targets, #enrichment_column, #ensure_all_columns, #ensure_all_columns_modification, #existing_partitions, #find_dependent, #find_dependent!, #indices, #initialize, #on_backfill_error, #on_dependency_webhook_upsert, #partition?, #partitioning, #preferred_create_state_machine_method, #preprocess_headers_for_logging, #primary_key_column, #process_state_change, #process_webhooks_synchronously?, #qualified_table_sequel_identifier, #readonly_dataset, #remote_key_column, #requires_sequence?, #resource_name_plural, #resource_name_singular, #schema_and_table_symbols, #storable_columns, #synchronous_processing_response_body, #timestamp_column, #upsert_webhook, #upsert_webhook_body, #verify_backfill_credentials, #webhook_endpoint, #webhook_response, #with_advisory_lock
Constructor Details
This class inherits a constructor from Webhookdb::Replicator::Base
Class Method Details
.descriptor ⇒ Webhookdb::Replicator::Descriptor
12 13 14 15 16 17 18 19 20 21 |
# File 'lib/webhookdb/replicator/transistor_episode_v1.rb', line 12
#
# Build the descriptor that registers this replicator.
#
# @return [Webhookdb::Replicator::Descriptor]
def self.descriptor
  # Constructor callback handed to the descriptor; it receives the
  # value passed by the caller and wraps it in a new replicator instance.
  build_replicator = ->(sint) { Webhookdb::Replicator::TransistorEpisodeV1.new(sint) }
  return Webhookdb::Replicator::Descriptor.new(
    name: "transistor_episode_v1",
    ctor: build_replicator,
    feature_roles: [],
    resource_name_singular: "Transistor Episode",
    supports_backfill: true,
    api_docs_url: "https://developers.transistor.fm/#Episode",
  )
end
Instance Method Details
#_denormalized_columns ⇒ Object
23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
# File 'lib/webhookdb/replicator/transistor_episode_v1.rb', line 23
#
# List the columns stored alongside the raw resource.
# Most are read from the resource's "attributes" hash; show_id comes from
# the "relationships" hash; the optional columns declare no data_key here.
#
# @return [Array<Webhookdb::Replicator::Column>]
def _denormalized_columns
  col = Webhookdb::Replicator::Column
  return [
    col.new(:author, TEXT, data_key: ["attributes", "author"]),
    col.new(:created_at, TIMESTAMP, index: true, data_key: ["attributes", "created_at"]),
    col.new(:duration, INTEGER, data_key: ["attributes", "duration"]),
    col.new(:keywords, TEXT, data_key: ["attributes", "keywords"]),
    col.new(:number, INTEGER, index: true, data_key: ["attributes", "number"]),
    col.new(:published_at, TIMESTAMP, index: true, data_key: ["attributes", "published_at"]),
    col.new(:season, INTEGER, index: true, data_key: ["attributes", "season"]),
    col.new(:show_id, TEXT, index: true, data_key: ["relationships", "show", "data", "id"]),
    col.new(:status, TEXT, data_key: ["attributes", "status"]),
    col.new(:title, TEXT, data_key: ["attributes", "title"]),
    col.new(:type, TEXT, data_key: ["attributes", "type"]),
    col.new(:updated_at, TIMESTAMP, index: true, data_key: ["attributes", "updated_at"]),
    col.new(:transcript_text, TEXT, optional: true),
    # Ideally these would have converters, but they'd be very confusing, and when this was built
    # we only had one transistor user, so we truncated the table instead.
    col.new(:api_format, INTEGER, optional: true),
    col.new(:logical_summary, TEXT, optional: true),
    col.new(:logical_description, TEXT, optional: true),
  ]
end
#_extract_first_html_line_as_text(element) ⇒ Object
Usually the Transistor HTML looks like <div>foo<br><br>hello</div>. Extract ‘foo’ as text and return it,
removing the leading <br> tags so that the element is left as <div>hello</div>.
105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
# File 'lib/webhookdb/replicator/transistor_episode_v1.rb', line 105
#
# Pull the first "line" out of the first div/p found under +element+ and
# return it as a stripped string. The consumed nodes are removed from the
# document, so +element+ is mutated and afterwards holds only what remains.
#
# @param element a parsed Nokogiri node or fragment (it must respond to +css+)
# @return [String, nil] the first line, or nil when there is no div/p at all
def _extract_first_html_line_as_text(element)
  # The text lives in the first div or p element.
  container = element.css("div, p").first
  return nil unless container

  summary_html = +""
  # Walk a snapshot of the children, since nodes are removed while iterating:
  # - text nodes belong to the first line;
  # - a br ends the first line;
  # - a block element (div/p) ends it too, and is left in place;
  # - anything else is presumably inline styling and is kept as HTML.
  container.children.to_a.each do |node|
    if node.is_a?(Nokogiri::XML::Text)
      summary_html << node.inner_text
    elsif node.name == "br"
      # Drop the run of br tags directly following this one as well; this acts
      # like trimming leading whitespace from the remaining description.
      while (following = node.next) && following.name == "br"
        following.remove
      end
      node.remove
      break
    elsif BLOCK_ELEMENT_TAGS.include?(node.name)
      break
    else
      summary_html << node.to_html
    end
    # Only text and inline nodes reach this point; they have been consumed.
    node.remove
  end
  # Nothing textual left in the container, so take it out of the document too.
  container.remove if container.inner_text.blank?
  return summary_html.strip
end
#_fetch_backfill_page(pagination_token, last_backfilled:) ⇒ Object
167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 |
# File 'lib/webhookdb/replicator/transistor_episode_v1.rb', line 167
#
# Fetch one page of episodes from the Transistor API for a backfill.
#
# @param pagination_token [Integer, nil] page number to request; a blank value means page 1.
# @param last_backfilled [Time, nil] when present, paging stops as soon as the last episode
#   on the fetched page was created before this moment.
# @return [Array] a two-element +[episodes, next_page]+ pair; +next_page+ is nil when
#   there is nothing further to fetch.
def _fetch_backfill_page(pagination_token, last_backfilled:)
  url = "https://api.transistor.fm/v1/episodes"
  pagination_token = 1 if pagination_token.blank?
  response = Webhookdb::Http.get(
    url,
    headers: {"x-api-key" => self.service_integration.backfill_key},
    body: {pagination: {page: pagination_token, per: 500}},
    logger: self.logger,
    timeout: Webhookdb::Transistor.http_timeout,
  )
  data = response.parsed_response
  episodes = data["data"]
  # Coerce both counters up front so the comparison and the increment agree on
  # type (previously only the increment used to_i).
  current_page = data["meta"]["currentPage"].to_i
  total_pages = data["meta"]["totalPages"].to_i
  next_page = current_page < total_pages ? current_page + 1 : nil

  if last_backfilled.present?
    # JSON timestamps arrive as strings, and a String cannot be compared with a
    # Time, so normalize both sides before comparing.
    # Time.parse comes from the 'time' stdlib, which the Time.strptime call
    # elsewhere in this class already relies on.
    as_time = ->(value) { value.is_a?(String) ? Time.parse(value) : value }
    # NOTE(review): this uses the last episode on the page as the oldest one,
    # which presumes the API returns newest-first - confirm.
    earliest_data_created =
      episodes.empty? ? Time.at(0) : as_time.call(episodes[-1].dig("attributes", "created_at"))
    # With no usable created_at we cannot tell whether the page was seen
    # before, so keep paging rather than stopping early.
    paged_to_already_seen_records =
      !earliest_data_created.nil? && earliest_data_created < as_time.call(last_backfilled)
    return episodes, nil if paged_to_already_seen_records
  end
  return episodes, next_page
end
#_fetch_enrichment(resource) ⇒ Object
140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
# File 'lib/webhookdb/replicator/transistor_episode_v1.rb', line 140
#
# Download the plain-text transcript for an episode, if it has one.
#
# @param resource [Hash] episode payload; must contain an "attributes" hash.
# @return [Hash, nil] +{transcript_text: String}+, or nil when the episode has no
#   transcript URL or the transcript request returns a 404.
# @raise [Webhookdb::Http::Error] for any HTTP failure other than a 404.
def _fetch_enrichment(resource, *)
  url = resource.fetch("attributes").fetch("transcript_url", nil)
  return nil if url.blank?
  # Request the ".txt" variant of the transcript.
  url = "#{url}.txt" unless url.end_with?(".txt")
  begin
    response = Webhookdb::Http.get(url, logger: self.logger, timeout: Webhookdb::Transistor.http_timeout)
  rescue Webhookdb::Http::Error => e
    # Not sure why this happens, but nothing we can do if it does.
    return nil if e.status == 404
    raise e
  end
  return {transcript_text: response.body}
end
#_prepare_for_insert(resource, event, request, enrichment) ⇒ Object
68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 |
# File 'lib/webhookdb/replicator/transistor_episode_v1.rb', line 68
#
# Extend the row built by the superclass with api_format, logical_summary
# and logical_description, then merge in any enrichment.
#
# In February 2023 Transistor merged their summary and description fields
# into a single 'description' HTML blob. Payloads that still carry a separate
# summary are api_format 1. A nil summary marks the new format
# (api_format 2): the first line of the HTML becomes the summary and whatever
# is left becomes the description. Whitespace is handled carefully because
# <br> tags can appear inside an element.
#
# @return [Hash] the row to insert
def _prepare_for_insert(resource, event, request, enrichment)
  row = super
  attributes = resource.fetch("attributes")
  summary = attributes.fetch("summary", nil)
  description = attributes.fetch("description", nil)
  if summary.nil?
    row[:api_format] = 2
    fragment = Nokogiri::HTML5.fragment(description)
    # This call removes the extracted first line from +fragment+ in place.
    first_line = self._extract_first_html_line_as_text(fragment)
    row[:logical_description] = nil
    if first_line
      row[:logical_summary] = first_line
      # Only keep a description when some text survives the extraction.
      row[:logical_description] = fragment.to_s.strip if fragment.inner_text.present?
    else
      # No div/p to split on: the whole blob is the summary.
      row[:logical_summary] = fragment.to_s.strip
    end
  else
    row[:logical_summary] = summary
    row[:logical_description] = description
    row[:api_format] = 1
  end
  row.merge!(enrichment) if enrichment
  return row
end
#parse_date_from_api(date_string) ⇒ Object
163 164 165 |
# File 'lib/webhookdb/replicator/transistor_episode_v1.rb', line 163
#
# Parse a day-month-year date string (for example "25-12-2023").
#
# @param date_string [String] date formatted as DD-MM-YYYY
# @return [Time]
# @raise [ArgumentError] when the string does not match the format
def parse_date_from_api(date_string)
  day_month_year = "%d-%m-%Y"
  return Time.strptime(date_string, day_month_year)
end
#upsert_has_deps? ⇒ Boolean
159 160 161 |
# File 'lib/webhookdb/replicator/transistor_episode_v1.rb', line 159
#
# @return [Boolean] always true for this replicator
def upsert_has_deps?
  return true
end