Class: CanvasLinkMigrator::LinkParser

Inherits:
Object
  • Object
show all
Defined in:
lib/canvas_link_migrator/link_parser.rb

Constant Summary collapse

# Reference keywords that can appear in link values wrapped in "$" delimiters.
# convert_link restores any percent-encoded form ("%24KEY%24") of these back
# to "$KEY$" before the URL is parsed.
REFERENCE_KEYWORDS =
%w[CANVAS_COURSE_REFERENCE CANVAS_OBJECT_REFERENCE WIKI_REFERENCE IMS_CC_FILEBASE IMS-CC-FILEBASE].freeze
"LINK.PLACEHOLDER"
# Object types accepted as the first path segment after
# $CANVAS_OBJECT_REFERENCE$ or $WIKI_REFERENCE$. parse_url reports a warning
# through the migration query service and treats the link as resolved when
# the segment is not in this list.
KNOWN_REFERENCE_TYPES =
%w[
  announcements
  appointment_participants
  assignment_groups
  assignments
  attachments
  calendar_events
  context_external_tools
  context_module_tags
  context_modules
  course_paces
  created_learning_outcomes
  discussion_entries
  discussion_topics
  external_feeds
  grading_standards
  groups
  learning_outcome_groups
  learning_outcome_links
  learning_outcomes
  linked_learning_outcomes
  media_attachments_iframe
  modules
  pages
  quizzes
  rubrics
  wiki
  wiki_pages
].freeze
# Tag names that #convert may unwrap when remove_outer_nodes_if_one_child is
# set and the element is an attribute-less only child.
CONTAINER_TYPES =
%w[div p body].freeze
%w[rel href src srcset data value longdesc data-download-url].freeze
# Parent tag names whose <source data-media-id> children cause #convert to
# turn the parent element into an iframe.
RCE_MEDIA_TYPES =
%w[audio video].freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(migration_query_service) ⇒ LinkParser

Returns a new instance of LinkParser.



63
64
65
66
# File 'lib/canvas_link_migrator/link_parser.rb', line 63

# Stores the service object used for lookups during link conversion and
# starts with an empty unresolved-link map.
#
# @param migration_query_service [Object] collaborator queried by
#   #convert_link and #parse_url (context hosts, context path, domain
#   substitutions, warnings, embedded images)
def initialize(migration_query_service)
  @migration_query_service = migration_query_service
  # reset! initializes @unresolved_link_map to an empty hash
  reset!
end

Instance Attribute Details

#migration_query_service ⇒ Object (readonly)

Returns the value of attribute migration_query_service.



61
62
63
# File 'lib/canvas_link_migrator/link_parser.rb', line 61

# Reader for the service object supplied to the constructor.
#
# @return [Object] the value of @migration_query_service
def migration_query_service
  @migration_query_service
end

#unresolved_link_map ⇒ Object (readonly)

Returns the value of attribute unresolved_link_map.



61
62
63
# File 'lib/canvas_link_migrator/link_parser.rb', line 61

# Reader for the map filled in by #add_unresolved_link.
#
# @return [Hash] keyed by { type:, migration_id: } hashes; each value maps a
#   field to the array of links recorded for it
def unresolved_link_map
  @unresolved_link_map
end

Instance Method Details

#add_unresolved_link(link, item_type, mig_id, field) ⇒ Object



72
73
74
75
76
77
# File 'lib/canvas_link_migrator/link_parser.rb', line 72

# Records +link+ as unresolved for the given item and field.
#
# Entries are grouped first by a { type:, migration_id: } key and then by
# field; each field holds an array of links in insertion order.
#
# @param link [Object] the link data to remember
# @param item_type [Object] stored under the :type part of the key
# @param mig_id [Object] stored under the :migration_id part of the key
# @param field [Object] the field of the item that contains the link
# @return [Array] the array of links for that item and field, after appending
def add_unresolved_link(link, item_type, mig_id, field)
  item_key = { type: item_type, migration_id: mig_id }
  links_by_field = (@unresolved_link_map[item_key] ||= {})
  (links_by_field[field] ||= []) << link
end

#convert(html, item_type, mig_id, field, remove_outer_nodes_if_one_child: nil) ⇒ Object



83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# File 'lib/canvas_link_migrator/link_parser.rb', line 83

# Parses +html+, rewrites every link attribute through #convert_link, and
# returns the resulting markup.
#
# @param html [String, nil] markup to process; nil is treated as ""
# @param item_type [Object] passed through to #convert_link
# @param mig_id [Object] converted with #to_s, then passed to #convert_link
# @param field [Object] passed through to #convert_link
# @param remove_outer_nodes_if_one_child [Boolean, nil] when truthy, strip
#   attribute-less wrapper elements listed in CONTAINER_TYPES as long as each
#   is an only child that itself has a child
# @return [String] inner HTML of the (possibly unwrapped) body node, or ""
#   when there is no body or Nokogiri raises a syntax error
def convert(html, item_type, mig_id, field, remove_outer_nodes_if_one_child: nil)
  mig_id = mig_id.to_s
  document = Nokogiri::HTML5(html || "")

  # Turn <audio>/<video> elements that wrap a media <source> into iframes
  document.search("source[data-media-id]").each do |source_tag|
    media_parent = source_tag.parent
    next unless RCE_MEDIA_TYPES.include?(media_parent.name)

    media_parent.name = "iframe"
    media_parent["src"] = source_tag["src"]
    source_tag.remove
  end

  # Every node is checked for every link-bearing attribute
  document.search("*").each do |element|
    LINK_ATTRS.each { |attr_name| convert_link(element, attr_name, item_type, mig_id, field) }
  end

  root = document.at_css("body")
  return "" unless root

  if remove_outer_nodes_if_one_child
    # Descend while the current node has a single child that has content
    while root.children.size == 1 && root.child.child
      sole_child = root.child
      break unless CONTAINER_TYPES.member?(sole_child.name) && sole_child.attributes.blank?

      root = sole_child
    end
  end

  root.inner_html
rescue Nokogiri::SyntaxError
  ""
end


#convert_link(node, attr, item_type, mig_id, field) ⇒ Object



119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
# File 'lib/canvas_link_migrator/link_parser.rb', line 119

# Rewrites one link-bearing attribute of +node+ in place.
#
# Resolved links are replaced with their final URL. Absolute URLs whose host
# matches one of the service's context hosts are made relative. Unresolved
# links are swapped for a placeholder and recorded via #add_unresolved_link
# so they can be handled later.
#
# @param node [Object] element being processed; read and written via [] / []=
# @param attr [String] attribute name to inspect (e.g. "href", "src", "value")
# @param item_type [Object] passed to #add_unresolved_link
# @param mig_id [Object] passed to #add_unresolved_link
# @param field [Object] passed to #add_unresolved_link
# @return [Object, nil] nil when the attribute is blank, skipped, or
#   unparseable; otherwise the value of the last assignment / recording call
def convert_link(node, attr, item_type, mig_id, field)
  return unless node[attr].present?

  # A "value" attribute is only treated as a link when it carries a file-base
  # or course reference token; any other value is left untouched.
  if attr == "value" &&
     !(node[attr] =~ /IMS(?:-|_)CC(?:-|_)FILEBASE/ || node[attr].include?("CANVAS_COURSE_REFERENCE"))
    return
  end

  url = node[attr].dup
  # Restore "$KEY$" tokens whose dollar signs were percent-encoded as %24
  REFERENCE_KEYWORDS.each do |ref|
    url.gsub!("%24#{ref}%24", "$#{ref}$")
  end

  begin
    result = parse_url(url, node, attr)
  rescue Addressable::URI::InvalidURIError
    # Unparseable URL: leave the attribute as it is
    return
  end

  if result[:resolved]
    # resolved, just replace and carry on
    new_url = result[:new_url] || url
    unless CanvasLinkMigrator.relative_url?(new_url)
      # perform configured substitutions
      if (processed_url = @migration_query_service.process_domain_substitutions(new_url))
        new_url = processed_url
      end
      # relative-ize absolute links outside the course but inside our domain
      # (analogous to what is done in Api#process_incoming_html_content)
      begin
        uri = URI.parse(new_url)
        # Compare on host name only: drop any ":port" suffix from each entry
        account_hosts = @migration_query_service.context_hosts.map { |h| h.split(":").first }
        if account_hosts.include?(uri.host)
          uri.scheme = uri.host = uri.port = nil
          new_url = uri.to_s
        end
      rescue URI::InvalidURIError, URI::InvalidComponentError
        # Best effort: keep the substituted URL if it cannot be relativized
        nil
      end
    end
    node[attr] = new_url
  else
    result.delete(:resolved)
    if result[:link_type] == :media_object
      # because we may actually change the media comment node itself
      # (rather than just replacing a value), we're going to
      # replace the entire node with a placeholder
      result[:old_value] = node.to_xml
      result[:placeholder] = placeholder(result[:old_value])
      placeholder_node = Nokogiri::HTML5.fragment(result[:placeholder])

      node.replace(placeholder_node)
    else
      result[:old_value] = node[attr]
      result[:placeholder] = placeholder(result[:old_value])
      # replace the inner html of an anchor tag if it matches the href
      if node.name == "a" && attr == "href" && node["href"] == node.inner_html.delete("\n").strip
        node.inner_html = result[:placeholder]
      end
      node[attr] = result[:placeholder]
    end
    add_unresolved_link(result, item_type, mig_id, field)
  end
end

#parse_url(url, node, attr) ⇒ Object

Returns a hash with the resolution status, plus the data to hold onto if the link is unresolved.



193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
# File 'lib/canvas_link_migrator/link_parser.rb', line 193

# Classifies +url+ and returns a hash built by #resolved or #unresolved.
#
# The branches are tried in order; the first pattern that matches decides the
# result, so more specific reference patterns come before the generic
# $CANVAS_COURSE_REFERENCE$ fallback.
#
# @param url [String] attribute value to classify
# @param node [Object] element owning the attribute; its name, "class" and
#   "data-media-id" attributes are consulted for media detection
# @param attr [String] name of the attribute the URL came from
# @return [Hash] { resolved: true, new_url: ... } or
#   { resolved: false, link_type: ..., ... } with type-specific data
def parse_url(url, node, attr)
  parsed_url = Addressable::URI.parse(url)
  query_values = parsed_url.query_values
  # Remove the "media_attachment" query parameter and remember whether it was "true"
  media_attachment = query_values.try(:delete, "media_attachment") == "true"
  if media_attachment
    # Rebuild the URL without that parameter (no query at all if none remain)
    parsed_url.query_values = query_values.presence || nil
    url = Addressable::URI.unencode(parsed_url)
  end

  if url =~ /wiki_page_migration_id=(.*)/
    unresolved(:wiki_page, migration_id: $1)
  elsif url =~ /discussion_topic_migration_id=(.*)/
    unresolved(:discussion_topic, migration_id: $1)
  elsif url =~ %r{\$CANVAS_COURSE_REFERENCE\$/modules/items/([^?]*)(\?.*)?}
    unresolved(:module_item, migration_id: $1, query: $2)
  elsif url =~ %r{\$CANVAS_COURSE_REFERENCE\$/file_ref/([^/?#]+)(.*)}
    unresolved(:file_ref,
               migration_id: $1,
               rest: $2,
               in_media_iframe: attr == "src" && ["iframe", "source"].include?(node.name) && node["data-media-id"],
               media_attachment: media_attachment)
  elsif url =~ %r{(?:\$CANVAS_OBJECT_REFERENCE\$|\$WIKI_REFERENCE\$)/([^/]*)/([^?]*)(\?.*)?}
    if KNOWN_REFERENCE_TYPES.include?($1)
      unresolved(:object, type: $1, migration_id: $2, query: $3)
    else
      # If the `type` is not known, there's something amiss...
      @migration_query_service.report_link_parse_warning($1)
      resolved(url)
    end
  elsif url =~ %r{\$CANVAS_COURSE_REFERENCE\$/(.*)}
    # Any other course reference: prefix the remainder with the context path
    resolved("#{@migration_query_service.context_path}/#{$1}")

  elsif url =~ %r{\$IMS(?:-|_)CC(?:-|_)FILEBASE\$/(.*)}
    rel_path = URI::DEFAULT_PARSER.unescape($1)
    # Media comment anchors and media iframes/sources are media objects;
    # every other file-base link is a plain file
    if (attr == "href" && node["class"]&.include?("instructure_inline_media_comment")) ||
       (attr == "src" && ["iframe", "source"].include?(node.name) && node["data-media-id"])
      unresolved(:media_object, rel_path: rel_path, media_attachment: media_attachment)
    else
      unresolved(:file, rel_path: rel_path)
    end
  elsif (attr == "href" && node["class"]&.include?("instructure_inline_media_comment")) ||
        (attr == "src" && ["iframe", "source"].include?(node.name) && node["data-media-id"])
    # Course copy media reference, leave it alone
    resolved
  elsif @migration_query_service.supports_embedded_images && attr == "src" && (info_match = url.match(%r{\Adata:(?<mime_type>[-\w]+/[-\w+.]+)?;base64,(?<image>.*)}m))
    # Base64 data: URI in a src attribute; the service decides how to link it
    result = @migration_query_service.link_embedded_image(info_match)
    if result[:resolved]
      resolved(result[:url])
    else
      unresolved(:file, rel_path: result[:url])
    end
  elsif # rubocop:disable Lint/DuplicateBranch
        # Equation image, leave it alone
        (attr == "src" && node["class"] && node["class"].include?("equation_image")) || # rubocop:disable Layout/ConditionPosition
        # The file is in the context of an AQ, leave the link alone
        url =~ %r{\A/assessment_questions/\d+/files/\d+} ||
        # This points to a specific file already, leave it alone
        url =~ %r{\A/courses/\d+/files/\d+} ||
        !@migration_query_service.fix_relative_urls? ||
        # It's just a link to an anchor, leave it alone
        url.start_with?("#")
    resolved
  elsif CanvasLinkMigrator.relative_url?(url)
    # Remaining relative URLs are treated as file paths to look up later
    unresolved(:file, rel_path: URI::DEFAULT_PARSER.unescape(url))
  else # rubocop:disable Lint/DuplicateBranch
    resolved
  end
end

#placeholder(old_value) ⇒ Object



79
80
81
# File 'lib/canvas_link_migrator/link_parser.rb', line 79

# Builds the placeholder string that stands in for an unresolved value.
#
# @param old_value [String] the original attribute value or node markup
# @return [String] LINK_PLACEHOLDER, an underscore, and the MD5 hex digest of
#   +old_value+
def placeholder(old_value)
  digest = Digest::MD5.hexdigest(old_value)
  "#{LINK_PLACEHOLDER}_#{digest}"
end

#reset! ⇒ Object



68
69
70
# File 'lib/canvas_link_migrator/link_parser.rb', line 68

# Discards all recorded unresolved links by replacing the map with a new
# empty hash.
#
# @return [Hash] the new, empty map
def reset!
  @unresolved_link_map = {}
end

#resolved(new_url = nil) ⇒ Object



188
189
190
# File 'lib/canvas_link_migrator/link_parser.rb', line 188

# Builds the result hash for a link that needs no later lookup.
#
# @param new_url [String, nil] replacement URL, or nil when there is none
# @return [Hash] { resolved: true, new_url: new_url }
def resolved(new_url = nil)
  {
    resolved: true,
    new_url: new_url
  }
end

#unresolved(type, data = {}) ⇒ Object



184
185
186
# File 'lib/canvas_link_migrator/link_parser.rb', line 184

# Builds the result hash for a link that could not be resolved.
#
# @param type [Symbol] stored under :link_type
# @param data [Hash] extra entries merged on top of the base hash; on a key
#   clash the value from +data+ wins
# @return [Hash] a new hash starting with resolved: false and link_type: type
def unresolved(type, data = {})
  base = { resolved: false, link_type: type }
  base.merge(data)
end