Class: HarvesterTools::MetadataHarvester

Inherits:
Object
  • Object
show all
Defined in:
lib/metadata_harvester.rb

Class Method Summary collapse

Class Method Details

.abbreviate_type(contenttype:) ⇒ Object



229
230
231
232
233
234
235
236
237
238
239
240
241
# File 'lib/metadata_harvester.rb', line 229

# Looks up the short format label under which a MIME type is listed in the
# harvester's format tables.
#
# The RDF, XML, HTML and JSON tables are merged (in that order) and scanned
# entry by entry until one lists +contenttype+. Every entry examined is echoed
# to stderr and noted in @meta.comments.
#
# @param contenttype [String] the MIME type to look up
# @return [Object, nil] key of the first table entry listing +contenttype+,
#   or nil when no entry lists it
def self.abbreviate_type(contenttype:)
  known_formats = FspHarvester::RDF_FORMATS
                  .merge(FspHarvester::XML_FORMATS)
                  .merge(FspHarvester::HTML_FORMATS)
                  .merge(FspHarvester::JSON_FORMATS)

  # find stops at the first hit, so later entries are neither tested nor logged
  hit = known_formats.find do |label, mime_types|
    warn "\n\ntype #{label}\nvals #{mime_types}\n\n"
    @meta.comments << "INFO: testing #{label} MIME types for #{contenttype}"
    mime_types.include?(contenttype)
  end
  return nil unless hit

  @meta.comments << "INFO: detected a #{hit.first} MIME type"
  hit.first
end

.attempt_to_detect_type(body:, headers:) ⇒ Object



95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# File 'lib/metadata_harvester.rb', line 95

# Works out which serialization a fetched metadata document uses.
#
# The body is sniffed first: an XML declaration or an <html> tag near the top
# settles the format, and the server's claimed Content-Type is only used to
# pick the exact MIME type (warning '022' is recorded when the claim does not
# fit what was sniffed, and a default MIME type is used instead). Anything else
# is handed to check_ld and then, if that finds nothing, to check_json.
#
# @param body [String] the response body
# @param headers [Hash] response headers; only +:content_type+ is read, and it
#   may be absent
# @return [Array] two elements: the short format label and the MIME type;
#   the result of check_json (both nil on failure) when nothing matched
def self.attempt_to_detect_type(body:, headers:)
  #  described by should be an html, xml, json, or linked data document
  @meta.comments << "INFO: Testing metadata format for html, xml, and linked data formats\n"

  # Strip media-type parameters on a copy: the header may be missing or frozen,
  # and the caller's headers should not be edited as a side effect.
  claimed_type = headers[:content_type].to_s.gsub(/\s*;.*/, '')

  # take a sample, <html> should appear quite early (it will appear in other
  # places in e.g. tutorial documents)
  head = body[0..1000]

  # [label, default MIME type, human-readable name] when the markup gives the format away
  sniffed =
    if body =~ /^\s*<\?xml/
      if head =~ /<HTML/i
        ['html', 'text/html', 'HTML']
      elsif body =~ /<rdf:RDF/i
        ['rdfxml', 'application/rdf+xml', 'RDF-XML']
      else
        ['xml', 'application/xml', 'XML']
      end
    elsif head =~ /<HTML/i
      ['html', 'text/html', 'HTML']
    end

  if sniffed
    abbreviation, default_type, label = sniffed
    content_type = validate_claimed_type(abbreviation: abbreviation, claimed_type: claimed_type)
    unless content_type
      # the claimed type does not fit what was sniffed: record it, use the default
      @meta.add_warning(['022', @meta.all_uris.last, ''])
      content_type = default_type
    end
    @meta.comments << "INFO: appears to be #{label}\n"
  else
    abbreviation, content_type = check_ld(body: body, claimed_type: claimed_type)
    # An empty label counts as "nothing found" too (empty strings are truthy),
    # so the JSON fallback still runs; don't test if LD already found!
    abbreviation, content_type = check_json(body: body) if abbreviation.to_s.empty?
  end

  if content_type.to_s.empty?
    source = @meta.all_uris.last
    @meta.add_warning(['017', source, ''])
    @meta.comments << "WARN: metadata format returned from #{source} (claimed Content-Type '#{claimed_type}') is not recognized.  Processing will end now.\n"
  end
  [abbreviation, content_type]
end

.attempt_to_resolve(link:, headers: FspHarvester::ACCEPT_STAR_HEADER) ⇒ Object



79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# File 'lib/metadata_harvester.rb', line 79

# Fetches the document a link points at.
#
# When the link declares a MIME type, that type is sent as the Accept header;
# otherwise the +headers+ argument is used (by default the accept-anything
# header). A failed fetch is recorded as warning '016' and in @meta.comments.
#
# @param link [#href] the link to resolve; it may also respond to +type+
# @param headers [Hash] request headers to use when the link declares no type
# @return [Object, nil] whatever HarvesterTools::WebUtils.fspfetch returned
#   (falsy when the fetch failed)
def self.attempt_to_resolve(link:, headers: FspHarvester::ACCEPT_STAR_HEADER)
  url = link.href
  @meta.comments << "INFO:  link #{url} being processed"

  declared_type = link.respond_to?('type') ? link.type : nil
  if declared_type
    header = { 'Accept' => declared_type }
  else
    @meta.comments << "INFO:  link #{url} has no MIME type, defaulting to */*"
    # fall back to the caller-supplied headers rather than sending none at all
    header = headers
  end

  response = HarvesterTools::WebUtils.fspfetch(url: url, method: :get, headers: header)
  unless response
    @meta.add_warning(['016', url, header])
    @meta.comments << "WARN: Unable to resolve describedby link #{url} using HTTP Accept header #{header}.\n"
  end
  response
end

.check_json(body:) ⇒ Object



211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
# File 'lib/metadata_harvester.rb', line 211

# Last-resort format check: does the body parse as JSON?
#
# @param body [String] the response body; it is not modified
# @return [Array] +['json', 'application/json']+ when the body parses to a
#   truthy value, otherwise +[nil, nil]+ (with a note added to @meta.comments)
def self.check_json(body:)
  parsed =
    begin
      # Re-tag a copy as UTF-8. force_encoding raises on a frozen string, which
      # the rescue below would otherwise misreport as "not JSON"; working on a
      # copy also leaves the caller's string untouched.
      JSON.parse(body.to_s.dup.force_encoding('UTF-8'))
    rescue StandardError
      # this is a best-effort sniff: any parse failure just means "not JSON"
      nil
    end

  unless parsed
    @meta.comments << "INFO: metadata does not appear to be in JSON format.  No options left.\n"
    return [nil, nil]
  end
  ['json', 'application/json']
end

.check_ld(body:, claimed_type:) ⇒ Object



159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
# File 'lib/metadata_harvester.rb', line 159

# Tries to recognise the body as a linked-data serialization.
#
# Detection runs ntriples_hack first and then RDF::Format.for on the first
# ~5000 characters. When a format is found, the claimed Content-Type is kept if
# it is a vendor type (application/vnd.*) or is one of the MIME types the
# detected format advertises; otherwise warning '022' is recorded and the
# detected format's first MIME type is used instead.
#
# @param body [String] the response body
# @param claimed_type [String] the Content-Type the server claimed
# @return [Array] two elements: the label abbreviate_type gives the chosen
#   MIME type (may be nil) and the MIME type itself; +[nil, nil]+ when the body
#   does not look like linked data, so callers can test the result with a
#   plain truthiness check
def self.check_ld(body:, claimed_type:)
  detected_type = ntriples_hack(body: body) # ntriples hack for one-line metadata records
  unless detected_type # see if distiller can detect a type
    detected_type = RDF::Format.for({ sample: body[0..5000].force_encoding('UTF-8') })
    @meta.comments << "INFO: Auto-detected type #{detected_type}\n"
  end

  unless detected_type
    @meta.comments << "INFO: metadata does not appear to be in a linked data format.  Trying other options.\n"
    return [nil, nil]
  end

  # at this point, detected_type is something like RDF::Turtle::Format.
  # content_type comes back as an array of [application/x, application/y]
  detected_content_types = detected_type.content_type

  if claimed_type =~ %r{application/vnd\.} || detected_content_types.include?(claimed_type)
    # vnd types are domain specific, and a claim that agrees with detection is trusted
    contenttype = claimed_type
    note = "INFO: using content-type #{contenttype}.\n"
  else
    warn "detected types #{detected_content_types}  claimed type #{claimed_type}"
    @meta.add_warning(['022', @meta.all_uris.last, ''])
    # just pick one arbitrarily, since none of them match the declared type anyway
    contenttype = detected_content_types.first
    note = "INFO: using content-type #{contenttype} even though there was a mismatch.\n"
  end

  abbreviation = abbreviate_type(contenttype: contenttype)
  @meta.comments << note
  [abbreviation, contenttype]
end

.extract_metadata_from_body(response:, metadata: HarvesterTools::MetadataObject.new) ⇒ Object



38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/metadata_harvester.rb', line 38

# Harvests metadata from an HTTP response that has already been fetched.
#
# The response's format is detected and the body handed to the matching
# parser. Warning '017' is recorded (and nothing is parsed) when the format is
# not recognised. Warning '023' is recorded when the server answered with a
# type the request's Accept header did not ask for, without signalling 406.
#
# @param response [#body, #headers, #code, #request] the HTTP response; its
#   request must respond to +url+ and +headers+
# @param metadata [HarvesterTools::MetadataObject] collector for comments,
#   warnings and harvested data
# @return [Object, nil] the result of process_according_to_type, or nil when
#   the format is not recognised
def self.extract_metadata_from_body(response:, metadata: HarvesterTools::MetadataObject.new)
  @meta = metadata
  @meta.comments << 'INFO:  now collecting both linked data and hash-style data using the harvested links'

  request = response.request
  abbreviation, content_type = attempt_to_detect_type(body: response.body, headers: response.headers)
  # an empty label is treated like nil: nothing was recognised
  if abbreviation.to_s.empty?
    @meta.add_warning(['017', request.url, ''])
    @meta.comments << "WARN: format returned from #{request.url} is not recognized. Moving on.\n"
    return
  end

  # Media types the request asked for, without parameters such as ";q=0.9".
  # A request with no Accept header asked for nothing in particular.
  requested_types = request.headers['Accept'].to_s.split(/\s*,\s*/).map { |type| type.sub(/\s*;.*/, '') }
  unrequested = !requested_types.empty? &&
                !requested_types.include?('*/*') &&
                !requested_types.include?(content_type)
  if unrequested && response.code != 406
    @meta.add_warning(['023', request.url, ''])
    @meta.comments << "WARN: format returned from #{request.url} does not match request type.  This should result in a 406 error, but instead was accepted as a 200.\n"
  end

  process_according_to_type(body: response.body, uri: request.url, metadata: @meta,
                            abbreviation: abbreviation, content_type: content_type)
end


8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# File 'lib/metadata_harvester.rb', line 8

# Harvests metadata from every link in +links+ whose relation is 'describedby':
# each one is fetched with attempt_to_resolve, its format is detected with
# attempt_to_detect_type, and the body is handed to process_according_to_type.
# A single MetadataParser is shared across all links.
#
# NOTE(review): in this listing the method name (after `self.`) and the
# right-hand side of `@meta =` are missing; the right-hand side is presumably
# the +metadata+ parameter. Restore both from the real source file before use.
#
# @param links [Array] link objects responding to +relation+ and +href+
#   (and optionally +type+)
# @param metadata [HarvesterTools::MetadataObject] collector for comments,
#   warnings and harvested data
# @return [Array] the 'describedby' links (the value of the final +each+)
def self.(links: [], metadata: HarvesterTools::MetadataObject.new)
  @meta = 
  @meta.comments << 'INFO:  now collecting both linked data and hash-style data using the harvested links'

  describedby = links.select { |l| l if l.relation == 'describedby' }
  warn "metadata harvester links length #{describedby.length}"

  hvst = HarvesterTools::MetadataParser.new(metadata_object: @meta) # put here because the class variable for detecting duplicates should apply to all URIs
  describedby.each do |link|
    # default to the accept-anything header unless the link declares a type
    accepttype = FspHarvester::ACCEPT_STAR_HEADER
    accept = link.respond_to?('type') ? link.type : nil
    # NOTE(review): accept is nil when the link has no type, so this gsub!
    # raises NoMethodError before the `if accept` guard below is reached.
    # gsub! also edits the string held by the link itself, in place.
    accept.gsub!('json+ld', 'ld+json')  # patch for bug in Dataverse 5.14 linksets
    accepttype = { 'Accept' => accept } if accept

    # NOTE(review): attempt_to_resolve returns a falsy value when the fetch
    # fails; response.body below would then raise.
    response = attempt_to_resolve(link: link, headers: accepttype)
    warn "\n\nRESPONSE #{response}\n\n"

    abbreviation, content_type = attempt_to_detect_type(body: response.body, headers: response.headers)
    warn "ABBR #{abbreviation} CONT #{content_type}\n\n"
    unless abbreviation
      # NOTE(review): `url` and `header` are not defined anywhere in this
      # method, so this branch raises NameError; link.href and accepttype look
      # like the intended values - confirm.
      @meta.add_warning(['017', url, header])
      @meta.comments << "WARN: metadata format returned from #{url} using Accept header #{header} is not recognized.  Processing will end now.\n"
      next
    end

    # NOTE(review): uri: receives the link object here, whereas
    # extract_metadata_from_body passes a URL string - confirm which the
    # parser expects.
    process_according_to_type(body: response.body, uri: link, metadata: @meta, abbreviation: abbreviation,
                              content_type: content_type, harvester: hvst)
  end
end

.ntriples_hack(body:) ⇒ Object

The distiller cannot recognize single-line N-Triples unless they end with a period, which is not required by the spec… so hack it!



193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
# File 'lib/metadata_harvester.rb', line 193

# The distiller cannot recognize single-line ntriples unless they end with a
# period, which is not required by the spec... so hack it!
#
# The first line of the body that looks like "<subject> <predicate> object" is
# given a trailing " ." and passed to RDF::Format.for.
#
# @param body [String] the response body
# @return [Class, nil] RDF::NTriples::Format when the patched line is detected
#   as N-Triples, otherwise nil (only the hacky case is reported)
def self.ntriples_hack(body:)
  detected_type = nil
  # Walk real lines. Splitting on all whitespace would hand over single tokens,
  # which can never satisfy the pattern below (it needs whitespace before the object).
  body.each_line do |raw_line|
    line = raw_line.strip
    next if line.empty?
    next unless line =~ /\s*<[^>]+>\s*<[^>]+>\s\S+/

    sample = "#{line} ." # adding a period allows detection of ntriples by distiller
    @meta.comments << "INFO: running ntriples hack on  #{sample}\n"
    detected_type = RDF::Format.for({ sample: sample })
    break # only the first candidate line is tested
  end
  @meta.comments << "INFO: ntriples hack found: #{detected_type}\n"

  # only return the hacky case
  detected_type == RDF::NTriples::Format ? detected_type : nil
end

.process_according_to_type(body:, uri:, abbreviation:, content_type:, metadata:, harvester: HarvesterTools::MetadataParser.new(metadata_object: @meta)) ⇒ Object



57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# File 'lib/metadata_harvester.rb', line 57

# Hands a fetched body to the parser routine that matches its format label.
#
# 'html', 'xml' and 'json' each have their own routine; 'jsonld', 'rdfxml',
# 'turtle', 'ntriples' and 'nquads' all go through the linked-data routine;
# 'specialist' only prints a notice. Any other label is ignored.
# The collector handed to the parser is always @meta.
#
# @param body [String] the document to parse
# @param uri [Object] where the document came from (used for html only)
# @param abbreviation [String] the short format label
# @param content_type [String] MIME type (used for linked data only)
# @param metadata [Object] accepted for interface compatibility; not read here
# @param harvester [#process_html, #process_xml, #process_json, #process_ld]
#   the parser to use
# @return [Object, nil] whatever the parser routine returned; nil for
#   'specialist' and for unknown labels
def self.process_according_to_type(body:, uri:, abbreviation:, content_type:, metadata:,
                                   harvester: HarvesterTools::MetadataParser.new(metadata_object: @meta))
  warn "PROCESSING #{abbreviation}"
  linked_data_labels = %w[jsonld rdfxml turtle ntriples nquads]

  if abbreviation == 'html'
    @meta.comments << 'INFO: Processing html'
    return harvester.process_html(body: body, uri: uri, metadata: @meta)
  end

  if abbreviation == 'xml'
    @meta.comments << 'INFO: Processing xml'
    return harvester.process_xml(body: body, metadata: @meta)
  end

  if abbreviation == 'json'
    @meta.comments << 'INFO: Processing json'
    return harvester.process_json(body: body, metadata: @meta)
  end

  if linked_data_labels.include?(abbreviation)
    warn "PROCESSING USING TURTLE"
    @meta.comments << 'INFO: Processing linked data'
    return harvester.process_ld(body: body, content_type: content_type, metadata: @meta)
  end

  return warn('no specialized parsers so far') if abbreviation == 'specialist'

  nil
end

.validate_claimed_type(abbreviation:, claimed_type:) ⇒ Object



140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# File 'lib/metadata_harvester.rb', line 140

# Checks whether the Content-Type a server claimed is one of the MIME types
# the harvester's format tables list for the detected format.
#
# Media-type parameters (everything from the first ';') are ignored. The
# argument itself is never modified, so frozen strings and nil are accepted.
#
# @param abbreviation [String] the short label of the detected format
# @param claimed_type [String, nil] the Content-Type claimed by the server
# @return [String, false] the claimed type without parameters when it is
#   listed for the format, otherwise false
def self.validate_claimed_type(abbreviation:, claimed_type:)
  warn "\n\nclaimed type #{claimed_type}\nabbreviation #{abbreviation}\n\n"
  # strip parameters on a copy instead of editing the caller's string in place
  bare_type = claimed_type.to_s.gsub(/\s*;.*/, '')

  accepted_types =
    case abbreviation
    when 'html'
      FspHarvester::HTML_FORMATS['html']
    when 'xml'
      FspHarvester::XML_FORMATS['xml']
    when 'json'
      FspHarvester::JSON_FORMATS['json']
    when 'jsonld', 'rdfxml', 'turtle', 'ntriples', 'nquads'
      # any linked-data MIME type is accepted for any linked-data format
      FspHarvester::RDF_FORMATS.values.flatten
    when 'specialist'
      warn 'no specialized parsers so far'
      nil
    end

  Array(accepted_types).include?(bare_type) ? bare_type : false
end