Class: RelatonW3c::DataParser

Inherits:
Object
  • Object
show all
Defined in:
lib/relaton_w3c/data_parser.rb

Constant Summary collapse

# Type codes that #parse and #type look for among a document's types.
USED_TYPES = %w[WD NOTE PER PR REC CR].freeze

# Maps a type code to the doctype name used by #parse_doctype.
DOCTYPES = {
  "TR" => "technicalReport",
  "NOTE" => "groupNote",
}.freeze

# Maps a stage code to the stage name used by #parse_docstatus.
STAGES = {
  "RET" => "Retired",
  "SPSD" => "Superseded Recommendation",
  "OBSL" => "Obsoleted Recommendation",
  "WD" => "Working Draft",
  "CRD" => "Candidate Recommendation Draft",
  "CR" => "Candidate Recommendation",
  "PR" => "Proposed Recommendation",
  "PER" => "Proposed Edited Recommendation",
  "REC" => "Recommendation",
}.freeze

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(rdf, sol, fetcher) ⇒ DataParser

Document parser initialization

Parameters:



28
29
30
31
32
# File 'lib/relaton_w3c/data_parser.rb', line 28

# Document parser initialization
#
# @param rdf [Object] data source; instance methods call `query` on it
# @param sol [Object] query solution describing the document; instance
#   methods probe it for `link`, `version_of`, `title` and `date`
# @param fetcher [Object] object whose `group_names` is read when
#   resolving the editorial group
def initialize(rdf, sol, fetcher)
  @rdf, @sol = rdf, sol
  @fetcher = fetcher
end

Class Method Details

.parse(rdf, sol, fetcher) ⇒ RelatonW3c::W3cBibliographicItem?

Initialize document parser and run it

Parameters:

Returns:

  • (RelatonW3c::W3cBibliographicItem, nil)

    bibliographic item



42
43
44
# File 'lib/relaton_w3c/data_parser.rb', line 42

# Initialize document parser and run it
#
# @param rdf [Object] data source handed to the new parser
# @param sol [Object] query solution handed to the new parser
# @param fetcher [Object] fetcher handed to the new parser
#
# @return [RelatonW3c::W3cBibliographicItem, nil] bibliographic item
def self.parse(rdf, sol, fetcher)
  parser = new(rdf, sol, fetcher)
  parser.parse
end

.parse_identifier(url) ⇒ String

Parse identifier from URL

Parameters:

  • url (String)

    URL

Returns:

  • (String)

    identifier



144
145
146
147
148
149
# File 'lib/relaton_w3c/data_parser.rb', line 144

# Parse identifier from URL
#
# Uses the hyphen/plus-separated path segment captured by the pattern
# (optionally followed by one more segment); when the pattern does not
# match, falls back to the text after the last "/".
#
# @param url [String] URL
#
# @return [String] identifier
def self.parse_identifier(url)
  source = url.to_s
  matched = source.match(/.+\/(\w+(?:[-+][\w.]+)+(?:\/\w+)?)/)
  return matched[1].to_s if matched

  source.split("/").last
end

Instance Method Details

#create_editor(name) ⇒ Object



419
420
421
422
423
424
# File 'lib/relaton_w3c/data_parser.rb', line 419

# Create an editor contribution
#
# @param name [String] editor's complete name
#
# @return [RelatonBib::ContributionInfo] contribution with role "editor"
def create_editor(name)
  localized = RelatonBib::LocalizedString.new(name, "en", "Latn")
  full_name = RelatonBib::FullName.new(completename: localized)
  person = RelatonBib::Person.new(name: full_name)
  RelatonBib::ContributionInfo.new(entity: person, role: [type: "editor"])
end

#create_relation(url, type, desc = nil) ⇒ RelatonBib::DocumentRelation

Create relation

Parameters:

  • url (String)

    relation URL

  • type (String)

    relation type

  • desc (String, nil) (defaults to: nil)

    relation description

Returns:

  • (RelatonBib::DocumentRelation)

    document relation built for the given URL



373
374
375
376
377
378
379
380
381
# File 'lib/relaton_w3c/data_parser.rb', line 373

# Create relation
#
# @param url [String] relation URL
# @param type [String] relation type
# @param desc [String, nil] relation description
#
# @return [RelatonBib::DocumentRelation] relation whose bibitem carries the
#   PubID derived from the URL and a "src" link to it
def create_relation(url, type, desc = nil)
  pubid = pub_id(url)
  related = W3cBibliographicItem.new(
    formattedref: RelatonBib::FormattedRef.new(content: pubid),
    docid: [RelatonBib::DocumentIdentifier.new(type: "W3C", id: pubid, primary: true)],
    link: [RelatonBib::TypedUri.new(type: "src", content: url)],
  )
  # description stays nil when no text was supplied
  description = desc ? RelatonBib::FormattedString.new(content: desc) : nil
  RelatonBib::DocumentRelation.new(type: type, bibitem: related, description: description)
end

#document_versions ⇒ Array<RDF::Query::Solution>

Query document versions relations

Returns:

  • (Array<RDF::Query::Solution>)

    query results



327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
# File 'lib/relaton_w3c/data_parser.rb', line 327

# Query document versions relations
#
# Runs one query per entry returned by #version_of and collects every
# solution (link, title, date) into a single array. The result is memoized.
#
# @return [Array<RDF::Query::Solution>] query results
def document_versions # rubocop:disable Metrics/MethodLength
  return @document_versions if @document_versions

  @document_versions = version_of.flat_map do |s|
    sse = SPARQL.parse(%(
      PREFIX : <http://www.w3.org/2001/02pd/rec54#>
      PREFIX dc: <http://purl.org/dc/elements/1.1/>
      PREFIX doc: <http://www.w3.org/2000/10/swap/pim/doc#>
      PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
      SELECT ?link ?title ?date
      WHERE {
        ?link doc:versionOf <#{s.version_of}> ;
        dc:title ?title ;
        dc:date ?date .
      }
    ))
    @rdf.query(sse).to_a
  end
end

#editor_drafts ⇒ Array<RelatonBib::TypedUri>

Parse editor drafts links

Returns:

  • (Array<RelatonBib::TypedUri>)

    links



287
288
289
290
291
292
293
294
295
296
297
298
299
# File 'lib/relaton_w3c/data_parser.rb', line 287

# Parse editor drafts links
#
# Returns an empty array when the solution has no link; otherwise queries
# the `:ED` values for the document link and wraps each as a "current" URI.
#
# @return [Array<RelatonBib::TypedUri>] links
def editor_drafts # rubocop:disable Metrics/MethodLength
  return [] unless @sol.respond_to?(:link)

  query_text = %(
    PREFIX : <http://www.w3.org/2001/02pd/rec54#>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    SELECT ?latest
    WHERE { <#{@sol.link.to_s.strip}> :ED ?latest . }
  )
  drafts = @rdf.query(SPARQL.parse(query_text))
  drafts.map { |draft| RelatonBib::TypedUri.new(type: "current", content: draft.latest.to_s.strip) }
end

#identifier(link = nil) ⇒ String

Generate identifier from URL

Parameters:

  • link (RDF::URI, nil) (defaults to: nil)

Returns:

  • (String)

    identifier



132
133
134
135
# File 'lib/relaton_w3c/data_parser.rb', line 132

# Generate identifier from URL
#
# @param link [RDF::URI, nil] URL to use; when nil, the solution's link is
#   used if it has one, otherwise its version_of
#
# @return [String] identifier
def identifier(link = nil)
  source = link
  source ||= @sol.respond_to?(:link) ? @sol.link : @sol.version_of
  self.class.parse_identifier(source.to_s.strip)
end

#parse ⇒ RelatonW3c::W3cBibliographicItem?

Parse document

Returns:

  • (RelatonW3c::W3cBibliographicItem, nil)

    bibliographic item



51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# File 'lib/relaton_w3c/data_parser.rb', line 51

# Parse document
#
# @return [RelatonW3c::W3cBibliographicItem, nil] bibliographic item, or nil
#   when the solution has a link but none of its types is in USED_TYPES
def parse # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
  versioned = @sol.respond_to?(:link)
  return if versioned && types_stages.none? { |ts| USED_TYPES.include?(ts) }

  attrs = {
    type: "standard",
    doctype: parse_doctype,
    language: ["en"],
    script: ["Latn"],
    docstatus: parse_docstatus,
    title: parse_title,
    link: parse_link,
    docid: parse_docid,
    formattedref: parse_formattedref,
    docnumber: identifier,
    series: parse_series,
    date: parse_date,
    relation: parse_relation,
    contributor: parse_contrib,
    editorialgroup: parse_editorialgroup,
  }
  RelatonW3c::W3cBibliographicItem.new(**attrs)
end

#parse_contrib ⇒ Array<RelatonBib::ContributionInfo>

Parse contributor

Returns:

  • (Array<RelatonBib::ContributionInfo>)

    contributor



399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
# File 'lib/relaton_w3c/data_parser.rb', line 399

# Parse contributor
#
# Always includes W3C as publisher. When the solution has a link, the
# editors' full names are queried, ordered by name, and appended as editors.
#
# @return [Array<RelatonBib::ContributionInfo>] contributor
def parse_contrib # rubocop:disable Metrics/MethodLength
  w3c = RelatonBib::Organization.new(
    name: "World Wide Web Consortium", abbreviation: "W3C", url: "https://www.w3.org/"
  )
  contribs = [RelatonBib::ContributionInfo.new(entity: w3c, role: [type: "publisher"])]
  return contribs unless @sol.respond_to?(:link)

  query_text = %(
    PREFIX : <http://www.w3.org/2001/02pd/rec54#>
    PREFIX contact: <http://www.w3.org/2000/10/swap/pim/contact#>
    SELECT ?full_name
    WHERE {
      <#{@sol.link.to_s.strip}> :editor/contact:fullName ?full_name
    }
  )
  @rdf.query(SPARQL.parse(query_text)).order_by(:full_name).each do |ed|
    contribs << create_editor(ed.full_name.to_s)
  end
  contribs
end

#parse_date ⇒ Array<RelatonBib::BibliographicDate>

Parse date

Returns:

  • (Array<RelatonBib::BibliographicDate>)

    date



244
245
246
247
248
# File 'lib/relaton_w3c/data_parser.rb', line 244

# Parse date
#
# @return [Array<RelatonBib::BibliographicDate>] a single "published" date,
#   or an empty array when the solution has no date
def parse_date
  if @sol.respond_to?(:date)
    [RelatonBib::BibliographicDate.new(type: "published", on: @sol.date.to_s)]
  else
    []
  end
end

#parse_docid ⇒ Array<RelatonBib::DocumentIdentifier>

Parse docidentifier

Returns:

  • (Array<RelatonBib::DocumentIdentifier>)

    docidentifier



111
112
113
114
# File 'lib/relaton_w3c/data_parser.rb', line 111

# Parse docidentifier
#
# @return [Array<RelatonBib::DocumentIdentifier>] single primary "W3C"
#   identifier built from the solution's link, or from its version_of when
#   it has no link
def parse_docid
  source = @sol.respond_to?(:link) ? @sol.link : @sol.version_of
  docid = RelatonBib::DocumentIdentifier.new(type: "W3C", id: pub_id(source), primary: true)
  [docid]
end

#parse_docstatus ⇒ RelatonBib::DocumentStatus?

Extract document status

Returns:

  • (RelatonBib::DocumentStatus, nil)

    document status



78
79
80
81
# File 'lib/relaton_w3c/data_parser.rb', line 78

# Extract document status
#
# Picks the first entry of #types_stages that is a key of STAGES.
#
# @return [RelatonBib::DocumentStatus, nil] document status, or nil when no
#   known stage code is present
def parse_docstatus
  code = types_stages&.detect { |st| STAGES.key?(st) }
  return unless code

  RelatonBib::DocumentStatus.new(stage: STAGES[code])
end

#parse_doctype ⇒ DocumentType?

Parse doctype

Returns:

  • (DocumentType, nil)

    doctype



224
225
226
227
# File 'lib/relaton_w3c/data_parser.rb', line 224

# Parse doctype
#
# Looks up the document's type code (from #type) in DOCTYPES first and
# falls back to the code derived from the link (#type_from_link).
#
# @return [DocumentType, nil] doctype, or nil when neither code is a key
#   of DOCTYPES
def parse_doctype
  # The local must not be called `type`: assigning to a local of that name
  # shadows the #type method on the right-hand side, making the first
  # lookup DOCTYPES[nil].
  doctype = DOCTYPES[type] || DOCTYPES[type_from_link]
  DocumentType.new(type: doctype) if doctype
end

#parse_editorialgroup ⇒ RelatonBib::EditorialGroup

Parse editorialgroup

Returns:

  • (RelatonBib::EditorialGroup)

    editorialgroup



431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
# File 'lib/relaton_w3c/data_parser.rb', line 431

# Parse editorialgroup
#
# Queries the home pages of the groups that delivered the document, ordered
# by home page, and maps each to a working group name through the fetcher's
# `group_names`. Home pages without a known name are reported via Util.warn
# and skipped.
#
# @return [RelatonBib::EditorialGroup, nil] editorialgroup, or nil when the
#   solution has no link
def parse_editorialgroup # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
  return unless @sol.respond_to?(:link)

  query_text = %(
    PREFIX org: <http://www.w3.org/2001/04/roadmap/org#>
    PREFIX contact: <http://www.w3.org/2000/10/swap/pim/contact#>
    SELECT ?home_page
    WHERE {
      <#{@sol.link.to_s.strip}> org:deliveredBy/contact:homePage ?home_page
    }
  )
  committees = []
  @rdf.query(SPARQL.parse(query_text)).order_by(:home_page).each do |edg|
    # group_names is looked up by home page without scheme and trailing slash
    group_path = edg.home_page.to_s.sub(/^https?:\/\//, "").sub(/\/$/, "")
    wg = @fetcher.group_names[group_path]
    unless wg
      Util.warn "Working group name not found for: `#{edg.home_page}`"
      next
    end

    workgroup = RelatonBib::WorkGroup.new(name: wg["name"])
    committees << RelatonBib::TechnicalCommittee.new(workgroup)
  end
  RelatonBib::EditorialGroup.new committees
end

#parse_formattedref ⇒ RelatonBib::FormattedRef

Parse formattedref

Returns:

  • (RelatonBib::FormattedRef)

    formattedref



388
389
390
391
392
# File 'lib/relaton_w3c/data_parser.rb', line 388

# Parse formattedref
#
# @return [RelatonBib::FormattedRef, nil] formattedref holding the PubID of
#   the solution's version_of; nil when the solution has a link
def parse_formattedref
  RelatonBib::FormattedRef.new(content: pub_id(@sol.version_of)) unless @sol.respond_to?(:link)
end

#parse_link ⇒ Array<RelatonBib::TypedUri>

Parse link

Returns:

  • (Array<RelatonBib::TypedUri>)

    link



101
102
103
104
# File 'lib/relaton_w3c/data_parser.rb', line 101

# Parse link
#
# @return [Array<RelatonBib::TypedUri>] the "src" link (the solution's link,
#   or its version_of when it has no link) followed by the editor drafts
def parse_link
  source = @sol.respond_to?(:link) ? @sol.link : @sol.version_of
  src = RelatonBib::TypedUri.new(type: "src", content: source.to_s.strip)
  [src, *editor_drafts]
end

#parse_relation ⇒ Array<RelatonBib::DocumentRelation>

Parse relation

Returns:

  • (Array<RelatonBib::DocumentRelation>)

    relation



255
256
257
258
259
260
261
# File 'lib/relaton_w3c/data_parser.rb', line 255

# Parse relation
#
# @return [Array<RelatonBib::DocumentRelation>] #relations when the solution
#   has a link; otherwise one "hasEdition" relation per document version
def parse_relation
  return relations if @sol.respond_to?(:link)

  document_versions.map do |version|
    create_relation(version.link.to_s.strip, "hasEdition")
  end
end

#parse_series ⇒ Array<RelatonBib::Series>

Parse series

Returns:

  • (Array<RelatonBib::Series>)

    series



156
157
158
159
160
161
# File 'lib/relaton_w3c/data_parser.rb', line 156

# Parse series
#
# @return [Array<RelatonBib::Series>] one series titled "W3C <type>" and
#   numbered with the document identifier; empty when #type is falsy
def parse_series
  doc_type = type
  return [] unless doc_type

  series_title = RelatonBib::TypedTitleString.new(content: "W3C #{doc_type}")
  [RelatonBib::Series.new(title: series_title, number: identifier)]
end

#parse_title ⇒ RelatonBib::TypedTitleStringCollection

Parse title

Returns:

  • (RelatonBib::TypedTitleStringCollection)

    title



88
89
90
91
92
93
94
# File 'lib/relaton_w3c/data_parser.rb', line 88

# Parse title
#
# Uses the solution's own title when it has one; otherwise takes the title
# of the document version with the greatest date string. When there are no
# document versions at all the title content is an empty string instead of
# raising NoMethodError on nil.
#
# @return [RelatonBib::TypedTitleStringCollection] title
def parse_title
  content = if @sol.respond_to?(:title)
              @sol.title.to_s
            else
              # max_by returns nil for an empty list of versions
              latest = document_versions.max_by { |dv| dv.date.to_s }
              latest ? latest.title.to_s : ""
            end
  t = RelatonBib::TypedTitleString.new content: content
  RelatonBib::TypedTitleStringCollection.new [t]
end

#pub_id(url) ⇒ String

Generate PubID

Returns:

  • (String)

    PubID



121
122
123
# File 'lib/relaton_w3c/data_parser.rb', line 121

# Generate PubID
#
# @param url [Object] URL passed on to #identifier
#
# @return [String] "W3C " followed by the identifier derived from the URL
def pub_id(url)
  id = identifier(url)
  "W3C #{id}"
end

#relation_query(predicate) ⇒ RDF::Query::Solutions

Query for relations

Parameters:

  • predicate (String)

    relation type

Returns:

  • (RDF::Query::Solutions)

    query result



308
309
310
311
312
313
314
315
316
317
318
319
320
# File 'lib/relaton_w3c/data_parser.rb', line 308

# Query for relations
#
# @param predicate [String] relation type, interpolated into the query as
#   the predicate applied to the document link
#
# @return [RDF::Query::Solutions] query result ordered by ?rel
def relation_query(predicate)
  query_text = %(
    PREFIX : <http://www.w3.org/2001/02pd/rec54#>
    PREFIX doc: <http://www.w3.org/2000/10/swap/pim/doc#>
    PREFIX mat: <http://www.w3.org/2002/05/matrix/vocab#>
    SELECT ?rel
    WHERE {
      <#{@sol.link.to_s.strip}> #{predicate} ?rel .
      FILTER ( isURI(?rel) )
    }
  )
  solutions = @rdf.query(SPARQL.parse(query_text))
  solutions.order_by(:rel)
end

#relations ⇒ Array<RelatonBib::DocumentRelation>

Create relations

Returns:

  • (Array<RelatonBib::DocumentRelation>)

    relations



268
269
270
271
272
273
274
275
276
277
278
279
280
# File 'lib/relaton_w3c/data_parser.rb', line 268

# Create relations
#
# Runs #relation_query for each supported predicate, in the order listed,
# and turns every result into a relation of the mapped type.
#
# @return [Array<RelatonBib::DocumentRelation>] relations
def relations # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
  predicates = {
    "doc:obsoletes" => { type: "obsoletes" },
    "mat:hasErrata" => { type: "updatedBy", description: "errata" },
    # "mat:hasTranslations" => "hasTranslation",
    # "mat:hasImplReport" => "hasImpReport",
    ":previousEdition" => { type: "editionOf" },
  }
  predicates.flat_map do |predicate, tp|
    relation_query(predicate).map do |r|
      create_relation(r.rel.to_s, tp[:type], tp[:description])
    end
  end
end

#type ⇒ String

Extract type

Returns:

  • (String)

    type



168
169
170
171
# File 'lib/relaton_w3c/data_parser.rb', line 168

# Extract type
#
# @return [String] first entry of #types_stages found in USED_TYPES, or
#   "technicalReport" when there is none; memoized
def type
  return @type if @type

  # there are many types, we need to find the right one
  found = types_stages&.detect { |t| USED_TYPES.include?(t) }
  @type = found || "technicalReport"
end

#type_from_link ⇒ String?

Fetch type from link

Returns:

  • (String, nil)

    type



234
235
236
237
# File 'lib/relaton_w3c/data_parser.rb', line 234

# Fetch type from link
#
# @return [String, nil] "TR" when the solution's link (or its version_of,
#   when it has no link) contains "www.w3.org/TR"; nil otherwise
def type_from_link
  source = @sol.respond_to?(:link) ? @sol.link : @sol.version_of
  source.to_s.strip[/www\.w3\.org\/(TR)/, 1]
end

#types_stages ⇒ Array<String>

Fetches types and stages

Returns:

  • (Array<String>)

    types and stages



178
179
180
181
182
183
# File 'lib/relaton_w3c/data_parser.rb', line 178

# Fetches types and stages
#
# Runs the versioned query when the solution has a link and the unversioned
# one otherwise, keeping only the part of each type after the last "#".
# The result is memoized.
#
# @return [Array<String>] types and stages
def types_stages
  return @types_stages if @types_stages

  sse = if @sol.respond_to?(:link)
          versioned_types_stages
        else
          unversioned_types_stages
        end
  @types_stages = @rdf.query(sse).map { |s| s.type.to_s.split("#").last }
end

#unversioned_types_stages ⇒ SPARQL::Algebra::Operator::Prefix

Create SPARQL query for unversioned types and stages

Returns:

  • (SPARQL::Algebra::Operator::Prefix)

    SPARQL query



206
207
208
209
210
211
212
213
214
215
216
217
# File 'lib/relaton_w3c/data_parser.rb', line 206

# Create SPARQL query for unversioned types and stages
#
# Selects the rdf:type of every ?link that is a doc:versionOf the
# solution's version_of.
#
# NOTE(review): the FILTER compares STR(?link), a string literal, with an
# IRI term; confirm it really excludes the version_of URL itself.
#
# @return [SPARQL::Algebra::Operator::Prefix] SPARQL query
def unversioned_types_stages
  version_uri = @sol.version_of
  query_text = %(
    PREFIX : <http://www.w3.org/2001/02pd/rec54#>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX doc: <http://www.w3.org/2000/10/swap/pim/doc#>
    SELECT ?type
    WHERE {
      ?link doc:versionOf <#{version_uri}>; rdf:type ?type .
      FILTER ( isURI(?link) && STR(?link) != <#{version_uri}> )
    }
  )
  SPARQL.parse(query_text)
end

#version_of ⇒ RDF::Query::Solutions

Query for document versions

Returns:

  • (RDF::Query::Solutions)

    query results



350
351
352
353
354
355
356
357
358
359
360
361
362
# File 'lib/relaton_w3c/data_parser.rb', line 350

# Query for document versions
#
# When the solution has no link it is returned as the only element;
# otherwise the doc:versionOf targets of the link are queried.
#
# NOTE(review): the FILTER compares an IRI term with str(?version_of), a
# string literal; confirm it really excludes the link itself.
#
# @return [RDF::Query::Solutions] query results
def version_of
  return [@sol] unless @sol.respond_to?(:link)

  link = @sol.link.to_s.strip
  query_text = %(
    PREFIX doc: <http://www.w3.org/2000/10/swap/pim/doc#>
    SELECT ?version_of
    WHERE {
      <#{link}> doc:versionOf ?version_of .
      FILTER ( isURI(?version_of) && <#{link}> != str(?version_of) )
    }
  )
  @rdf.query(SPARQL.parse(query_text))
end

#versioned_types_stages ⇒ SPARQL::Algebra::Operator::Prefix

Create SPARQL query for versioned types and stages

Returns:

  • (SPARQL::Algebra::Operator::Prefix)

    SPARQL query



190
191
192
193
194
195
196
197
198
199
# File 'lib/relaton_w3c/data_parser.rb', line 190

# Create SPARQL query for versioned types and stages
#
# Selects every rdf:type of the solution's link.
#
# @return [SPARQL::Algebra::Operator::Prefix] SPARQL query
def versioned_types_stages
  query_text = %(
    PREFIX : <http://www.w3.org/2001/02pd/rec54#>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    SELECT ?type
    WHERE {
      { <#{@sol.link.to_s.strip}> rdf:type ?type }
    }
  )
  SPARQL.parse(query_text)
end