Class: RelatonW3c::DataParser
- Inherits:
-
Object
- Object
- RelatonW3c::DataParser
- Defined in:
- lib/relaton_w3c/data_parser.rb
Constant Summary collapse
- USED_TYPES =
%w[WD NOTE PER PR REC CR].freeze
- DOCTYPES =
{ "TR" => "technicalReport", "NOTE" => "groupNote", }.freeze
- STAGES =
{ "RET" => "Retired", "SPSD" => "Superseded Recommendation", "OBSL" => "Obsoleted Recommendation", "WD" => "Working Draft", "CRD" => "Candidate Recommendation Draft", "CR" => "Candidate Recommendation", "PR" => "Proposed Recommendation", "PER" => "Proposed Edited Recommendation", "REC" => "Recommendation", }.freeze
Class Method Summary collapse
-
.parse(rdf, sol, fetcher) ⇒ RelatonW3c::W3cBibliographicItem?
Initialize document parser and run it.
-
.parse_identifier(url) ⇒ String
Parse identifier from URL.
Instance Method Summary collapse
- #create_editor(name) ⇒ RelatonBib::ContributionInfo
-
#create_relation(url, type, desc = nil) ⇒ RelatonBib::DocumentRelation
Create relation.
-
#document_versions ⇒ Array<RDF::Query::Solution>
Query document versions relations.
-
#editor_drafts ⇒ Array<RelatonBib::TypedUri>
Parse editor drafts links.
-
#identifier(link = nil) ⇒ String
Generate identifier from URL.
-
#initialize(rdf, sol, fetcher) ⇒ DataParser
constructor
Document parser initialization.
-
#parse ⇒ RelatonW3c::W3cBibliographicItem?
Parse document.
-
#parse_contrib ⇒ Array<RelatonBib::ContributionInfo>
Parse contributor.
-
#parse_date ⇒ Array<RelatonBib::BibliographicDate>
Parse date.
-
#parse_docid ⇒ Array<RelatonBib::DocumentIdentifier>
Parse docidentifier.
-
#parse_docstatus ⇒ RelatonBib::DocumentStatus?
Extract document status.
-
#parse_doctype ⇒ DocumentType?
Parse doctype.
-
#parse_editorialgroup ⇒ RelatonBib::EditorialGroup
Parse editorialgroup.
-
#parse_formattedref ⇒ RelatonBib::FormattedRef
Parse formattedref.
-
#parse_link ⇒ Array<RelatonBib::TypedUri>
Parse link.
-
#parse_relation ⇒ Array<RelatonBib::DocumentRelation>
Parse relation.
-
#parse_series ⇒ Array<RelatonBib::Series>
Parse series.
-
#parse_title ⇒ RelatonBib::TypedTitleStringCollection
Parse title.
-
#pub_id(url) ⇒ String
Generate PubID.
-
#relation_query(predicate) ⇒ RDF::Query::Solutions
Query for relations.
-
#relations ⇒ Array<RelatonBib::DocumentRelation>
Create relations.
-
#type ⇒ String
Extract type.
-
#type_from_link ⇒ String?
Fetch type from link.
-
#types_stages ⇒ Array<String>
Fetches types and stages.
-
#unversioned_types_stages ⇒ SPARQL::Algebra::Operator::Prefix
Create SPARQL query for unversioned types and stages.
-
#version_of ⇒ RDF::Query::Solutions
Query for document versions.
-
#versioned_types_stages ⇒ SPARQL::Algebra::Operator::Prefix
Create SPARQL query for versioned types and stages.
Constructor Details
#initialize(rdf, sol, fetcher) ⇒ DataParser
Document parser initialization
28 29 30 31 32 |
# File 'lib/relaton_w3c/data_parser.rb', line 28
#
# Store the three collaborators the parsing methods read from.
#
# @param rdf [Object] object queried through `#query` by the parsing methods
#   (presumably an RDF graph or repository -- confirm against the caller)
# @param sol [Object] query solution describing the document; the parsing
#   methods probe it with `respond_to?(:link)` and read `link` or `version_of`
# @param fetcher [Object] object whose `group_names` is consulted when the
#   editorial group is resolved
def initialize(rdf, sol, fetcher)
  @rdf = rdf
  @sol = sol
  @fetcher = fetcher
end
Class Method Details
.parse(rdf, sol, fetcher) ⇒ RelatonW3c::W3cBibliographicItem?
Initialize document parser and run it
42 43 44 |
# File 'lib/relaton_w3c/data_parser.rb', line 42
#
# Convenience entry point: build a parser and run it in one call.
#
# @param rdf [Object] forwarded unchanged to the constructor
# @param sol [Object] forwarded unchanged to the constructor
# @param fetcher [Object] forwarded unchanged to the constructor
# @return [RelatonW3c::W3cBibliographicItem, nil] whatever the instance-level
#   `#parse` returns
def self.parse(rdf, sol, fetcher)
  parser = new(rdf, sol, fetcher)
  parser.parse
end
.parse_identifier(url) ⇒ String
Parse identifier from URL
144 145 146 147 148 149 |
# File 'lib/relaton_w3c/data_parser.rb', line 144
#
# Parse identifier from URL.
#
# The URL is first searched for a path segment made of word characters joined
# by `-` or `+` (optionally followed by one more `/segment`). When no such
# segment exists, the text after the last `/` is used instead.
#
# @param url [#to_s] document URL (anything convertible with `to_s`)
# @return [String] the extracted identifier; an empty string when the URL
#   has no usable segment (e.g. `""` or `"/"`), never `nil`
def self.parse_identifier(url)
  path = url.to_s
  found = /.+\/(\w+(?:[-+][\w.]+)+(?:\/\w+)?)/.match(path)
  return found[1] if found

  # `split("/").last` is nil for "" and "/"; coerce so callers always get a String.
  path.split("/").last.to_s
end
Instance Method Details
#create_editor(name) ⇒ RelatonBib::ContributionInfo
419 420 421 422 423 424 |
# File 'lib/relaton_w3c/data_parser.rb', line 419
#
# Wrap an editor's name in a contribution entry with the "editor" role.
#
# @param name [String] full name of the editor
# @return [RelatonBib::ContributionInfo] person contribution with role "editor"
def create_editor(name)
  complete_name = RelatonBib::LocalizedString.new(name, "en", "Latn")
  full_name = RelatonBib::FullName.new(completename: complete_name)
  person = RelatonBib::Person.new(name: full_name)
  RelatonBib::ContributionInfo.new(entity: person, role: [{ type: "editor" }])
end
#create_relation(url, type, desc = nil) ⇒ RelatonBib::DocumentRelation
Create relation
373 374 375 376 377 378 379 380 381 |
# File 'lib/relaton_w3c/data_parser.rb', line 373
#
# Create a relation to the document found at +url+.
#
# The related item is a minimal bibliographic item carrying only a formatted
# reference, a primary "W3C" document identifier and a "src" link.
#
# @param url [String] URL of the related document
# @param type [String] relation type
# @param desc [String, nil] optional relation description
# @return [RelatonBib::DocumentRelation]
def create_relation(url, type, desc = nil)
  pubid = pub_id(url)
  bibitem = W3cBibliographicItem.new(
    formattedref: RelatonBib::FormattedRef.new(content: pubid),
    docid: [RelatonBib::DocumentIdentifier.new(type: "W3C", id: pubid, primary: true)],
    link: [RelatonBib::TypedUri.new(type: "src", content: url)]
  )
  # The description is only built when one was supplied; otherwise nil is passed.
  description = desc ? RelatonBib::FormattedString.new(content: desc) : nil
  RelatonBib::DocumentRelation.new(type: type, bibitem: bibitem, description: description)
end
#document_versions ⇒ Array<RDF::Query::Solution>
Query document versions relations
327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 |
# File 'lib/relaton_w3c/data_parser.rb', line 327
#
# Query document versions relations.
#
# For every solution returned by `#version_of`, runs a query selecting
# `?link`, `?title` and `?date` of the resources that are a `doc:versionOf`
# that solution's `version_of`, and gathers all rows into one flat array.
# The result is memoized in `@document_versions`.
#
# @return [Array<RDF::Query::Solution>] rows exposing `link`, `title`, `date`
def document_versions # rubocop:disable Metrics/MethodLength
  @document_versions ||= version_of.flat_map do |s|
    sse = SPARQL.parse(%( PREFIX : <http://www.w3.org/2001/02pd/rec54#> PREFIX dc: <http://purl.org/dc/elements/1.1/> PREFIX doc: <http://www.w3.org/2000/10/swap/pim/doc#> PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> SELECT ?link ?title ?date WHERE { ?link doc:versionOf <#{s.version_of}> ; dc:title ?title ; dc:date ?date . } ))
    @rdf.query(sse).to_a
  end
end
#editor_drafts ⇒ Array<RelatonBib::TypedUri>
Parse editor drafts links
287 288 289 290 291 292 293 294 295 296 297 298 299 |
# File 'lib/relaton_w3c/data_parser.rb', line 287
#
# Parse editor drafts links.
#
# Only solutions that respond to `link` are queried; for those, every
# `:ED` object of the document link becomes a "current" URI.
#
# @return [Array<RelatonBib::TypedUri>] empty when the solution has no `link`
def editor_drafts # rubocop:disable Metrics/MethodLength
  return [] unless @sol.respond_to?(:link)

  query = %( PREFIX : <http://www.w3.org/2001/02pd/rec54#> PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> SELECT ?latest WHERE { <#{@sol.link.to_s.strip}> :ED ?latest . } )
  drafts = @rdf.query(SPARQL.parse(query))
  drafts.map do |draft|
    RelatonBib::TypedUri.new(type: "current", content: draft.latest.to_s.strip)
  end
end
#identifier(link = nil) ⇒ String
Generate identifier from URL
132 133 134 135 |
# File 'lib/relaton_w3c/data_parser.rb', line 132
#
# Generate identifier from URL.
#
# @param link [#to_s, nil] URL to derive the identifier from; when omitted,
#   the solution's `link` is used if it responds to it, otherwise its
#   `version_of`
# @return [String] result of the class-level `parse_identifier` for the
#   stripped URL
def identifier(link = nil)
  link ||= @sol.respond_to?(:link) ? @sol.link : @sol.version_of
  self.class.parse_identifier(link.to_s.strip)
end
#parse ⇒ RelatonW3c::W3cBibliographicItem?
Parse document
51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
# File 'lib/relaton_w3c/data_parser.rb', line 51
#
# Parse document.
#
# A solution that responds to `link` is skipped (nil is returned) when none
# of its types/stages is listed in USED_TYPES. Solutions without `link` are
# always converted.
#
# @return [RelatonW3c::W3cBibliographicItem, nil]
def parse # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
  return if @sol.respond_to?(:link) && types_stages.none? { |ts| USED_TYPES.include?(ts) }

  RelatonW3c::W3cBibliographicItem.new(
    type: "standard",
    doctype: parse_doctype,
    language: ["en"],
    script: ["Latn"],
    docstatus: parse_docstatus,
    title: parse_title,
    link: parse_link,
    docid: parse_docid,
    formattedref: parse_formattedref,
    docnumber: identifier,
    series: parse_series,
    date: parse_date,
    relation: parse_relation,
    contributor: parse_contrib,
    editorialgroup: parse_editorialgroup,
  )
end
#parse_contrib ⇒ Array<RelatonBib::ContributionInfo>
Parse contributor
399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 |
# File 'lib/relaton_w3c/data_parser.rb', line 399
#
# Parse contributor.
#
# The W3C publisher entry always comes first. When the solution responds to
# `link`, one editor entry per `:editor/contact:fullName` value is appended,
# in the order produced by `order_by(:full_name)`.
#
# @return [Array<RelatonBib::ContributionInfo>]
def parse_contrib # rubocop:disable Metrics/MethodLength
  w3c = RelatonBib::Organization.new(
    name: "World Wide Web Consortium", abbreviation: "W3C", url: "https://www.w3.org/"
  )
  contribs = [RelatonBib::ContributionInfo.new(entity: w3c, role: [{ type: "publisher" }])]
  return contribs unless @sol.respond_to?(:link)

  query = %( PREFIX : <http://www.w3.org/2001/02pd/rec54#> PREFIX contact: <http://www.w3.org/2000/10/swap/pim/contact#> SELECT ?full_name WHERE { <#{@sol.link.to_s.strip}> :editor/contact:fullName ?full_name } )
  editors = @rdf.query(SPARQL.parse(query)).order_by(:full_name).map do |ed|
    create_editor(ed.full_name.to_s)
  end
  contribs.concat(editors)
end
#parse_date ⇒ Array<RelatonBib::BibliographicDate>
Parse date
244 245 246 247 248 |
# File 'lib/relaton_w3c/data_parser.rb', line 244
#
# Parse date.
#
# @return [Array<RelatonBib::BibliographicDate>] a single "published" date
#   taken from the solution's `date`, or an empty array when the solution
#   does not respond to `date`
def parse_date
  if @sol.respond_to?(:date)
    [RelatonBib::BibliographicDate.new(type: "published", on: @sol.date.to_s)]
  else
    []
  end
end
#parse_docid ⇒ Array<RelatonBib::DocumentIdentifier>
Parse docidentifier
111 112 113 114 |
# File 'lib/relaton_w3c/data_parser.rb', line 111
#
# Parse docidentifier.
#
# The identifier is derived from the solution's `link` when it responds to
# it, otherwise from its `version_of`.
#
# @return [Array<RelatonBib::DocumentIdentifier>] one primary "W3C" identifier
def parse_docid
  source = @sol.respond_to?(:link) ? @sol.link : @sol.version_of
  [RelatonBib::DocumentIdentifier.new(type: "W3C", id: pub_id(source), primary: true)]
end
#parse_docstatus ⇒ RelatonBib::DocumentStatus?
Extract documetn status
78 79 80 81 |
# File 'lib/relaton_w3c/data_parser.rb', line 78
#
# Extract document status.
#
# Picks the first entry of `types_stages` that is a key of STAGES and maps
# it to its human-readable stage name.
#
# @return [RelatonBib::DocumentStatus, nil] nil when no known stage is found
def parse_docstatus
  code = types_stages&.find { |st| STAGES.key?(st) }
  return unless code

  RelatonBib::DocumentStatus.new(stage: STAGES[code])
end
#parse_doctype ⇒ DocumentType?
Parse doctype
224 225 226 227 |
# File 'lib/relaton_w3c/data_parser.rb', line 224
#
# Parse doctype.
#
# The document type code from `#type` is looked up in DOCTYPES first; when it
# has no entry there, the code extracted from the link (`#type_from_link`) is
# tried instead.
#
# @return [DocumentType, nil] nil when neither lookup yields a doctype
def parse_doctype
  # The local must not be named `type`: a local of that name shadows the
  # `#type` method on the right-hand side, which would make the first lookup
  # always `DOCTYPES[nil]`.
  doctype = DOCTYPES[type] || DOCTYPES[type_from_link]
  DocumentType.new(type: doctype) if doctype
end
#parse_editorialgroup ⇒ RelatonBib::EditorialGroup
Parse editorialgroup
431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 |
# File 'lib/relaton_w3c/data_parser.rb', line 431
#
# Parse editorialgroup.
#
# Looks up the home pages reached through `org:deliveredBy/contact:homePage`
# from the document link, normalizes each one (scheme and trailing slash
# removed) and resolves it through the fetcher's `group_names`. Home pages
# without a known group name are reported with `Util.warn` and skipped.
#
# @return [RelatonBib::EditorialGroup, nil] nil when the solution does not
#   respond to `link`
def parse_editorialgroup # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
  return unless @sol.respond_to?(:link)

  query = %( PREFIX org: <http://www.w3.org/2001/04/roadmap/org#> PREFIX contact: <http://www.w3.org/2000/10/swap/pim/contact#> SELECT ?home_page WHERE { <#{@sol.link.to_s.strip}> org:deliveredBy/contact:homePage ?home_page } )
  groups = @rdf.query(SPARQL.parse(query)).order_by(:home_page)
  resolved = groups.map do |edg|
    group_path = edg.home_page.to_s.sub(/^https?:\/\//, "").sub(/\/$/, "")
    wg = @fetcher.group_names[group_path]
    if wg
      RelatonBib::TechnicalCommittee.new(RelatonBib::WorkGroup.new(name: wg["name"]))
    else
      Util.warn "Working group name not found for: `#{edg.home_page}`"
      nil # unresolved groups are dropped below
    end
  end
  RelatonBib::EditorialGroup.new(resolved.compact)
end
#parse_formattedref ⇒ RelatonBib::FormattedRef
Parse formattedref
388 389 390 391 392 |
# File 'lib/relaton_w3c/data_parser.rb', line 388
#
# Parse formattedref.
#
# A formatted reference is produced only for solutions that do NOT respond
# to `link`; it is built from the solution's `version_of`.
#
# @return [RelatonBib::FormattedRef, nil] nil when the solution responds to `link`
def parse_formattedref
  RelatonBib::FormattedRef.new(content: pub_id(@sol.version_of)) unless @sol.respond_to?(:link)
end
#parse_link ⇒ Array<RelatonBib::TypedUri>
Parse link
101 102 103 104 |
# File 'lib/relaton_w3c/data_parser.rb', line 101
#
# Parse link.
#
# @return [Array<RelatonBib::TypedUri>] the "src" link (from the solution's
#   `link`, or `version_of` when it has no `link`) followed by the editor
#   draft links
def parse_link
  source = @sol.respond_to?(:link) ? @sol.link : @sol.version_of
  src = RelatonBib::TypedUri.new(type: "src", content: source.to_s.strip)
  [src, *editor_drafts]
end
#parse_relation ⇒ Array<RelatonBib::DocumentRelation>
Parse relation
255 256 257 258 259 260 261 |
# File 'lib/relaton_w3c/data_parser.rb', line 255
#
# Parse relation.
#
# Solutions that respond to `link` get the relations built by `#relations`;
# all others get one "hasEdition" relation per entry of `#document_versions`.
#
# @return [Array<RelatonBib::DocumentRelation>]
def parse_relation
  return relations if @sol.respond_to?(:link)

  document_versions.map do |version|
    create_relation(version.link.to_s.strip, "hasEdition")
  end
end
#parse_series ⇒ Array<RelatonBib::Series>
Parse series
156 157 158 159 160 161 |
# File 'lib/relaton_w3c/data_parser.rb', line 156
#
# Parse series.
#
# @return [Array<RelatonBib::Series>] one series titled "W3C <type>" and
#   numbered with the document identifier; empty when `#type` is falsy
def parse_series
  series_type = type
  return [] unless series_type

  series_title = RelatonBib::TypedTitleString.new(content: "W3C #{series_type}")
  [RelatonBib::Series.new(title: series_title, number: identifier)]
end
#parse_title ⇒ RelatonBib::TypedTitleStringCollection
Parse title
88 89 90 91 92 93 94 |
# File 'lib/relaton_w3c/data_parser.rb', line 88
#
# Parse title.
#
# Uses the solution's own `title` when it responds to it. Otherwise the title
# of the document version with the greatest `date` string is used.
#
# NOTE(review): when the solution has no `title` and `#document_versions` is
# empty, `max_by` returns nil and the `title` call raises NoMethodError.
#
# @return [RelatonBib::TypedTitleStringCollection]
def parse_title
  content =
    if @sol.respond_to?(:title)
      @sol.title.to_s
    else
      latest = document_versions.max_by { |dv| dv.date.to_s }
      latest.title.to_s
    end
  title = RelatonBib::TypedTitleString.new(content: content)
  RelatonBib::TypedTitleStringCollection.new([title])
end
#pub_id(url) ⇒ String
Generate PubID
121 122 123 |
# File 'lib/relaton_w3c/data_parser.rb', line 121
#
# Generate PubID.
#
# @param url [#to_s] document URL
# @return [String] the identifier derived from +url+, prefixed with "W3C "
def pub_id(url)
  id = identifier(url)
  "W3C #{id}"
end
#relation_query(predicate) ⇒ RDF::Query::Solutions
Query for relations
308 309 310 311 312 313 314 315 316 317 318 319 320 |
# File 'lib/relaton_w3c/data_parser.rb', line 308
#
# Query for relations.
#
# Selects every URI object (`?rel`) reachable from the document link through
# +predicate+; the result is passed through `order_by(:rel)`.
#
# @param predicate [String] prefixed predicate spliced verbatim into the
#   query (the `:`, `doc:` and `mat:` prefixes are declared)
# @return [RDF::Query::Solutions]
def relation_query(predicate)
  query = %( PREFIX : <http://www.w3.org/2001/02pd/rec54#> PREFIX doc: <http://www.w3.org/2000/10/swap/pim/doc#> PREFIX mat: <http://www.w3.org/2002/05/matrix/vocab#> SELECT ?rel WHERE { <#{@sol.link.to_s.strip}> #{predicate} ?rel . FILTER ( isURI(?rel) ) } )
  found = @rdf.query(SPARQL.parse(query))
  found.order_by(:rel)
end
#relations ⇒ Array<RelatonBib::DocumentRelation>
Create relations
268 269 270 271 272 273 274 275 276 277 278 279 280 |
# File 'lib/relaton_w3c/data_parser.rb', line 268
#
# Create relations.
#
# Each supported predicate is queried in turn (in the order listed below) and
# every hit is turned into a relation of the mapped type, with the mapped
# description when one is given.
#
# @return [Array<RelatonBib::DocumentRelation>]
def relations # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
  predicates = {
    "doc:obsoletes" => { type: "obsoletes" },
    "mat:hasErrata" => { type: "updatedBy", description: "errata" },
    # "mat:hasTranslations" => "hasTranslation",
    # "mat:hasImplReport" => "hasImpReport",
    ":previousEdition" => { type: "editionOf" },
  }
  predicates.flat_map do |predicate, tp|
    relation_query(predicate).map do |r|
      create_relation(r.rel.to_s, tp[:type], tp[:description])
    end
  end
end
#type ⇒ String
Extract type
168 169 170 171 |
# File 'lib/relaton_w3c/data_parser.rb', line 168
#
# Extract type.
#
# @return [String] the first entry of `types_stages` that is listed in
#   USED_TYPES, or "technicalReport" when there is none; memoized in `@type`
def type
  # there are many types, we need to find the right one
  @type ||= begin
    used = types_stages&.find { |candidate| USED_TYPES.include?(candidate) }
    used || "technicalReport"
  end
end
#type_from_link ⇒ String?
Fetch type from link
234 235 236 237 |
# File 'lib/relaton_w3c/data_parser.rb', line 234
#
# Fetch type from link.
#
# @return [String, nil] "TR" when the document URL (the solution's `link`, or
#   `version_of` when it has no `link`) contains `www.w3.org/TR`, nil otherwise
def type_from_link
  link = @sol.respond_to?(:link) ? @sol.link : @sol.version_of
  # String#[] with a capture index yields the captured text, or nil on no match.
  link.to_s.strip[/www\.w3\.org\/(TR)/, 1]
end
#types_stages ⇒ Array<String>
Fetches types and stages
178 179 180 181 182 183 |
# File 'lib/relaton_w3c/data_parser.rb', line 178
#
# Fetches types and stages.
#
# Runs the versioned query when the solution responds to `link`, the
# unversioned one otherwise, and keeps the part after the last `#` of each
# `?type` value. The result is memoized in `@types_stages`.
#
# @return [Array<String>]
def types_stages
  return @types_stages if @types_stages

  query = @sol.respond_to?(:link) ? versioned_types_stages : unversioned_types_stages
  @types_stages = @rdf.query(query).map { |s| s.type.to_s.split("#").last }
end
#unversioned_types_stages ⇒ SPARQL::Algebra::Operator::Prefix
Create SPARQL query for unversioned types and stages
206 207 208 209 210 211 212 213 214 215 216 217 |
# File 'lib/relaton_w3c/data_parser.rb', line 206
#
# Create SPARQL query for unversioned types and stages.
#
# The query selects the `rdf:type` of every resource that is a
# `doc:versionOf` the solution's `version_of`.
#
# @return [SPARQL::Algebra::Operator::Prefix] the parsed query
def unversioned_types_stages
  query = %( PREFIX : <http://www.w3.org/2001/02pd/rec54#> PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> PREFIX doc: <http://www.w3.org/2000/10/swap/pim/doc#> SELECT ?type WHERE { ?link doc:versionOf <#{@sol.version_of}>; rdf:type ?type . FILTER ( isURI(?link) && STR(?link) != <#{@sol.version_of}> ) } )
  SPARQL.parse(query)
end
#version_of ⇒ RDF::Query::Solutions
Query for document versions
350 351 352 353 354 355 356 357 358 359 360 361 362 |
# File 'lib/relaton_w3c/data_parser.rb', line 350
#
# Query for document versions.
#
# A solution without `link` is returned as-is, wrapped in an array (callers
# read its `version_of`). Otherwise the `doc:versionOf` URI objects of the
# document link are queried.
#
# @return [RDF::Query::Solutions, Array] solutions exposing `version_of`
def version_of
  return [@sol] unless @sol.respond_to?(:link)

  query = %( PREFIX doc: <http://www.w3.org/2000/10/swap/pim/doc#> SELECT ?version_of WHERE { <#{@sol.link.to_s.strip}> doc:versionOf ?version_of . FILTER ( isURI(?version_of) && <#{@sol.link.to_s.strip}> != str(?version_of) ) } )
  @rdf.query(SPARQL.parse(query))
end
#versioned_types_stages ⇒ SPARQL::Algebra::Operator::Prefix
Create SPARQL query for versioned types and stages
190 191 192 193 194 195 196 197 198 199 |
# File 'lib/relaton_w3c/data_parser.rb', line 190
#
# Create SPARQL query for versioned types and stages.
#
# The query selects every `rdf:type` of the solution's (stripped) `link`.
#
# @return [SPARQL::Algebra::Operator::Prefix] the parsed query
def versioned_types_stages
  query = %( PREFIX : <http://www.w3.org/2001/02pd/rec54#> PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> SELECT ?type WHERE { { <#{@sol.link.to_s.strip}> rdf:type ?type } } )
  SPARQL.parse(query)
end