Module: Bolognese::Readers::SchemaOrgReader

Included in:
MetadataUtils
Defined in:
lib/bolognese/readers/schema_org_reader.rb

Constant Summary collapse

# Maps schema.org relation keys to relation type names.
# Frozen so the shared lookup table cannot be mutated at runtime.
#
# NOTE(review): the version-related keys here are "isPredecessor" and
# "isSuccessor", while schema_org_is_previous_version_of and
# schema_org_is_new_version_of look up "PredecessorOf" / "SuccessorOf".
# Confirm which spelling is intended.
SO_TO_DC_RELATION_TYPES = {
  "citation" => "References",
  "sameAs" => "IsIdenticalTo",
  "isPartOf" => "IsPartOf",
  "hasPart" => "HasPart",
  "isPredecessor" => "IsPreviousVersionOf",
  "isSuccessor" => "IsNewVersionOf"
}.freeze

Instance Method Summary collapse

Instance Method Details

#get_schema_org(id: nil, **options) ⇒ Object



15
16
17
18
19
20
21
22
23
24
25
26
27
28
# File 'lib/bolognese/readers/schema_org_reader.rb', line 15

# Fetches the document at +id+ and extracts the text of its first
# <script type="application/ld+json"> element.
#
# @param id [String, nil] identifier to fetch; normalized with normalize_id first.
# @param options [Hash] accepted but not read by this method.
# @return [Hash] { "string" => nil, "state" => "not_found" } when +id+ is not
#   present; otherwise { "string" => <script text, or nil if none was found> }.
def get_schema_org(id: nil, **options)
  not_found = { "string" => nil, "state" => "not_found" }
  return not_found unless id.present?

  url = normalize_id(id)
  payload = Maremma.get(url).body.fetch("data", nil)
  document = Nokogiri::XML(payload, nil, 'UTF-8')

  # workaround for xhtml documents: scan every <script> element and pick the
  # JSON-LD one by its type attribute
  json_ld = document.css("script").find do |script|
    script["type"] == "application/ld+json"
  end
  json_ld = json_ld.text if json_ld.present?

  { "string" => json_ld }
end

#read_schema_org(string: nil, **options) ⇒ Object



30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# File 'lib/bolognese/readers/schema_org_reader.rb', line 30

# Converts a schema.org JSON-LD string into the attribute hash built at the
# end of this method.
#
# @param string [String, nil] JSON-LD source. When blank, linting and parsing
#   are skipped and the attributes are derived from an empty hash, which
#   yields "state" => "not_found".
# @param options [Hash] accepted but not read by this method.
# @return [Hash] { "errors" => ... } when jsonlint reports anything for
#   +string+, otherwise the attribute hash.
def read_schema_org(string: nil, **options)
  if string.present?
    errors = jsonlint(string)
    return { "errors" => errors } if errors.present?
  end

  meta = string.present? ? Maremma.from_json(string) : {}

  # The first identifier is reported as "identifier"; any further ones become
  # "alternate_identifier" entries.
  identifier = Array.wrap(meta.fetch("identifier", nil))
  if identifier.length > 1
    alternate_identifier = identifier[1..-1].map do |r|
      case r
      when String then { "type" => "URL", "name" => r }
      when Hash then { "type" => r["propertyID"], "name" => r["value"] }
      end
    end.compact.unwrap # compact: entries that are neither String nor Hash map to nil
  else
    alternate_identifier = nil
  end
  identifier = identifier.first

  id = normalize_id(meta.fetch("@id", nil) || meta.fetch("identifier", nil))
  type = meta.fetch("@type", nil) && meta.fetch("@type").camelcase
  resource_type_general = Bolognese::Utils::SO_TO_DC_TRANSLATIONS[type]
  authors = meta.fetch("author", nil) || meta.fetch("creator", nil)
  author = get_authors(from_schema_org(Array.wrap(authors)))
  editor = get_authors(from_schema_org(Array.wrap(meta.fetch("editor", nil))))
  publisher = parse_attributes(meta.fetch("publisher", nil), content: "name", first: true)

  # Only data catalogs that carry a url are kept; they serve as the fallback
  # for "is_part_of" when no "isPartOf" relation is returned.
  included_in_data_catalog = from_schema_org(Array.wrap(meta.fetch("includedInDataCatalog", nil)))
  included_in_data_catalog = Array.wrap(included_in_data_catalog)
    .select { |dc| dc["url"].present? }
    .map { |dc| { "title" => dc["name"], "url" => dc["url"] } }
    .unwrap
  is_part_of = schema_org_is_part_of(meta) || included_in_data_catalog

  license = {
    "id" => parse_attributes(meta.fetch("license", nil), content: "id", first: true),
    "name" => parse_attributes(meta.fetch("license", nil), content: "name", first: true)
  }

  funding = from_schema_org(Array.wrap(meta.fetch("funding", nil)))
  date_published = meta.fetch("datePublished", nil)
  state = meta.present? ? "findable" : "not_found"

  # The container title is read from a different key depending on the type.
  ct = (type == "Dataset") ? "includedInDataCatalog" : "Periodical"
  container_title = parse_attributes(from_schema_org(meta.fetch(ct, nil)), content: "name", first: true)

  # schema.org spells the property "contentSize"; the misspelled "contenSize"
  # key that this method used to read is kept as a fallback.
  content_size = meta.fetch("contentSize", nil) || meta.fetch("contenSize", nil)

  # keywords may be one comma-separated string or a list of such strings
  # (repeated property); each entry is split on ", ".
  keywords = Array.wrap(meta.fetch("keywords", nil)).flat_map { |k| k.to_s.split(", ") }

  { "id" => id,
    "type" => type,
    "additional_type" => meta.fetch("additionalType", nil),
    "citeproc_type" => Bolognese::Utils::SO_TO_CP_TRANSLATIONS[type] || "article-journal",
    "bibtex_type" => Bolognese::Utils::SO_TO_BIB_TRANSLATIONS[type] || "misc",
    "ris_type" => Bolognese::Utils::SO_TO_RIS_TRANSLATIONS[resource_type_general.to_s.dasherize] || "GEN",
    "resource_type_general" => resource_type_general,
    "doi" => validate_doi(id),
    "identifier" => identifier,
    "alternate_identifier" => alternate_identifier,
    "b_url" => normalize_id(meta.fetch("url", nil)),
    "content_url" => Array.wrap(meta.fetch("contentUrl", nil)).unwrap,
    "content_size" => content_size,
    "content_format" => Array.wrap(meta.fetch("encodingFormat", nil) || meta.fetch("fileFormat", nil)).unwrap,
    "title" => meta.fetch("name", nil),
    "author" => author,
    "editor" => editor,
    "publisher" => publisher,
    "service_provider" => parse_attributes(meta.fetch("provider", nil), content: "name", first: true),
    "container_title" => container_title,
    "is_identical_to" => schema_org_is_identical_to(meta),
    "is_part_of" => is_part_of,
    "has_part" => schema_org_has_part(meta),
    "references" => schema_org_references(meta),
    "is_referenced_by" => schema_org_is_referenced_by(meta),
    "is_supplement_to" => schema_org_is_supplement_to(meta),
    "is_supplemented_by" => schema_org_is_supplemented_by(meta),
    "date_created" => meta.fetch("dateCreated", nil),
    "date_published" => date_published,
    "date_modified" => meta.fetch("dateModified", nil),
    "description" => meta.fetch("description", nil).present? ? { "text" => sanitize(meta.fetch("description")) } : nil,
    "license" => license,
    "b_version" => meta.fetch("version", nil),
    "keywords" => keywords,
    "state" => state,
    "schema_version" => meta.fetch("schemaVersion", nil),
    "funding" => funding
  }
end

#schema_org_has_part(meta) ⇒ Object



135
136
137
# File 'lib/bolognese/readers/schema_org_reader.rb', line 135

# Looks up the "hasPart" entry of +meta+ via schema_org_related_identifier.
def schema_org_has_part(meta)
  relation = "hasPart"
  schema_org_related_identifier(meta, relation_type: relation)
end

#schema_org_is_identical_to(meta) ⇒ Object



127
128
129
# File 'lib/bolognese/readers/schema_org_reader.rb', line 127

# Looks up the "sameAs" entry of +meta+ via schema_org_related_identifier.
def schema_org_is_identical_to(meta)
  relation = "sameAs"
  schema_org_related_identifier(meta, relation_type: relation)
end

#schema_org_is_new_version_of(meta) ⇒ Object



143
144
145
# File 'lib/bolognese/readers/schema_org_reader.rb', line 143

# Looks up the "SuccessorOf" entry of +meta+ via schema_org_related_identifier.
#
# NOTE(review): unlike the other relation keys in this module ("hasPart",
# "isPartOf", ...) this key starts with a capital letter, and it differs from
# the "isSuccessor" key in SO_TO_DC_RELATION_TYPES. Confirm the intended key.
def schema_org_is_new_version_of(meta)
  relation = "SuccessorOf"
  schema_org_related_identifier(meta, relation_type: relation)
end

#schema_org_is_part_of(meta) ⇒ Object



131
132
133
# File 'lib/bolognese/readers/schema_org_reader.rb', line 131

# Looks up the "isPartOf" entry of +meta+ via schema_org_related_identifier.
def schema_org_is_part_of(meta)
  relation = "isPartOf"
  schema_org_related_identifier(meta, relation_type: relation)
end

#schema_org_is_previous_version_of(meta) ⇒ Object



139
140
141
# File 'lib/bolognese/readers/schema_org_reader.rb', line 139

# Looks up the "PredecessorOf" entry of +meta+ via schema_org_related_identifier.
#
# NOTE(review): unlike the other relation keys in this module ("hasPart",
# "isPartOf", ...) this key starts with a capital letter, and it differs from
# the "isPredecessor" key in SO_TO_DC_RELATION_TYPES. Confirm the intended key.
def schema_org_is_previous_version_of(meta)
  relation = "PredecessorOf"
  schema_org_related_identifier(meta, relation_type: relation)
end

#schema_org_is_referenced_by(meta) ⇒ Object



151
152
153
# File 'lib/bolognese/readers/schema_org_reader.rb', line 151

# Looks up the "citation" entry under "@reverse" in +meta+ via
# schema_org_reverse_related_identifier.
def schema_org_is_referenced_by(meta)
  relation = "citation"
  schema_org_reverse_related_identifier(meta, relation_type: relation)
end

#schema_org_is_supplement_to(meta) ⇒ Object



155
156
157
# File 'lib/bolognese/readers/schema_org_reader.rb', line 155

# Looks up the "isBasedOn" entry under "@reverse" in +meta+ via
# schema_org_reverse_related_identifier.
def schema_org_is_supplement_to(meta)
  relation = "isBasedOn"
  schema_org_reverse_related_identifier(meta, relation_type: relation)
end

#schema_org_is_supplemented_by(meta) ⇒ Object



159
160
161
# File 'lib/bolognese/readers/schema_org_reader.rb', line 159

# Looks up the "isBasedOn" entry of +meta+ via schema_org_related_identifier.
def schema_org_is_supplemented_by(meta)
  relation = "isBasedOn"
  schema_org_related_identifier(meta, relation_type: relation)
end

#schema_org_references(meta) ⇒ Object



147
148
149
# File 'lib/bolognese/readers/schema_org_reader.rb', line 147

# Looks up the "citation" entry of +meta+ via schema_org_related_identifier.
def schema_org_references(meta)
  relation = "citation"
  schema_org_related_identifier(meta, relation_type: relation)
end


119
120
121
# File 'lib/bolognese/readers/schema_org_reader.rb', line 119

# Reads the +relation_type+ key from +meta+ (nil when the key is absent) and
# hands the value to normalize_ids.
def schema_org_related_identifier(meta, relation_type: nil)
  related = meta.fetch(relation_type, nil)
  normalize_ids(ids: related)
end


123
124
125
# File 'lib/bolognese/readers/schema_org_reader.rb', line 123

# Reads +relation_type+ from the "@reverse" entry of +meta+ (nil when either
# level is absent) and hands the value to normalize_ids.
def schema_org_reverse_related_identifier(meta, relation_type: nil)
  reversed = meta.dig("@reverse", relation_type)
  normalize_ids(ids: reversed)
end