Module: Bolognese::Readers::SchemaOrgReader

Included in:
MetadataUtils
Defined in:
lib/bolognese/readers/schema_org_reader.rb

Constant Summary

# Lookup table from schema.org relation property names (keys) to relation type
# names (values). Frozen so the shared table cannot be mutated at runtime.
# NOTE(review): "DC" presumably stands for DataCite -- confirm.
SO_TO_DC_RELATION_TYPES = {
  "citation" => "References",
  "sameAs" => "IsIdenticalTo",
  "isPartOf" => "IsPartOf",
  "hasPart" => "HasPart",
  "isPredecessor" => "IsPreviousVersionOf",
  "isSuccessor" => "IsNewVersionOf"
}.freeze

Instance Method Summary

Instance Method Details

#get_schema_org(id: nil, **options) ⇒ Object



15
16
17
18
19
20
21
22
23
24
25
26
27
28
# File 'lib/bolognese/readers/schema_org_reader.rb', line 15

# Fetches the document at +id+ and extracts the text of its first
# <script type="application/ld+json"> element.
#
# @param id [String, nil] identifier to fetch; passed through normalize_id first
# @param options [Hash] accepted for interface compatibility; not read here
# @return [Hash] { "string" => nil, "state" => "not_found" } when id is blank,
#   otherwise { "string" => <script text, or nil when no such script exists> }
def get_schema_org(id: nil, **options)
  return { "string" => nil, "state" => "not_found" } unless id.present?

  response = Maremma.get(normalize_id(id))
  markup = response.body.fetch("data", nil)
  doc = Nokogiri::XML(markup, nil, 'UTF-8')

  # workaround for xhtml documents: scan every <script> element and compare
  # its type attribute rather than selecting on the attribute directly
  json_ld = doc.css("script").find { |script| script["type"] == "application/ld+json" }
  string = json_ld.present? ? json_ld.text : json_ld

  { "string" => string }
end

#read_schema_org(string: nil, **options) ⇒ Object



30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# File 'lib/bolognese/readers/schema_org_reader.rb', line 30

# Parses a schema.org JSON-LD string into the metadata hash assembled below.
#
# @param string [String, nil] JSON-LD document; a blank value is treated as an
#   empty document
# @param options [Hash] accepted for interface compatibility; not read here
# @return [Hash] { "errors" => errors } when jsonlint reports problems,
#   otherwise the metadata attributes ("state" is "not_found" when nothing was
#   parsed, "findable" otherwise)
def read_schema_org(string: nil, **options)
  if string.present?
    errors = jsonlint(string)
    return { "errors" => errors } if errors.present?
  end

  meta = string.present? ? Maremma.from_json(string) : {}

  # The first identifier is the primary one; any further entries become
  # alternate identifiers (plain strings are recorded with type "URL").
  identifier = Array.wrap(meta.fetch("identifier", nil))
  if identifier.length > 1
    alternate_identifier = identifier[1..-1].map do |r|
      if r.is_a?(String)
        { "type" => "URL", "name" => r }
      elsif r.is_a?(Hash)
        { "type" => r["propertyID"], "name" => r["value"] }
      end
    end.unwrap
  else
    alternate_identifier = nil
  end
  identifier = identifier.first

  id = normalize_id(meta.fetch("@id", nil) || meta.fetch("identifier", nil))
  type = meta.fetch("@type", nil) && meta.fetch("@type").camelcase
  resource_type_general = Bolognese::Utils::SO_TO_DC_TRANSLATIONS[type]
  authors = meta.fetch("author", nil) || meta.fetch("creator", nil)
  author = get_authors(from_schema_org(Array.wrap(authors)))
  editor = get_authors(from_schema_org(Array.wrap(meta.fetch("editor", nil))))

  # "publisher" may be an object with a "name" or a plain string. The check
  # must inspect the parsed value: the `publisher` local itself is still nil
  # while its own assignment is being evaluated, so testing it would silently
  # drop every string publisher.
  publisher_meta = meta.dig("publisher")
  publisher = if publisher_meta.is_a?(Hash)
                meta.dig("publisher", "name")
              elsif publisher_meta.is_a?(String)
                publisher_meta
              end

  # Keep only data catalogs that carry a url.
  included_in_data_catalog = from_schema_org(Array.wrap(meta.fetch("includedInDataCatalog", nil)))
  included_in_data_catalog = Array.wrap(included_in_data_catalog).reduce([]) do |sum, dc|
    sum << { "title" => dc["name"], "url" => dc["url"] } if dc["url"].present?
    sum
  end.unwrap
  # An explicit isPartOf relation wins; the data catalog is the fallback.
  is_part_of = schema_org_is_part_of(meta) || included_in_data_catalog

  license = {
    "id" => parse_attributes(meta.fetch("license", nil), content: "id", first: true),
    "name" => parse_attributes(meta.fetch("license", nil), content: "name", first: true)
  }

  funding = from_schema_org(Array.wrap(meta.fetch("funding", nil)))
  date_published = meta.fetch("datePublished", nil)
  state = meta.present? ? "findable" : "not_found"

  # The container title is read from a different property for datasets.
  ct = (type == "Dataset") ? "includedInDataCatalog" : "Periodical"
  container_title = parse_attributes(from_schema_org(meta.fetch(ct, nil)), content: "name", first: true)

  { "id" => id,
    "type" => type,
    "additional_type" => meta.fetch("additionalType", nil),
    "citeproc_type" => Bolognese::Utils::SO_TO_CP_TRANSLATIONS[type] || "article-journal",
    "bibtex_type" => Bolognese::Utils::SO_TO_BIB_TRANSLATIONS[type] || "misc",
    "ris_type" => Bolognese::Utils::SO_TO_RIS_TRANSLATIONS[resource_type_general.to_s.dasherize] || "GEN",
    "resource_type_general" => resource_type_general,
    "doi" => validate_doi(id),
    "identifier" => identifier,
    "alternate_identifier" => alternate_identifier,
    "b_url" => normalize_id(meta.fetch("url", nil)),
    "content_url" => Array.wrap(meta.fetch("contentUrl", nil)).unwrap,
    "title" => meta.fetch("name", nil),
    "author" => author,
    "editor" => editor,
    "publisher" => publisher,
    "service_provider" => parse_attributes(meta.fetch("provider", nil), content: "name", first: true),
    "container_title" => container_title,
    "is_identical_to" => schema_org_is_identical_to(meta),
    "is_part_of" => is_part_of,
    "has_part" => schema_org_has_part(meta),
    "references" => schema_org_references(meta),
    "is_referenced_by" => schema_org_is_referenced_by(meta),
    "is_supplement_to" => schema_org_is_supplement_to(meta),
    "is_supplemented_by" => schema_org_is_supplemented_by(meta),
    "date_created" => meta.fetch("dateCreated", nil),
    "date_published" => date_published,
    "date_modified" => meta.fetch("dateModified", nil),
    "description" => meta.fetch("description", nil).present? ? { "text" => sanitize(meta.fetch("description")) } : nil,
    "license" => license,
    "b_version" => meta.fetch("version", nil),
    "keywords" => meta.fetch("keywords", nil).to_s.split(", "),
    "state" => state,
    "schema_version" => meta.fetch("schemaVersion", nil),
    "funding" => funding
  }
end

#schema_org_has_part(meta) ⇒ Object



137
138
139
# File 'lib/bolognese/readers/schema_org_reader.rb', line 137

# Looks up the "hasPart" relation of +meta+ by delegating to
# schema_org_related_identifier.
#
# @param meta [Hash] parsed schema.org metadata
# @return [Object] whatever schema_org_related_identifier returns for "hasPart"
def schema_org_has_part(meta)
  property = "hasPart"
  schema_org_related_identifier(meta, relation_type: property)
end

#schema_org_is_identical_to(meta) ⇒ Object



129
130
131
# File 'lib/bolognese/readers/schema_org_reader.rb', line 129

# Looks up the "sameAs" relation of +meta+ by delegating to
# schema_org_related_identifier.
#
# @param meta [Hash] parsed schema.org metadata
# @return [Object] whatever schema_org_related_identifier returns for "sameAs"
def schema_org_is_identical_to(meta)
  property = "sameAs"
  schema_org_related_identifier(meta, relation_type: property)
end

#schema_org_is_new_version_of(meta) ⇒ Object



145
146
147
# File 'lib/bolognese/readers/schema_org_reader.rb', line 145

# Looks up the "SuccessorOf" relation of +meta+ by delegating to
# schema_org_related_identifier.
#
# NOTE(review): the key is capitalised ("SuccessorOf"), unlike the other
# relation keys in this module, and SO_TO_DC_RELATION_TYPES lists "isSuccessor"
# for IsNewVersionOf -- confirm which property name incoming documents use.
#
# @param meta [Hash] parsed schema.org metadata
# @return [Object] whatever schema_org_related_identifier returns for "SuccessorOf"
def schema_org_is_new_version_of(meta)
  property = "SuccessorOf"
  schema_org_related_identifier(meta, relation_type: property)
end

#schema_org_is_part_of(meta) ⇒ Object



133
134
135
# File 'lib/bolognese/readers/schema_org_reader.rb', line 133

# Looks up the "isPartOf" relation of +meta+ by delegating to
# schema_org_related_identifier.
#
# @param meta [Hash] parsed schema.org metadata
# @return [Object] whatever schema_org_related_identifier returns for "isPartOf"
def schema_org_is_part_of(meta)
  property = "isPartOf"
  schema_org_related_identifier(meta, relation_type: property)
end

#schema_org_is_previous_version_of(meta) ⇒ Object



141
142
143
# File 'lib/bolognese/readers/schema_org_reader.rb', line 141

# Looks up the "PredecessorOf" relation of +meta+ by delegating to
# schema_org_related_identifier.
#
# NOTE(review): the key is capitalised ("PredecessorOf"), unlike the other
# relation keys in this module, and SO_TO_DC_RELATION_TYPES lists
# "isPredecessor" for IsPreviousVersionOf -- confirm which property name
# incoming documents use.
#
# @param meta [Hash] parsed schema.org metadata
# @return [Object] whatever schema_org_related_identifier returns for "PredecessorOf"
def schema_org_is_previous_version_of(meta)
  property = "PredecessorOf"
  schema_org_related_identifier(meta, relation_type: property)
end

#schema_org_is_referenced_by(meta) ⇒ Object



153
154
155
# File 'lib/bolognese/readers/schema_org_reader.rb', line 153

# Looks up the reverse "citation" relation of +meta+ by delegating to
# schema_org_reverse_related_identifier.
#
# @param meta [Hash] parsed schema.org metadata
# @return [Object] whatever schema_org_reverse_related_identifier returns for "citation"
def schema_org_is_referenced_by(meta)
  property = "citation"
  schema_org_reverse_related_identifier(meta, relation_type: property)
end

#schema_org_is_supplement_to(meta) ⇒ Object



157
158
159
# File 'lib/bolognese/readers/schema_org_reader.rb', line 157

# Looks up the reverse "isBasedOn" relation of +meta+ by delegating to
# schema_org_reverse_related_identifier.
#
# @param meta [Hash] parsed schema.org metadata
# @return [Object] whatever schema_org_reverse_related_identifier returns for "isBasedOn"
def schema_org_is_supplement_to(meta)
  property = "isBasedOn"
  schema_org_reverse_related_identifier(meta, relation_type: property)
end

#schema_org_is_supplemented_by(meta) ⇒ Object



161
162
163
# File 'lib/bolognese/readers/schema_org_reader.rb', line 161

# Looks up the forward "isBasedOn" relation of +meta+ by delegating to
# schema_org_related_identifier.
#
# @param meta [Hash] parsed schema.org metadata
# @return [Object] whatever schema_org_related_identifier returns for "isBasedOn"
def schema_org_is_supplemented_by(meta)
  property = "isBasedOn"
  schema_org_related_identifier(meta, relation_type: property)
end

#schema_org_references(meta) ⇒ Object



149
150
151
# File 'lib/bolognese/readers/schema_org_reader.rb', line 149

# Looks up the forward "citation" relation of +meta+ by delegating to
# schema_org_related_identifier.
#
# @param meta [Hash] parsed schema.org metadata
# @return [Object] whatever schema_org_related_identifier returns for "citation"
def schema_org_references(meta)
  property = "citation"
  schema_org_related_identifier(meta, relation_type: property)
end


121
122
123
# File 'lib/bolognese/readers/schema_org_reader.rb', line 121

# Reads the value stored under +relation_type+ at the top level of +meta+
# (nil when the key is absent) and hands it to normalize_ids.
#
# @param meta [Hash] parsed schema.org metadata
# @param relation_type [String, nil] top-level key to read
# @return [Object] the result of normalize_ids
def schema_org_related_identifier(meta, relation_type: nil)
  related = meta.fetch(relation_type, nil)
  normalize_ids(ids: related)
end


125
126
127
# File 'lib/bolognese/readers/schema_org_reader.rb', line 125

# Reads the value stored under +relation_type+ inside the "@reverse" entry of
# +meta+ (nil when either level is absent) and hands it to normalize_ids.
#
# @param meta [Hash] parsed schema.org metadata
# @param relation_type [String, nil] key to read inside "@reverse"
# @return [Object] the result of normalize_ids
def schema_org_reverse_related_identifier(meta, relation_type: nil)
  reverse_related = meta.dig("@reverse", relation_type)
  normalize_ids(ids: reverse_related)
end