Module: Bolognese::Readers::SchemaOrgReader

Included in:
MetadataUtils
Defined in:
lib/bolognese/readers/schema_org_reader.rb

Constant Summary collapse

# Lookup table from schema.org relation property names (keys) to relation
# type names (values). NOTE(review): the values look like DataCite
# relationType terms -- confirm against the consumer of this table.
# Frozen so the shared constant cannot be mutated at runtime.
SO_TO_DC_RELATION_TYPES = {
  "citation" => "References",
  "sameAs" => "IsIdenticalTo",
  "isPartOf" => "IsPartOf",
  "hasPart" => "HasPart",
  "isPredecessor" => "IsPreviousVersionOf",
  "isSuccessor" => "IsNewVersionOf"
}.freeze

Instance Method Summary collapse

Instance Method Details

#get_schema_org(id: nil, **options) ⇒ Object



13
14
15
16
17
18
19
20
21
22
23
24
25
26
# File 'lib/bolognese/readers/schema_org_reader.rb', line 13

# Retrieves the resource at +id+ and extracts its embedded JSON-LD.
#
# The id is passed through normalize_id, fetched with Maremma.get, and the
# "data" entry of the response body is parsed with Nokogiri::XML. The text
# of the first <script> element whose type is "application/ld+json" is
# returned.
#
# @param id [String, nil] identifier/URL of the resource to fetch
# @param options [Hash] accepted for interface compatibility; not used here
# @return [Hash] { "string" => json_ld_text_or_nil }, or
#   { "string" => nil, "state" => "not_found" } when +id+ is blank
def get_schema_org(id: nil, **options)
  return { "string" => nil, "state" => "not_found" } unless id.present?

  landing_page = Maremma.get(normalize_id(id))
  markup = landing_page.body.fetch("data", nil)

  # Scan every script element and compare its type attribute by hand
  # (noted in the original as a workaround for xhtml documents).
  scripts = Nokogiri::XML(markup, nil, 'UTF-8').css("script")
  json_ld = scripts.find { |script| script["type"] == "application/ld+json" }
  json_ld = json_ld.text if json_ld.present?

  { "string" => json_ld }
end

#read_schema_org(string: nil, **options) ⇒ Object



28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# File 'lib/bolognese/readers/schema_org_reader.rb', line 28

# Converts a schema.org JSON-LD string into the string-keyed metadata hash
# built at the end of this method.
#
# When +string+ is present it is first checked with jsonlint; any reported
# errors short-circuit the method. A blank +string+ is treated as empty
# metadata, which yields "state" => "not_found".
#
# @param string [String, nil] schema.org JSON-LD document
# @param options [Hash] accepted for interface compatibility; not used here
# @return [Hash] the metadata hash, or { "errors" => errors } when jsonlint
#   reports problems
def read_schema_org(string: nil, **options)
  if string.present?
    errors = jsonlint(string)
    return { "errors" => errors } if errors.present?
  end

  meta = string.present? ? Maremma.from_json(string) : {}

  id = normalize_id(meta.fetch("@id", nil) || meta.fetch("identifier", nil))
  type = meta.fetch("@type", nil) && meta.fetch("@type").camelcase
  resource_type_general = Bolognese::Utils::SO_TO_DC_TRANSLATIONS[type]
  authors = meta.fetch("author", nil) || meta.fetch("creator", nil)
  author = get_authors(from_schema_org(Array.wrap(authors)))
  editor = get_authors(from_schema_org(Array.wrap(meta.fetch("editor", nil))))

  # The publisher may be an object with a "name" or a plain string. Inspect
  # the metadata value itself: the local `publisher` is still nil while its
  # own initializer runs, so testing it would never match a String.
  publisher_meta = meta.fetch("publisher", nil)
  publisher = case publisher_meta
              when Hash then publisher_meta["name"]
              when String then publisher_meta
              end

  included_in_data_catalog = from_schema_org(Array.wrap(meta.fetch("includedInDataCatalog", nil)))
  included_in_data_catalog = Array.wrap(included_in_data_catalog).map { |dc| { "title" => dc["name"], "url" => dc["url"] } }
  # Fall back to the data catalog entries when no isPartOf relation is found.
  is_part_of = schema_org_is_part_of(meta) || included_in_data_catalog

  license = {
    "id" => parse_attributes(meta.fetch("license", nil), content: "id", first: true),
    "name" => parse_attributes(meta.fetch("license", nil), content: "name", first: true)
  }

  date_published = meta.fetch("datePublished", nil)
  state = meta.present? ? "findable" : "not_found"

  { "id" => id,
    "type" => type,
    "additional_type" => meta.fetch("additionalType", nil),
    "citeproc_type" => Bolognese::Utils::SO_TO_CP_TRANSLATIONS[type] || "article-journal",
    "bibtex_type" => Bolognese::Utils::SO_TO_BIB_TRANSLATIONS[type] || "misc",
    "ris_type" => Bolognese::Utils::SO_TO_RIS_TRANSLATIONS[resource_type_general.to_s.dasherize] || "GEN",
    "resource_type_general" => resource_type_general,
    "doi" => validate_doi(id),
    "identifier" => id,
    "b_url" => normalize_id(meta.fetch("url", nil)),
    "title" => meta.fetch("name", nil),
    "alternate_name" => meta.fetch("alternateName", nil),
    "author" => author,
    "editor" => editor,
    "publisher" => publisher,
    "service_provider" => meta.fetch("provider", nil),
    "is_identical_to" => schema_org_is_identical_to(meta),
    "is_part_of" => is_part_of,
    "has_part" => schema_org_has_part(meta),
    "references" => schema_org_references(meta),
    "is_referenced_by" => schema_org_is_referenced_by(meta),
    "is_supplement_to" => schema_org_is_supplement_to(meta),
    "is_supplemented_by" => schema_org_is_supplemented_by(meta),
    "date_created" => meta.fetch("dateCreated", nil),
    "date_published" => date_published,
    "date_modified" => meta.fetch("dateModified", nil),
    "description" => meta.fetch("description", nil).present? ? { "text" => sanitize(meta.fetch("description")) } : nil,
    "license" => license,
    "b_version" => meta.fetch("version", nil),
    # Keywords are expected as one comma-plus-space separated string.
    "keywords" => meta.fetch("keywords", nil).to_s.split(", "),
    "state" => state
  }
end

#schema_org_has_part(meta) ⇒ Object



110
111
112
# File 'lib/bolognese/readers/schema_org_reader.rb', line 110

# Looks up the identifiers stored under the "hasPart" property of +meta+
# by delegating to schema_org_related_identifier.
#
# @param meta [Hash] parsed schema.org metadata
# @return [Object] whatever schema_org_related_identifier returns for "hasPart"
def schema_org_has_part(meta)
  relation_type = "hasPart"
  schema_org_related_identifier(meta, relation_type: relation_type)
end

#schema_org_is_identical_to(meta) ⇒ Object



102
103
104
# File 'lib/bolognese/readers/schema_org_reader.rb', line 102

# Looks up the identifiers stored under the "sameAs" property of +meta+
# by delegating to schema_org_related_identifier.
#
# @param meta [Hash] parsed schema.org metadata
# @return [Object] whatever schema_org_related_identifier returns for "sameAs"
def schema_org_is_identical_to(meta)
  relation_type = "sameAs"
  schema_org_related_identifier(meta, relation_type: relation_type)
end

#schema_org_is_new_version_of(meta) ⇒ Object



118
119
120
# File 'lib/bolognese/readers/schema_org_reader.rb', line 118

# Looks up the identifiers stored under the "SuccessorOf" key of +meta+
# by delegating to schema_org_related_identifier.
#
# NOTE(review): SO_TO_DC_RELATION_TYPES maps "isSuccessor" to
# "IsNewVersionOf", while this method reads the capitalised "SuccessorOf"
# key -- confirm which key incoming documents actually use.
#
# @param meta [Hash] parsed schema.org metadata
# @return [Object] whatever schema_org_related_identifier returns for "SuccessorOf"
def schema_org_is_new_version_of(meta)
  relation_type = "SuccessorOf"
  schema_org_related_identifier(meta, relation_type: relation_type)
end

#schema_org_is_part_of(meta) ⇒ Object



106
107
108
# File 'lib/bolognese/readers/schema_org_reader.rb', line 106

# Looks up the identifiers stored under the "isPartOf" property of +meta+
# by delegating to schema_org_related_identifier.
#
# @param meta [Hash] parsed schema.org metadata
# @return [Object] whatever schema_org_related_identifier returns for "isPartOf"
def schema_org_is_part_of(meta)
  relation_type = "isPartOf"
  schema_org_related_identifier(meta, relation_type: relation_type)
end

#schema_org_is_previous_version_of(meta) ⇒ Object



114
115
116
# File 'lib/bolognese/readers/schema_org_reader.rb', line 114

# Looks up the identifiers stored under the "PredecessorOf" key of +meta+
# by delegating to schema_org_related_identifier.
#
# NOTE(review): SO_TO_DC_RELATION_TYPES maps "isPredecessor" to
# "IsPreviousVersionOf", while this method reads the capitalised
# "PredecessorOf" key -- confirm which key incoming documents actually use.
#
# @param meta [Hash] parsed schema.org metadata
# @return [Object] whatever schema_org_related_identifier returns for "PredecessorOf"
def schema_org_is_previous_version_of(meta)
  relation_type = "PredecessorOf"
  schema_org_related_identifier(meta, relation_type: relation_type)
end

#schema_org_is_referenced_by(meta) ⇒ Object



126
127
128
# File 'lib/bolognese/readers/schema_org_reader.rb', line 126

# Looks up the identifiers stored under "citation" inside the "@reverse"
# section of +meta+ by delegating to schema_org_reverse_related_identifier.
#
# @param meta [Hash] parsed schema.org metadata
# @return [Object] whatever schema_org_reverse_related_identifier returns
#   for "citation"
def schema_org_is_referenced_by(meta)
  relation_type = "citation"
  schema_org_reverse_related_identifier(meta, relation_type: relation_type)
end

#schema_org_is_supplement_to(meta) ⇒ Object



130
131
132
# File 'lib/bolognese/readers/schema_org_reader.rb', line 130

# Looks up the identifiers stored under "isBasedOn" inside the "@reverse"
# section of +meta+ by delegating to schema_org_reverse_related_identifier.
#
# @param meta [Hash] parsed schema.org metadata
# @return [Object] whatever schema_org_reverse_related_identifier returns
#   for "isBasedOn"
def schema_org_is_supplement_to(meta)
  relation_type = "isBasedOn"
  schema_org_reverse_related_identifier(meta, relation_type: relation_type)
end

#schema_org_is_supplemented_by(meta) ⇒ Object



134
135
136
# File 'lib/bolognese/readers/schema_org_reader.rb', line 134

# Looks up the identifiers stored under the top-level "isBasedOn" property
# of +meta+ by delegating to schema_org_related_identifier.
#
# @param meta [Hash] parsed schema.org metadata
# @return [Object] whatever schema_org_related_identifier returns for "isBasedOn"
def schema_org_is_supplemented_by(meta)
  relation_type = "isBasedOn"
  schema_org_related_identifier(meta, relation_type: relation_type)
end

#schema_org_references(meta) ⇒ Object



122
123
124
# File 'lib/bolognese/readers/schema_org_reader.rb', line 122

# Looks up the identifiers stored under the top-level "citation" property
# of +meta+ by delegating to schema_org_related_identifier.
#
# @param meta [Hash] parsed schema.org metadata
# @return [Object] whatever schema_org_related_identifier returns for "citation"
def schema_org_references(meta)
  relation_type = "citation"
  schema_org_related_identifier(meta, relation_type: relation_type)
end


94
95
96
# File 'lib/bolognese/readers/schema_org_reader.rb', line 94

# Reads the value stored under +relation_type+ at the top level of +meta+
# (nil when the key is absent) and hands it to normalize_ids.
#
# @param meta [Hash] parsed schema.org metadata
# @param relation_type [String, nil] property name to look up
# @return [Object] the result of normalize_ids for the looked-up value
def schema_org_related_identifier(meta, relation_type: nil)
  related = meta.fetch(relation_type, nil)
  normalize_ids(ids: related)
end


98
99
100
# File 'lib/bolognese/readers/schema_org_reader.rb', line 98

# Reads the value stored under +relation_type+ inside the "@reverse"
# section of +meta+ (nil when either level is absent) and hands it to
# normalize_ids.
#
# @param meta [Hash] parsed schema.org metadata
# @param relation_type [String, nil] property name to look up under "@reverse"
# @return [Object] the result of normalize_ids for the looked-up value
def schema_org_reverse_related_identifier(meta, relation_type: nil)
  reverse_related = meta.dig("@reverse", relation_type)
  normalize_ids(ids: reverse_related)
end