Module: Commonmeta::Utils

Included in:: CLI, MetadataUtils

Defined in:: lib/commonmeta/utils.rb

Constant Summary collapse

# Lookup table from a Creative Commons license URL (key, without trailing
# slash) to the URL used as its normalized form (value). For versioned
# licenses the value is the key with "/legalcode" appended; the two
# exceptions visible here are the "by/3.0/us" entry, which is folded onto the
# generic by/3.0 legal code, and the legacy "publicdomain" entry, which only
# gains a trailing slash.
NORMALIZED_LICENSES =

{
  "https://creativecommons.org/licenses/by/1.0" => "https://creativecommons.org/licenses/by/1.0/legalcode",
  "https://creativecommons.org/licenses/by/2.0" => "https://creativecommons.org/licenses/by/2.0/legalcode",
  "https://creativecommons.org/licenses/by/2.5" => "https://creativecommons.org/licenses/by/2.5/legalcode",
  "https://creativecommons.org/licenses/by/3.0" => "https://creativecommons.org/licenses/by/3.0/legalcode",
  "https://creativecommons.org/licenses/by/3.0/us" => "https://creativecommons.org/licenses/by/3.0/legalcode",
  "https://creativecommons.org/licenses/by/4.0" => "https://creativecommons.org/licenses/by/4.0/legalcode",
  "https://creativecommons.org/licenses/by-nc/1.0" => "https://creativecommons.org/licenses/by-nc/1.0/legalcode",
  "https://creativecommons.org/licenses/by-nc/2.0" => "https://creativecommons.org/licenses/by-nc/2.0/legalcode",
  "https://creativecommons.org/licenses/by-nc/2.5" => "https://creativecommons.org/licenses/by-nc/2.5/legalcode",
  "https://creativecommons.org/licenses/by-nc/3.0" => "https://creativecommons.org/licenses/by-nc/3.0/legalcode",
  "https://creativecommons.org/licenses/by-nc/4.0" => "https://creativecommons.org/licenses/by-nc/4.0/legalcode",
  "https://creativecommons.org/licenses/by-nd-nc/1.0" => "https://creativecommons.org/licenses/by-nd-nc/1.0/legalcode",
  "https://creativecommons.org/licenses/by-nd-nc/2.0" => "https://creativecommons.org/licenses/by-nd-nc/2.0/legalcode",
  "https://creativecommons.org/licenses/by-nd-nc/2.5" => "https://creativecommons.org/licenses/by-nd-nc/2.5/legalcode",
  "https://creativecommons.org/licenses/by-nd-nc/3.0" => "https://creativecommons.org/licenses/by-nd-nc/3.0/legalcode",
  "https://creativecommons.org/licenses/by-nd-nc/4.0" => "https://creativecommons.org/licenses/by-nd-nc/4.0/legalcode",
  "https://creativecommons.org/licenses/by-nc-sa/1.0" => "https://creativecommons.org/licenses/by-nc-sa/1.0/legalcode",
  "https://creativecommons.org/licenses/by-nc-sa/2.0" => "https://creativecommons.org/licenses/by-nc-sa/2.0/legalcode",
  "https://creativecommons.org/licenses/by-nc-sa/2.5" => "https://creativecommons.org/licenses/by-nc-sa/2.5/legalcode",
  "https://creativecommons.org/licenses/by-nc-sa/3.0" => "https://creativecommons.org/licenses/by-nc-sa/3.0/legalcode",
  "https://creativecommons.org/licenses/by-nc-sa/4.0" => "https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode",
  "https://creativecommons.org/licenses/by-nd/1.0" => "https://creativecommons.org/licenses/by-nd/1.0/legalcode",
  "https://creativecommons.org/licenses/by-nd/2.0" => "https://creativecommons.org/licenses/by-nd/2.0/legalcode",
  "https://creativecommons.org/licenses/by-nd/2.5" => "https://creativecommons.org/licenses/by-nd/2.5/legalcode",
  "https://creativecommons.org/licenses/by-nd/3.0" => "https://creativecommons.org/licenses/by-nd/3.0/legalcode",
  # Fixed: this entry previously pointed at the by-nd/2.0 legal code.
  "https://creativecommons.org/licenses/by-nd/4.0" => "https://creativecommons.org/licenses/by-nd/4.0/legalcode",
  "https://creativecommons.org/licenses/by-sa/1.0" => "https://creativecommons.org/licenses/by-sa/1.0/legalcode",
  "https://creativecommons.org/licenses/by-sa/2.0" => "https://creativecommons.org/licenses/by-sa/2.0/legalcode",
  "https://creativecommons.org/licenses/by-sa/2.5" => "https://creativecommons.org/licenses/by-sa/2.5/legalcode",
  "https://creativecommons.org/licenses/by-sa/3.0" => "https://creativecommons.org/licenses/by-sa/3.0/legalcode",
  "https://creativecommons.org/licenses/by-sa/4.0" => "https://creativecommons.org/licenses/by-sa/4.0/legalcode",
  "https://creativecommons.org/licenses/by-nc-nd/1.0" => "https://creativecommons.org/licenses/by-nc-nd/1.0/legalcode",
  "https://creativecommons.org/licenses/by-nc-nd/2.0" => "https://creativecommons.org/licenses/by-nc-nd/2.0/legalcode",
  "https://creativecommons.org/licenses/by-nc-nd/2.5" => "https://creativecommons.org/licenses/by-nc-nd/2.5/legalcode",
  "https://creativecommons.org/licenses/by-nc-nd/3.0" => "https://creativecommons.org/licenses/by-nc-nd/3.0/legalcode",
  "https://creativecommons.org/licenses/by-nc-nd/4.0" => "https://creativecommons.org/licenses/by-nc-nd/4.0/legalcode",
  "https://creativecommons.org/licenses/publicdomain" => "https://creativecommons.org/licenses/publicdomain/",
  "https://creativecommons.org/publicdomain/zero/1.0" => "https://creativecommons.org/publicdomain/zero/1.0/legalcode",
}

# Lookup table from BibTeX entry types (keys) to the type names used on the
# "CM" side (values). CM presumably abbreviates commonmeta, the enclosing
# module's name -- TODO confirm.
# Several BibTeX types share a target: "book"/"booklet" both become "Book",
# "manual"/"techreport" both become "Report", and both thesis types become
# "Dissertation".
BIB_TO_CM_TRANSLATIONS = source: www.bibtex.com/e/entry-types/

{
  "article" => "JournalArticle",
  "book" => "Book",
  "booklet" => "Book",
  "inbook" => "BookChapter",
  "inproceedings" => "ProceedingsArticle",
  "manual" => "Report",
  "mastersthesis" => "Dissertation",
  "misc" => "Other",
  "phdthesis" => "Dissertation",
  "proceedings" => "Proceedings",
  "techreport" => "Report",
  "unpublished" => "Manuscript",
}

# Lookup table in the opposite direction of BIB_TO_CM_TRANSLATIONS: type
# names (keys) to BibTeX entry types (values).
# It is not a strict inverse: "Dissertation" always becomes "phdthesis",
# "Report" always becomes "techreport", and both "Article" and
# "JournalArticle" become "article".
CM_TO_BIB_TRANSLATIONS =

{
  "Article" => "article",
  "Book" => "book",
  "BookChapter" => "inbook",
  "Dissertation" => "phdthesis",
  "JournalArticle" => "article",
  "Manuscript" => "unpublished",
  "Other" => "misc",
  "Proceedings" => "proceedings",
  "ProceedingsArticle" => "inproceedings",
  "Report" => "techreport",
}

# Lookup table from CSL item types (keys; see the citationstyles.org
# specification referenced on the next line) to the type names used on the
# "CM" side (values).
# Many CSL types collapse onto one target, e.g. all legal types map to
# "LegalDocument" and the three "entry*" types map to "Entry".
CSL_TO_CM_TRANSLATIONS = source: docs.citationstyles.org/en/stable/specification.html?highlight=book#appendix-iii-types

{
  "article" => "Article",
  "article-journal" => "JournalArticle",
  "article-magazine" => "Article",
  "article-newspaper" => "Article",
  "bill" => "LegalDocument",
  "book" => "Book",
  "broadcast" => "Audiovisual",
  "chapter" => "BookChapter",
  "classic" => "Book",
  "collection" => "Collection",
  "dataset" => "Dataset",
  "document" => "Document",
  "entry" => "Entry",
  "entry-dictionary" => "Entry",
  "entry-encyclopedia" => "Entry",
  "event" => "Event",
  "figure" => "Figure",
  "graphic" => "Image",
  "hearing" => "LegalDocument",
  "interview" => "Document",
  "legal_case" => "LegalDocument",
  "legislation" => "LegalDocument",
  "manuscript" => "Manuscript",
  "map" => "Map",
  "motion_picture" => "Audiovisual",
  "musical_score" => "Document",
  "pamphlet" => "Document",
  "paper-conference" => "ProceedingsArticle",
  "patent" => "Patent",
  "performance" => "Performance",
  "periodical" => "Journal",
  "personal_communication" => "PersonalCommunication",
  "post" => "Post",
  "post-weblog" => "Article",
  "regulation" => "LegalDocument",
  "report" => "Report",
  "review" => "Review",
  "review-book" => "Review",
  "software" => "Software",
  "song" => "Audiovisual",
  "speech" => "Speech",
  "standard" => "Standard",
  "thesis" => "Dissertation",
  "treaty" => "LegalDocument",
  "webpage" => "WebPage",
}

# Lookup table in the opposite direction of CSL_TO_CM_TRANSLATIONS: type
# names (keys) to CSL item types (values).
# Where several CSL types map to one name in the forward table, a single
# representative is chosen here (e.g. "LegalDocument" => "legal_case",
# "Audiovisual" => "motion_picture").
CM_TO_CSL_TRANSLATIONS =

{
  "Article" => "article",
  "JournalArticle" => "article-journal",
  "Book" => "book",
  "BookChapter" => "chapter",
  "Collection" => "collection",
  "Dataset" => "dataset",
  "Document" => "document",
  "Entry" => "entry",
  "Event" => "event",
  "Figure" => "figure",
  "Image" => "graphic",
  "LegalDocument" => "legal_case",
  "Manuscript" => "manuscript",
  "Map" => "map",
  "Audiovisual" => "motion_picture",
  "Patent" => "patent",
  "Performance" => "performance",
  "Journal" => "periodical",
  "PersonalCommunication" => "personal_communication",
  "Post" => "post",
  "Report" => "report",
  "Review" => "review",
  "Software" => "software",
  "Speech" => "speech",
  "Standard" => "standard",
  "Dissertation" => "thesis",
  "WebPage" => "webpage",
}

# Lookup table from Crossref type names (keys, in CamelCase; the source line
# below points at api.crossref.org/types) to the type names used on the "CM"
# side (values).
# Most entries map a name onto itself; the exceptions are "Monograph" =>
# "Book", "PostedContent" => "Article" and "ReferenceEntry" => "Entry".
CR_TO_CM_TRANSLATIONS = source: api.crossref.org/types

{
  "BookChapter" => "BookChapter",
  "BookPart" => "BookPart",
  "BookSection" => "BookSection",
  "BookSeries" => "BookSeries",
  "BookSet" => "BookSet",
  "BookTrack" => "BookTrack",
  "Book" => "Book",
  "Component" => "Component",
  "Database" => "Database",
  "Dataset" => "Dataset",
  "Dissertation" => "Dissertation",
  "EditedBook" => "EditedBook",
  "Grant" => "Grant",
  "JournalArticle" => "JournalArticle",
  "JournalIssue" => "JournalIssue",
  "JournalVolume" => "JournalVolume",
  "Journal" => "Journal",
  "Monograph" => "Book",
  "Other" => "Other",
  "PeerReview" => "PeerReview",
  "PostedContent" => "Article",
  "ProceedingsArticle" => "ProceedingsArticle",
  "ProceedingsSeries" => "ProceedingsSeries",
  "Proceedings" => "Proceedings",
  "ReferenceBook" => "ReferenceBook",
  "ReferenceEntry" => "Entry",
  "ReportComponent" => "ReportComponent",
  "ReportSeries" => "ReportSeries",
  "Report" => "Report",
  "Standard" => "Standard",
}

# Lookup table in the opposite direction of CR_TO_CM_TRANSLATIONS: type names
# (keys) to Crossref type names (values).
# Every entry maps a name onto itself except "Article", which becomes
# "PostedContent". The table covers fewer names than the forward table.
CM_TO_CR_TRANSLATIONS =

{
  "Article" => "PostedContent",
  "BookChapter" => "BookChapter",
  "BookSeries" => "BookSeries",
  "Book" => "Book",
  "Component" => "Component",
  "Dataset" => "Dataset",
  "Dissertation" => "Dissertation",
  "Grant" => "Grant",
  "JournalArticle" => "JournalArticle",
  "JournalIssue" => "JournalIssue",
  "JournalVolume" => "JournalVolume",
  "Journal" => "Journal",
  "ProceedingsArticle" => "ProceedingsArticle",
  "ProceedingsSeries" => "ProceedingsSeries",
  "Proceedings" => "Proceedings",
  "ReportComponent" => "ReportComponent",
  "ReportSeries" => "ReportSeries",
  "Report" => "Report",
  "PeerReview" => "PeerReview",
  "Other" => "Other",
}

# Lookup table from DataCite resource types (keys; the source line below
# points at the DataCite kernel-4 resourceType schema) to the type names used
# on the "CM" side (values).
# Most entries map a name onto itself; renamed ones include "BlogPosting" and
# "Preprint" => "Article", "ConferencePaper" => "ProceedingsArticle",
# "Text" => "Document" and "Thesis" => "Dissertation".
DC_TO_CM_TRANSLATIONS = source: github.com/datacite/schema/blob/master/source/meta/kernel-4/include/datacite-resourceType-v4.xsd

{
  "Audiovisual" => "Audiovisual",
  "BlogPosting" => "Article",
  "Book" => "Book",
  "BookChapter" => "BookChapter",
  "Collection" => "Collection",
  "ComputationalNotebook" => "ComputationalNotebook",
  "ConferencePaper" => "ProceedingsArticle",
  "ConferenceProceeding" => "Proceedings",
  "DataPaper" => "JournalArticle",
  "Dataset" => "Dataset",
  "Dissertation" => "Dissertation",
  "Event" => "Event",
  "Image" => "Image",
  "InteractiveResource" => "InteractiveResource",
  "Journal" => "Journal",
  "JournalArticle" => "JournalArticle",
  "Model" => "Model",
  "OutputManagementPlan" => "OutputManagementPlan",
  "PeerReview" => "PeerReview",
  "PhysicalObject" => "PhysicalObject",
  "Poster" => "Speech",
  "Preprint" => "Article",
  "Report" => "Report",
  "Service" => "Service",
  "Software" => "Software",
  "Sound" => "Sound",
  "Standard" => "Standard",
  "Text" => "Document",
  "Thesis" => "Dissertation",
  "Workflow" => "Workflow",
  "Other" => "Other",
}

# Lookup table in the opposite direction of DC_TO_CM_TRANSLATIONS: type names
# (keys) to DataCite resource types (values).
# Names without a dedicated DataCite type fall back to a generic one: most
# textual kinds become "Text", "Figure" and "Map" become "Image", and
# "Performance" becomes "Audiovisual". "Article" becomes "Preprint".
CM_TO_DC_TRANSLATIONS =

{
  "Article" => "Preprint",
  "Audiovisual" => "Audiovisual",
  "Book" => "Book",
  "BookChapter" => "BookChapter",
  "Collection" => "Collection",
  "Dataset" => "Dataset",
  "Dissertation" => "Dissertation",
  "Document" => "Text",
  "Entry" => "Text",
  "Event" => "Event",
  "Figure" => "Image",
  "Image" => "Image",
  "JournalArticle" => "JournalArticle",
  "LegalDocument" => "Text",
  "Manuscript" => "Text",
  "Map" => "Image",
  "Patent" => "Text",
  "Performance" => "Audiovisual",
  "PersonalCommunication" => "Text",
  "Post" => "Text",
  "ProceedingsArticle" => "ConferencePaper",
  "Proceedings" => "ConferenceProceeding",
  "Report" => "Report",
  "PeerReview" => "PeerReview",
  "Software" => "Software",
  "Sound" => "Sound",
  "Standard" => "Standard",
  "WebPage" => "Text",
}

# Lookup table from RIS reference-type tags (keys) to type names (values).
# NOTE(review): CM_TO_RIS_TRANSLATIONS emits the tags "CONF" and "CPAPER",
# neither of which has an entry in this table -- confirm whether that is
# intentional.
# NOTE(review): several values here ("Text", "CreativeWork", "MusicScore",
# ...) never appear as keys in the CM_TO_* tables of this module -- verify
# against the callers that consume this table.
RIS_TO_CM_TRANSLATIONS =

{
  "ABST" => "Text",
  "ADVS" => "Text",
  "AGGR" => "Text",
  "ANCIENT" => "Text",
  "ART" => "Text",
  "BILL" => "Text",
  "BLOG" => "Text",
  "BOOK" => "Book",
  "CASE" => "Text",
  "CHAP" => "BookChapter",
  "CHART" => "Text",
  "CLSWK" => "Text",
  "CTLG" => "Collection",
  "COMP" => "Software",
  "DATA" => "Dataset",
  "DBASE" => "Database",
  "DICT" => "Dictionary",
  "EBOOK" => "Book",
  "ECHAP" => "BookChapter",
  "EDBOOK" => "Book",
  "EJOUR" => "JournalArticle",
  "ELEC" => "Text",
  "ENCYC" => "Encyclopedia",
  "EQUA" => "Equation",
  "FIGURE" => "Image",
  "GEN" => "CreativeWork",
  "GOVDOC" => "GovernmentDocument",
  "GRANT" => "Grant",
  "HEAR" => "Hearing",
  "ICOMM" => "Text",
  "INPR" => "Text",
  "JFULL" => "JournalArticle",
  "JOUR" => "JournalArticle",
  "LEGAL" => "LegalRuleOrRegulation",
  "MANSCPT" => "Text",
  "MAP" => "Map",
  "MGZN" => "MagazineArticle",
  "MPCT" => "Audiovisual",
  "MULTI" => "Audiovisual",
  "MUSIC" => "MusicScore",
  "NEWS" => "NewspaperArticle",
  "PAMP" => "Pamphlet",
  "PAT" => "Patent",
  "PCOMM" => "PersonalCommunication",
  "RPRT" => "Report",
  "SER" => "SerialPublication",
  "SLIDE" => "Slide",
  "SOUND" => "SoundRecording",
  "STAND" => "Standard",
  "THES" => "Dissertation",
  "UNBILL" => "UnenactedBill",
  "UNPB" => "UnpublishedWork",
  "VIDEO" => "Audiovisual",
  "WEB" => "WebPage",
}

# Lookup table in the opposite direction of RIS_TO_CM_TRANSLATIONS: type
# names (keys) to RIS reference-type tags (values).
# Names without a dedicated tag use the generic "GEN"; both "Article" and
# "JournalArticle" become "JOUR", and both "Figure" and "Image" become
# "FIGURE".
CM_TO_RIS_TRANSLATIONS =

{
  "Article" => "JOUR",
  "Audiovisual" => "VIDEO",
  "Book" => "BOOK",
  "BookChapter" => "CHAP",
  "Collection" => "CTLG",
  "Dataset" => "DATA",
  "Dissertation" => "THES",
  "Document" => "GEN",
  "Entry" => "DICT",
  "Event" => "GEN",
  "Figure" => "FIGURE",
  "Image" => "FIGURE",
  "JournalArticle" => "JOUR",
  "LegalDocument" => "GEN",
  "Manuscript" => "GEN",
  "Map" => "MAP",
  "Patent" => "PAT",
  "Performance" => "GEN",
  "PersonalCommunication" => "PCOMM",
  "Post" => "GEN",
  "ProceedingsArticle" => "CPAPER",
  "Proceedings" => "CONF",
  "Report" => "RPRT",
  "Review" => "GEN",
  "Software" => "COMP",
  "Sound" => "SOUND",
  "Standard" => "STAND",
  "WebPage" => "WEB",
}

# Lookup table from "SO" type names (keys) to type names on the "CM" side
# (values). The keys look like schema.org types (e.g. "ScholarlyArticle",
# "SoftwareSourceCode") -- presumably SO abbreviates schema.org; TODO confirm.
# The generic "CreativeWork" maps to "Other".
SO_TO_CM_TRANSLATIONS =

{
  "Article" => "Article",
  "BlogPosting" => "Article",
  "Book" => "Book",
  "BookChapter" => "BookChapter",
  "CreativeWork" => "Other",
  "Dataset" => "Dataset",
  "Dissertation" => "Dissertation",
  "NewsArticle" => "Article",
  "Legislation" => "LegalDocument",
  "ScholarlyArticle" => "JournalArticle",
  "SoftwareSourceCode" => "Software",
}

# Lookup table in the opposite direction of SO_TO_CM_TRANSLATIONS: type names
# (keys) to "SO" type names (values; presumably schema.org -- TODO confirm).
# Names without a dedicated counterpart fall back to "CreativeWork".
CM_TO_SO_TRANSLATIONS =

{
  "Article" => "Article",
  "Audiovisual" => "CreativeWork",
  "Book" => "Book",
  "BookChapter" => "BookChapter",
  "Collection" => "CreativeWork",
  "Dataset" => "Dataset",
  "Dissertation" => "Dissertation",
  "Document" => "CreativeWork",
  "Entry" => "CreativeWork",
  "Event" => "CreativeWork",
  "Figure" => "CreativeWork",
  "Image" => "CreativeWork",
  "JournalArticle" => "ScholarlyArticle",
  "LegalDocument" => "Legislation",
  "Software" => "SoftwareSourceCode",
}

# Lookup table from type names (keys) to JATS values (values; presumably JATS
# publication types -- TODO confirm against the code that reads this table).
# Four keys ("Other", "Dissertation", "Component", "ReferenceEntry") map
# explicitly to nil, so a lookup can return nil for a key that is present.
CM_TO_JATS_TRANSLATIONS =

{
  "Proceedings" => "working-paper",
  "ReferenceBook" => "book",
  "JournalIssue" => "journal",
  "ProceedingsArticle" => "working-paper",
  "Other" => nil,
  "Dissertation" => nil,
  "Dataset" => "data",
  "Document" => "journal",
  "EditedBook" => "book",
  "JournalArticle" => "journal",
  "Journal" => "journal",
  "Report" => "report",
  "BookSeries" => "book",
  "ReportSeries" => "report",
  "BookTrack" => "book",
  "Standard" => "standard",
  "BookSection" => "chapter",
  "BookPart" => "chapter",
  "Book" => "book",
  "BookChapter" => "chapter",
  "StandardSeries" => "standard",
  "Monograph" => "book",
  "Component" => nil,
  "ReferenceEntry" => nil,
  "JournalVolume" => "journal",
  "BookSet" => "book",
  "Article" => "journal",
  "Software" => "software",
}

# Table of placeholder codes (keys, each starting with a colon) and a short
# English description of what each code signifies (values). Note that the
# keys are Strings such as ":unav", not Ruby Symbols.
UNKNOWN_INFORMATION =

{
  ":unac" => "temporarily inaccessible",
  ":unal" => "unallowed, suppressed intentionally",
  ":unap" => "not applicable, makes no sense",
  ":unas" => "value unassigned (e.g., Untitled)",
  ":unav" => "value unavailable, possibly unknown",
  ":unkn" => "known to be unknown (e.g., Anonymous, Inconnue)",
  ":none" => "never had a value, never will",
  ":null" => "explicitly and meaningfully empty",
  ":tba" => "to be assigned or announced later",
  ":etal" => "too numerous to list (et alia)",
}

Instance Method Summary collapse

#decode_container_id(id) ⇒ Object
#decode_doi(doi) ⇒ Object
#encode_container_id ⇒ Object
#encode_doi(prefix) ⇒ Object
#find_from_format(id: nil, string: nil, ext: nil, filename: nil) ⇒ Object
#find_from_format_by_ext(string, options = {}) ⇒ Object
#find_from_format_by_filename(filename) ⇒ Object
#find_from_format_by_id(id) ⇒ Object
#find_from_format_by_string(string) ⇒ Object
#from_csl(element) ⇒ Object
#from_datacite(element) ⇒ Object
#from_json_feed(element) ⇒ Object
#from_schema_org(element) ⇒ Object
#get_contributor(contributor, contributor_type) ⇒ Object
#get_date(dates, date_type) ⇒ Object
#get_date_from_date_parts(date_as_parts) ⇒ Object
#get_date_from_parts(year, month = nil, day = nil) ⇒ Object
#get_date_parts(iso8601_time) ⇒ Object
#get_date_parts_from_parts(year, month = nil, day = nil) ⇒ Object
#get_dates_from_date(date) ⇒ Object

convert commonmeta dates to DataCite format.
#get_datetime_from_iso8601(iso8601_time) ⇒ Object

Parsing of incomplete ISO 8601 timestamps such as 2015-04 is broken in the standard library, so we use the edtf gem. Returns nil if the ISO 8601 timestamp is invalid.
#get_datetime_from_time(time) ⇒ Object

ISO 8601 datetime without hyphens and colons, as used by Crossref. Returns nil if invalid.
#get_identifier(identifiers, identifier_type) ⇒ Object
#get_identifier_type(identifier_type) ⇒ Object
#get_iso8601_date(iso8601_time) ⇒ Object
#get_link(links, link_type) ⇒ Object
#get_series_information(str) ⇒ Object
#get_year_month(iso8601_time) ⇒ Object
#get_year_month_day(iso8601_time) ⇒ Object
#github_as_cff_url(url) ⇒ Object
#github_as_codemeta_url(url) ⇒ Object
#github_as_owner_url(url) ⇒ Object
#github_as_release_url(url) ⇒ Object
#github_as_repo_url(url) ⇒ Object
#github_from_url(url) ⇒ Object
#github_owner_from_url(url) ⇒ Object
#github_release_from_url(url) ⇒ Object
#github_repo_from_url(url) ⇒ Object
#hsh_to_fos(hsh) ⇒ Object
#hsh_to_spdx(hsh) ⇒ Object
#json_feed_url(id = nil) ⇒ Object
#jsonlint(json) ⇒ Object
#map_hash_keys(element: nil, mapping: nil) ⇒ Object
#name_to_fos(name) ⇒ Object
#name_to_spdx(name) ⇒ Object
#normalize_cc_url(id) ⇒ Object
#normalize_id(id, options = {}) ⇒ Object
#normalize_issn(input, options = {}) ⇒ Object

Pick the electronic ISSN if there are multiple. Format the ISSN as xxxx-xxxx.
#normalize_licenses(licenses) ⇒ Object

find Creative Commons or OSI license in licenses array, normalize url and name.
#normalize_orcid(orcid) ⇒ Object
#normalize_url(id, options = {}) ⇒ Object
#orcid_as_url(orcid) ⇒ Object
#orcid_from_url(url) ⇒ Object
#parse_attributes(element, options = {}) ⇒ Object
#rogue_scholar_api_url(id, _options = {}) ⇒ Object
#sanitize(text, options = {}) ⇒ Object
#spdx_to_hsh(hsh) ⇒ Object
#strip_milliseconds(iso8601_time) ⇒ Object

Strip milliseconds if there is a time, as they interfere with edtf parsing. Keep dates unchanged.
#to_csl(element) ⇒ Object
#to_datacite(element, options = {}) ⇒ Object
#to_identifier(identifier) ⇒ Object
#to_ris(element) ⇒ Object
#to_schema_org(element) ⇒ Object
#to_schema_org_citation(reference) ⇒ Object
#to_schema_org_container(element, options = {}) ⇒ Object
#to_schema_org_funder(funding_references) ⇒ Object
#to_schema_org_identifiers(element, _options = {}) ⇒ Object
#to_schema_org_relation(related_identifiers: nil, relation_type: nil) ⇒ Object
#to_schema_org_spatial_coverage(geo_location) ⇒ Object
#validate_orcid(orcid) ⇒ Object
#validate_orcid_scheme(orcid_scheme) ⇒ Object
#validate_url(str) ⇒ Object

Instance Method Details

#decode_container_id(id) ⇒ `Object`



1380
1381
1382

# File 'lib/commonmeta/utils.rb', line 1380

# Decodes a container identifier by handing it to Base32::URL.decode.
#
# @param id the encoded container identifier
# @return the value produced by Base32::URL.decode for +id+
def decode_container_id(id)
  decoder = Base32::URL
  decoder.decode(id)
end

#decode_doi(doi) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 1368

# Decodes the suffix of a DOI with Base32::URL.decode.
#
# The DOI is split on "/" into at most five pieces and the last piece is
# taken as the suffix, so both a bare "prefix/suffix" and a full
# "https://doi.org/prefix/suffix" URL yield the suffix.
#
# @param doi [String] DOI, with or without a resolver URL in front
# @return the value produced by Base32::URL.decode for the suffix
def decode_doi(doi)
  *_leading, encoded_suffix = doi.split("/", 5)
  Base32::URL.decode(encoded_suffix)
end

#encode_container_id ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 1373

# Generates a random container identifier.
#
# A random integer is drawn from 32**4 .. 32**5 - 1, i.e. the integers that
# need exactly five base-32 digits, and encoded with Base32::URL.encode with
# the checksum option enabled.
#
# @return the encoded identifier as returned by Base32::URL.encode
def encode_container_id
  smallest = 32 ** 4
  largest = (32 ** 5) - 1
  number = SecureRandom.random_number(smallest..largest)
  Base32::URL.encode(number, checksum: true)
end

#encode_doi(prefix) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 1358

# Builds a random DOI URL under the given prefix.
#
# A random integer is drawn from 32**7 .. 32**8 - 1 (the integers that need
# exactly eight base-32 digits) and encoded with Base32::URL.encode with the
# checksum option enabled. The encoded string is split after its fifth
# character and the two halves are joined with a hyphen.
#
# @param prefix the DOI prefix to place in front of the generated suffix
# @return [String] "https://doi.org/<prefix>/<first five>-<remainder>"
def encode_doi(prefix)
  number = SecureRandom.random_number((32 ** 7)..((32 ** 8) - 1))
  encoded = Base32::URL.encode(number, checksum: true)
  head = encoded[0, 5]
  tail = encoded[5, 10] # asks for up to ten characters starting at index 5
  "https://doi.org/#{prefix}/#{head}-#{tail}"
end

#find_from_format(id: nil, string: nil, ext: nil, filename: nil) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 441

# Picks the input format name by delegating to the first applicable finder.
#
# Precedence: +id+, then +string+ together with +ext+, then +string+ alone,
# then +filename+. When none of them is present the default "datacite" is
# returned.
#
# @param id identifier to inspect
# @param string content to inspect
# @param ext file extension accompanying +string+
# @param filename file name to inspect
# @return the result of the selected finder, or "datacite" by default
def find_from_format(id: nil, string: nil, ext: nil, filename: nil)
  return find_from_format_by_id(id) if id.present?

  if string.present?
    return find_from_format_by_ext(string, ext: ext) if ext.present?

    return find_from_format_by_string(string)
  end

  return find_from_format_by_filename(filename) if filename.present?

  "datacite"
end

#find_from_format_by_ext(string, options = {}) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 486

# Determines the format from a file extension passed as options[:ext].
#
# ".bib" and ".ris" are answered directly; ".xml" and ".json" are ambiguous,
# so the content is handed to find_from_format_by_string. Any other
# extension (including none) gives nil.
#
# @param string content, only consulted for ".xml" and ".json"
# @param options [Hash] expects the extension under the :ext key
# @return [String, nil] format name, or nil when the extension is unknown
def find_from_format_by_ext(string, options = {})
  extension = options[:ext]

  if ".bib" == extension
    "bibtex"
  elsif ".ris" == extension
    "ris"
  elsif [".xml", ".json"].include?(extension)
    find_from_format_by_string(string)
  end
end

#find_from_format_by_filename(filename) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 478

# Determines the format from a well-known file name.
#
# @param filename file name to inspect
# @return [String, nil] "npm" for "package.json", "cff" for "CITATION.cff",
#   nil for anything else
def find_from_format_by_filename(filename)
  case filename
  when "package.json" then "npm"
  when "CITATION.cff" then "cff"
  end
end

#find_from_format_by_id(id) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 455

# Determines the format from the shape of an identifier.
#
# The identifier is first passed through normalize_id and then matched
# against a sequence of patterns; the first match wins:
#
# * DOI (bare, "doi:"-prefixed, or behind doi.org / dx.doi.org /
#   handle.stage.datacite.org): the registration agency is looked up with
#   get_doi_ra and returned in lowercase when it is one of DataCite,
#   Crossref, mEDRA, KISTI, JaLC or OP; otherwise nil.
# * ORCID (bare or behind orcid.org): "orcid".
# * GitHub URL ending in /package.json: "npm".
# * GitHub URL ending in /codemeta.json: "codemeta".
# * Any other GitHub URL (including one ending in /CITATION.cff): "cff".
# * Rogue Scholar API post URL: "json_feed_item".
# * Everything else: "schema_org".
#
# @param id identifier or URL to classify
# @return [String, nil] format name; nil only for a DOI whose registration
#   agency is not in the list above
def find_from_format_by_id(id)
  id = normalize_id(id)

  if %r{\A(?:(http|https):/(/)?(dx\.)?(doi\.org|handle\.stage\.datacite\.org)/)?(doi:)?(10\.\d{4,5}/.+)\z}.match?(id)
    ra = get_doi_ra(id)
    %w[DataCite Crossref mEDRA KISTI JaLC OP].include?(ra) ? ra.downcase : nil
  elsif %r{\A(?:(http|https):/(/)?orcid\.org/)?(\d{4}-\d{4}-\d{4}-\d{3}[0-9X]+)\z}.match?(id)
    "orcid"
  # The dot in each file name below is escaped so it matches only a literal
  # ".", not any character.
  elsif %r{\A(http|https):/(/)?github\.com/(.+)/package\.json\z}.match?(id)
    "npm"
  elsif %r{\A(http|https):/(/)?github\.com/(.+)/codemeta\.json\z}.match?(id)
    "codemeta"
  elsif %r{\A(http|https):/(/)?github\.com/(.+)/CITATION\.cff\z}.match?(id)
    "cff"
  elsif %r{\A(http|https):/(/)?github\.com/(.+)\z}.match?(id)
    "cff"
  elsif %r{\A(http|https):/(/)?rogue-scholar\.org/api/posts/(.+)\z}.match?(id)
    "json_feed_item"
  else
    "schema_org"
  end
end

#find_from_format_by_string(string) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 497

# Determines the format by sniffing the content itself.
#
# The string is tried, in order, as JSON, XML and YAML; each parser's syntax
# error is swallowed so the next one can be tried. Finally the string is
# checked for an RIS header and for parseable BibTeX.
#
# JSON is classified as "schema_org" (an @context whose host is schema.org),
# "datacite" (schemaVersion starting with the DataCite kernel URL),
# "crossref" (source == "Crossref"), "csl" (issued/date-parts present) or
# "codemeta" (an @context equal to the codemeta JSON-LD URL). XML with a
# crossref_result root gives "crossref_xml"; YAML with a cff-version key
# gives "cff".
#
# @param string [String] raw content
# @return [String, nil] format name, or nil when nothing matched
def find_from_format_by_string(string)
  begin # try to parse as JSON
    hsh = MultiJson.load(string).to_h

    # @context is optional and need not be a URL string. Parse it once, and
    # treat a missing, non-string or unparseable value as "no context" rather
    # than letting URI::InvalidURIError escape: that error is not rescued
    # below and would abort the XML/YAML/RIS/BibTeX fallbacks.
    context = hsh["@context"]
    context_uri = begin
        URI.parse(context) if context.is_a?(String)
      rescue URI::InvalidURIError
        nil
      end

    if context_uri && context_uri.host == "schema.org"
      return "schema_org"
    elsif hsh.dig("schemaVersion").to_s.start_with?("http://datacite.org/schema/kernel")
      return "datacite"
    elsif hsh.dig("source") == "Crossref"
      return "crossref"
    elsif hsh.dig("issued", "date-parts").present?
      return "csl"
    elsif context_uri.to_s == "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld"
      return "codemeta"
    end
  rescue MultiJson::ParseError
    # not JSON; fall through to the next parser
  end

  begin # try to parse as XML
    hsh = Hash.from_xml(string)
    return "crossref_xml" if hsh.to_h.dig("crossref_result").present?
  rescue Nokogiri::XML::SyntaxError
    # not XML; fall through to the next parser
  end

  begin # try to parse as YAML
    # NOTE(review): YAML.load on external input -- the permitted_classes
    # argument suggests a Psych version where load is the safe variant;
    # confirm the minimum supported Psych version.
    hsh = YAML.load(string, permitted_classes: [Date])
    return "cff" if hsh.is_a?(Hash) && hsh.fetch("cff-version", nil).present?
  rescue Psych::SyntaxError
    # not YAML; fall through to the plain-text checks
  end

  if string.start_with?("TY  - ")
    "ris"
  elsif BibTeX.parse(string).first
    "bibtex"
  end
end

#from_csl(element) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 886

# Converts one CSL name hash, or an array of them, to the internal name
# shape.
#
# For each entry a "type" is set: "Organization" when "literal" or "name" is
# present (a literal is also copied to "name"), "Person" when "given" or
# "family" is present. "given"/"family" are copied to "givenName"/
# "familyName"; the CSL keys "given", "family" and "literal" and any nil
# values are dropped from the returned hash.
#
# Note that the incoming hashes are modified in place before the cleaned
# copy is produced.
#
# @param element [Hash, Array<Hash>] CSL name(s)
# @return the converted entries, passed through Array#unwrap
def from_csl(element)
  converted = Array.wrap(element).map do |entry|
    literal = entry["literal"]

    if literal.present?
      entry["type"] = "Organization"
      entry["name"] = literal
    elsif entry["name"].present?
      entry["type"] = "Organization"
    elsif entry["given"].present? || entry["family"].present?
      entry["type"] = "Person"
    end

    entry["givenName"] = entry["given"]
    entry["familyName"] = entry["family"]

    entry.except("given", "family", "literal").compact
  end

  converted.unwrap
end

#from_datacite(element) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 697

# Renames DataCite keys via map_hash_keys: "nameType" becomes "type" and
# "creatorName" becomes "name".
#
# @param element the DataCite element to convert
# @return the result of map_hash_keys for this renaming
def from_datacite(element)
  map_hash_keys(
    element: element,
    mapping: { "nameType" => "type", "creatorName" => "name" },
  )
end

#from_json_feed(element) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 880

# Renames JSON Feed keys via map_hash_keys: "url" becomes "id".
#
# @param element the JSON Feed element to convert
# @return the result of map_hash_keys for this renaming
def from_json_feed(element)
  map_hash_keys(
    element: element,
    mapping: { "url" => "id" },
  )
end

#from_schema_org(element) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 840

# Renames schema.org JSON-LD keys via map_hash_keys: "@type" becomes "type"
# and "@id" becomes "id".
#
# @param element the schema.org element to convert
# @return the result of map_hash_keys for this renaming
def from_schema_org(element)
  map_hash_keys(
    element: element,
    mapping: { "@type" => "type", "@id" => "id" },
  )
end

#get_contributor(contributor, contributor_type) ⇒ `Object`



1120
1121
1122

# File 'lib/commonmeta/utils.rb', line 1120

# Keeps only the contributors whose "contributorType" equals the requested
# type.
#
# @param contributor collection of contributor hashes
# @param contributor_type the "contributorType" value to keep
# @return the matching contributors
def get_contributor(contributor, contributor_type)
  contributor.filter do |candidate|
    candidate["contributorType"] == contributor_type
  end
end

#get_date(dates, date_type) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 1092

# Returns the "date" of the first entry whose "dateType" equals +date_type+.
#
# @param dates a date hash or an array of date hashes (wrapped with Array.wrap)
# @param date_type the "dateType" value to look for
# @return the matching entry's "date", or nil when there is no such entry
def get_date(dates, date_type)
  match = Array.wrap(dates).find { |entry| entry["dateType"] == date_type }
  match&.fetch("date", nil)
end

#get_date_from_date_parts(date_as_parts) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 1009

# Converts a CSL-style { "date-parts" => [[year, month, day]] } hash to a
# date string using get_date_from_parts.
#
# Only the first date-parts entry is used. Returns nil when that entry is
# [nil], and also whenever a NoMethodError is raised along the way (for
# example when the "date-parts" list is empty, so there is no first entry).
#
# @param date_as_parts [Hash] hash with a "date-parts" key
# @return [String, nil] the formatted date, or nil
def get_date_from_date_parts(date_as_parts)
  parts = date_as_parts.fetch("date-parts", []).first
  return nil if parts == [nil]

  # Index explicitly: a nil +parts+ must raise NoMethodError here so that the
  # rescue below turns it into nil.
  get_date_from_parts(parts[0], parts[1], parts[2])
rescue NoMethodError # if parts is nil
  nil
end

#get_date_from_parts(year, month = nil, day = nil) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 1021

# Joins year, month and day into "YYYY-MM-DD", leaving out missing parts.
#
# Each part is converted to a string and left-padded with zeros (year to
# four characters, month and day to two). A part whose padded form is "00"
# -- which is what nil and 0 produce for month and day -- is dropped before
# joining with "-".
#
# @param year year value
# @param month month value (optional)
# @param day day value (optional)
# @return [String] e.g. "2020-05-03", "2020-05" or "2020"
def get_date_from_parts(year, month = nil, day = nil)
  padded = [[year, 4], [month, 2], [day, 2]].map do |value, width|
    value.to_s.rjust(width, "0")
  end

  (padded - ["00"]).join("-")
end

#get_date_parts(iso8601_time) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 998

# Converts an ISO 8601 date string to a CSL-style "date-parts" hash.
#
# Year, month and day are read from fixed character positions (0..3, 5..6,
# 8..9), converted to integers, and zero values are dropped.
#
# @param iso8601_time [String, nil] e.g. "2020-05-03" or "2020-05"
# @return [Hash, nil] { "date-parts" => [[year, month, day]] } with absent
#   parts omitted; an empty inner list for nil input; nil when slicing
#   raises TypeError
def get_date_parts(iso8601_time)
  return { "date-parts" => [[]] } if iso8601_time.nil?

  numbers = [0..3, 5..6, 8..9].map { |span| iso8601_time[span].to_i }
  { "date-parts" => [numbers.reject(&:zero?)] }
rescue TypeError
  nil
end

#get_date_parts_from_parts(year, month = nil, day = nil) ⇒ `Object`



1027
1028
1029

# File 'lib/commonmeta/utils.rb', line 1027

# Builds a CSL-style "date-parts" hash from separate year, month and day.
#
# Each value is converted with to_i; values that come out as zero (such as
# nil) are dropped.
#
# @param year year value
# @param month month value (optional)
# @param day day value (optional)
# @return [Hash] { "date-parts" => [[year, month, day]] } with absent parts
#   omitted
def get_date_parts_from_parts(year, month = nil, day = nil)
  numbers = [year, month, day].map(&:to_i)
  { "date-parts" => [numbers.reject(&:zero?)] }
end

#get_dates_from_date(date) ⇒ `Object`

convert commonmeta dates to DataCite format

# File 'lib/commonmeta/utils.rb', line 1107

# Converts a hash of dates to a list of
# { "date" => ..., "dateType" => ... } hashes (DataCite format).
#
# The key "published" is renamed to "issued" via map_hash_keys; every key is
# then capitalized to form the "dateType".
#
# @param date [Hash, nil] dates keyed by kind
# @return [Array<Hash>, nil] one hash per entry, or nil for nil input
def get_dates_from_date(date)
  return nil if date.nil?

  renamed = map_hash_keys(element: date, mapping: { "published" => "issued" })

  renamed.map do |kind, value|
    { "date" => value, "dateType" => kind.capitalize }
  end
end

#get_datetime_from_iso8601(iso8601_time) ⇒ `Object`

Parsing of incomplete ISO 8601 timestamps such as 2015-04 is broken in the standard library, so we use the edtf gem. Returns nil if the ISO 8601 timestamp is invalid.

# File 'lib/commonmeta/utils.rb', line 1066

# Parses a (possibly incomplete) ISO 8601 timestamp into a UTC time.
#
# Parsing is done with Date.edtf; any StandardError raised while parsing or
# converting is turned into nil.
#
# @param iso8601_time the timestamp to parse, e.g. "2015-04"
# @return the parsed value converted with to_time.utc, or nil on failure
def get_datetime_from_iso8601(iso8601_time)
  begin
    parsed = Date.edtf(iso8601_time)
    parsed.to_time.utc
  rescue StandardError
    nil
  end
end

#get_datetime_from_time(time) ⇒ `Object`

ISO 8601 datetime without hyphens and colons, as used by Crossref. Returns nil if invalid.

# File 'lib/commonmeta/utils.rb', line 1086

# Reformats a compact "YYYYMMDDHHMMSS" timestamp as
# "YYYY-MM-DDTHH:MM:SSZ".
#
# @param time compact timestamp; converted with to_s before parsing
# @return [String, nil] the reformatted timestamp, or nil when parsing
#   raises ArgumentError
def get_datetime_from_time(time)
  compact = time.to_s
  parsed = DateTime.strptime(compact, "%Y%m%d%H%M%S")
  parsed.strftime("%Y-%m-%dT%H:%M:%SZ")
rescue ArgumentError
  nil
end

#get_identifier(identifiers, identifier_type) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 1124

# Returns the "identifier" of the first entry whose "identifierType" equals
# +identifier_type+.
#
# @param identifiers an identifier hash or an array of them (wrapped with
#   Array.wrap)
# @param identifier_type the "identifierType" value to look for
# @return the matching entry's "identifier", or nil when there is no such
#   entry
def get_identifier(identifiers, identifier_type)
  match = Array.wrap(identifiers).find do |entry|
    entry["identifierType"] == identifier_type
  end
  match&.fetch("identifier", nil)
end

#get_identifier_type(identifier_type) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 1129

# Normalizes the spelling of an identifier type name.
#
# The name is looked up case-insensitively in a fixed table of known types
# (e.g. "doi" => "DOI", "arxiv" => "arXiv"). Unknown names are returned
# unchanged.
#
# @param identifier_type [String, nil] type name in any capitalization
# @return [String, nil] canonical spelling, the input itself when unknown,
#   or nil when the input is blank
def get_identifier_type(identifier_type)
  return nil unless identifier_type.present?

  canonical_names = {
    "ark" => "ARK",
    "arxiv" => "arXiv",
    "bibcode" => "bibcode",
    "doi" => "DOI",
    "ean13" => "EAN13",
    "eissn" => "EISSN",
    "handle" => "Handle",
    "igsn" => "IGSN",
    "isbn" => "ISBN",
    "issn" => "ISSN",
    "istc" => "ISTC",
    "lissn" => "LISSN",
    "lsid" => "LSID",
    "pmid" => "PMID",
    "purl" => "PURL",
    "upc" => "UPC",
    "url" => "URL",
    "urn" => "URN",
    "md5" => "md5",
    "minid" => "minid",
    "dataguid" => "dataguid",
  }

  # No table value is nil or false, so fetch-with-default is equivalent to
  # a lookup followed by "|| identifier_type".
  canonical_names.fetch(identifier_type.downcase, identifier_type)
end

#get_iso8601_date(iso8601_time) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 1031

# Truncates an ISO 8601 timestamp to its date portion.
#
# Inputs shorter than four characters (and nil) give nil. Otherwise at most
# the first ten characters are returned, so "YYYY" and "YYYY-MM" come back
# whole and longer values are cut to "YYYY-MM-DD".
#
# @param iso8601_time [String, nil] timestamp or partial date
# @return [String, nil] the leading date portion, or nil
def get_iso8601_date(iso8601_time)
  return nil if iso8601_time.nil? || iso8601_time.length < 4

  # Taking up to ten characters covers every length: a four- or
  # seven-character input is simply returned in full.
  iso8601_time[0, 10]
end

#get_link(links, link_type) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 1097

# Returns the "href" of the first link whose "rel" equals +link_type+.
#
# @param links a link hash or an array of them (wrapped with Array.wrap)
# @param link_type the "rel" value to look for
# @return the matching link's "href", or nil when there is no such link
def get_link(links, link_type)
  match = Array.wrap(links).find { |link| link["rel"] == link_type }
  match&.fetch("href", nil)
end

#get_series_information(str) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 1159

# Splits a comma-separated series description into its components.
#
# The input is split on "," and each segment is stripped:
# * the first segment is the title;
# * when there are more than two segments, the second one is parsed as
#   "volume(issue)": the text in the last parenthesized group is the issue,
#   and the text before it (or, failing that, after it) is the volume;
# * when there is more than one segment, the last one is the page range,
#   split on "-" into first and last page.
#
# @param str [String, nil] e.g. "Title, 12(3), 10-20"
# @return [Hash] the keys "title", "volume", "issue", "firstPage" and
#   "lastPage", with nil values removed; {} for blank input
def get_series_information(str)
  return {} unless str.present?

  segments = str.split(",").map(&:strip)

  volume = nil
  issue = nil
  if segments.length > 2
    before, parenthesized, after = segments[1].rpartition(/\(([^)]+)\)/)
    volume = before.presence || after.presence
    # drop the surrounding parentheses of the matched group
    issue = parenthesized[1...-1].presence
  end

  first_page = nil
  last_page = nil
  pages = segments.length > 1 ? segments.last : nil
  if pages.present?
    page_bounds = pages.split("-").map(&:strip)
    first_page = page_bounds[0]
    last_page = page_bounds[1]
  end

  {
    "title" => segments.first,
    "volume" => volume,
    "issue" => issue,
    "firstPage" => first_page,
    "lastPage" => last_page,
  }.compact
end

#get_year_month(iso8601_time) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 1044

# Extracts [year, month] as integers from an ISO 8601 date string.
#
# The values are read from fixed character positions (0..3 and 5..6);
# parts that convert to zero are dropped.
#
# @param iso8601_time [String, nil] e.g. "2020-05-03"
# @return [Array<Integer>] [year, month], [year] or []
def get_year_month(iso8601_time)
  return [] if iso8601_time.nil?

  [iso8601_time[0..3], iso8601_time[5..6]].map(&:to_i).reject(&:zero?)
end

#get_year_month_day(iso8601_time) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 1053

# Extracts [year, month, day] as integers from an ISO 8601 date string.
#
# The values are read from fixed character positions (0..3, 5..6 and 8..9);
# parts that convert to zero are dropped.
#
# @param iso8601_time [String, nil] e.g. "2020-05-03"
# @return [Array<Integer>] up to three integers, or []
def get_year_month_day(iso8601_time)
  return [] if iso8601_time.nil?

  slices = [iso8601_time[0..3], iso8601_time[5..6], iso8601_time[8..9]]
  slices.map(&:to_i).reject(&:zero?)
end

#github_as_cff_url(url) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 988

# Builds the raw.githubusercontent.com URL of a repository's CITATION.cff.
#
# When the given URL already points at a CITATION.cff file, its release and
# path are reused. Otherwise, if an owner could be extracted, the file is
# assumed to live at the root of the "main" branch. Nothing is returned when
# no owner is found.
#
# @param url GitHub URL
# @return [String, nil] raw file URL, or nil
def github_as_cff_url(url)
  owner, repo, release, path = github_from_url(url).values_at(:owner, :repo, :release, :path)

  if path.to_s.end_with?("CITATION.cff")
    "https://raw.githubusercontent.com/#{owner}/#{repo}/#{release}/#{path}"
  elsif owner.present?
    "https://raw.githubusercontent.com/#{owner}/#{repo}/main/CITATION.cff"
  end
end

#github_as_codemeta_url(url) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 978

# Builds the raw.githubusercontent.com URL of a repository's codemeta.json.
#
# When the given URL already points at a codemeta.json file, its release and
# path are reused. Otherwise, if an owner could be extracted, the file is
# assumed to live at the root of the "master" branch. Nothing is returned
# when no owner is found.
#
# @param url GitHub URL
# @return [String, nil] raw file URL, or nil
def github_as_codemeta_url(url)
  owner, repo, release, path = github_from_url(url).values_at(:owner, :repo, :release, :path)

  if path.to_s.end_with?("codemeta.json")
    "https://raw.githubusercontent.com/#{owner}/#{repo}/#{release}/#{path}"
  elsif owner.present?
    "https://raw.githubusercontent.com/#{owner}/#{repo}/master/codemeta.json"
  end
end

#github_as_owner_url(url) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 959

# Builds the GitHub URL of the owner (user or organization) of a repository.
#
# @param url GitHub URL
# @return [String, nil] "https://github.com/<owner>", or nil when no owner
#   could be extracted
def github_as_owner_url(url)
  owner = github_from_url(url)[:owner]
  return nil unless owner.present?

  "https://github.com/#{owner}"
end

#github_as_release_url(url) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 971

# Builds the GitHub "tree" URL of a specific release of a repository.
#
# @param url GitHub URL
# @return [String, nil] "https://github.com/<owner>/<repo>/tree/<release>",
#   or nil when the URL carries no release
def github_as_release_url(url)
  owner, repo, release = github_from_url(url).values_at(:owner, :repo, :release)

  "https://github.com/#{owner}/#{repo}/tree/#{release}" if release.present?
end

#github_as_repo_url(url) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 964

# Return the canonical repository URL for a GitHub URL.
#
# @param url [String] GitHub URL, parsed with github_from_url
# @return [String, nil] "https://github.com/<owner>/<repo>", or nil when the
#   URL carries no repository segment
def github_as_repo_url(url)
  parts = github_from_url(url)

  if parts[:repo].present?
    "https://github.com/#{parts[:owner]}/#{parts[:repo]}"
  end
end

#github_from_url(url) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 938

# Break a https://github.com/... URL into its path components.
#
# The URL path is split on "/": segment 0 is the owner, segment 1 the repo,
# segment 3 the release (segment 2, e.g. "tree" or "blob", is skipped) and
# everything from segment 4 on is joined back into the file path.
#
# @param url [String, nil] URL to inspect
# @return [Hash] subset of :owner, :repo, :release, :path (nil values are
#   dropped); {} when the URL is not a https://github.com/ URL
def github_from_url(url)
  github_pattern = %r{\Ahttps://github\.com/(.+)(?:/)?(.+)?(?:/tree/)?(.*)\z}
  return {} unless github_pattern.match?(url)

  segments = URI.parse(url).path[1..-1].split("/")
  owner, repo, _kind, release, *rest = segments
  # with exactly four segments the path is an empty string, not nil
  path = segments.length > 3 ? rest.join("/") : nil

  { owner: owner, repo: repo, release: release, path: path }.compact
end

#github_owner_from_url(url) ⇒ `Object`



955
956
957

# File 'lib/commonmeta/utils.rb', line 955

# Extract the owner (first path segment) from a GitHub URL.
#
# @param url [String] GitHub URL, parsed with github_from_url
# @return [String, nil] owner name, or nil when none was found
def github_owner_from_url(url)
  parts = github_from_url(url)
  parts[:owner]
end

#github_release_from_url(url) ⇒ `Object`



951
952
953

# File 'lib/commonmeta/utils.rb', line 951

# Extract the release (fourth path segment) from a GitHub URL.
#
# @param url [String] GitHub URL, parsed with github_from_url
# @return [String, nil] release name, or nil when none was found
def github_release_from_url(url)
  parts = github_from_url(url)
  parts[:release]
end

#github_repo_from_url(url) ⇒ `Object`



947
948
949

# File 'lib/commonmeta/utils.rb', line 947

# Extract the repository name (second path segment) from a GitHub URL.
#
# @param url [String] GitHub URL, parsed with github_from_url
# @return [String, nil] repository name, or nil when none was found
def github_repo_from_url(url)
  parts = github_from_url(url)
  parts[:repo]
end

#hsh_to_fos(hsh) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 1285

# Convert a subject hash into a list of subject entries, adding an OECD
# Fields of Science (FOS) entry when the subject can be mapped to one.
#
# Lookup order:
# 1. the subject text ("__content__" or "subject") against the FOS labels,
#    with or without a "FOS: " prefix;
# 2. the Fields of Research (FOR) mappings, by identifier when
#    "subjectScheme" is "FOR", otherwise by label.
#
# @param hsh [Hash] subject hash; reads "__content__", "subject",
#   "subjectScheme", "schemeURI"/"schemeUri", "valueURI"/"valueUri",
#   "classificationCode" and "lang"
# @return [Array<Hash>] the sanitized original subject, followed by the FOS
#   subject when a mapping was found
def hsh_to_fos(hsh)
  # first find subject in Fields of Science (OECD)
  fos = JSON.load(File.read(File.expand_path("../../resources/oecd/fos-mappings.json",
                                             __dir__))).fetch("fosFields")
  subject = fos.find do |l|
    l["fosLabel"] == hsh["__content__"] || "FOS: " + l["fosLabel"] == hsh["__content__"] || l["fosLabel"] == hsh["subject"]
  end

  if subject
    return [{
             "subject" => sanitize(hsh["__content__"] || hsh["subject"]),
             "subjectScheme" => hsh["subjectScheme"],
             "schemeUri" => hsh["schemeURI"] || hsh["schemeUri"],
             "valueUri" => hsh["valueURI"] || hsh["valueUri"],
             "classificationCode" => hsh["classificationCode"],
             "lang" => hsh["lang"],
           }.compact,
            {
             "subject" => "FOS: " + subject["fosLabel"],
             "subjectScheme" => "Fields of Science and Technology (FOS)",
             "schemeUri" => "http://www.oecd.org/science/inno/38235147.pdf",
           }.compact]
  end

  # if not found, look in Fields of Research (Australian and New Zealand Standard Research Classification)
  # and map to Fields of Science. Add an extra entry for the latter
  fores = JSON.load(File.read(File.expand_path("../../resources/oecd/for-mappings.json",
                                               __dir__)))
  for_fields = fores.fetch("forFields")
  for_disciplines = fores.fetch("forDisciplines")

  # try to extract forId
  if hsh["subjectScheme"] == "FOR"
    # the identifier is the first whitespace-separated token of the subject text
    for_id = hsh["__content__"].to_s.split(" ").first || hsh["subject"].to_s.split(" ").first
    # to_s: a hash with neither "__content__" nor "subject" leaves for_id nil;
    # pad it to "000000" instead of raising NoMethodError on nil
    for_id = for_id.to_s.rjust(6, "0")

    # fields are looked up by the 6-character id, disciplines by its first four characters
    subject = for_fields.find { |l| l["forId"] == for_id } ||
              for_disciplines.find { |l| l["forId"] == for_id[0..3] }
  else
    subject = for_fields.find do |l|
      l["forLabel"] == hsh["__content__"] || l["forLabel"] == hsh["subject"]
    end ||
              for_disciplines.find do |l|
                l["forLabel"] == hsh["__content__"] || l["forLabel"] == hsh["subject"]
              end
  end

  if subject
    # NOTE(review): assumes every FOR mapping entry carries a "fosLabel" - confirm against for-mappings.json
    [{
      "subject" => sanitize(hsh["__content__"] || hsh["subject"]),
      "subjectScheme" => hsh["subjectScheme"],
      "classificationCode" => hsh["classificationCode"],
      "schemeUri" => hsh["schemeURI"] || hsh["schemeUri"],
      "valueUri" => hsh["valueURI"] || hsh["valueUri"],
      "lang" => hsh["lang"],
    }.compact,
     {
      "subject" => "FOS: " + subject["fosLabel"],
      "subjectScheme" => "Fields of Science and Technology (FOS)",
      "schemeUri" => "http://www.oecd.org/science/inno/38235147.pdf",
    }]
  else
    [{
      "subject" => sanitize(hsh["__content__"] || hsh["subject"]),
      "subjectScheme" => hsh["subjectScheme"],
      "classificationCode" => hsh["classificationCode"],
      "schemeUri" => hsh["schemeURI"] || hsh["schemeUri"],
      "valueUri" => hsh["valueURI"] || hsh["valueUri"],
      "lang" => hsh["lang"],
    }.compact]
  end
end

#hsh_to_spdx(hsh) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 1204

# Look up the SPDX license matching a rights hash.
#
# A license matches when its id equals "rightsIdentifier" (case-insensitive),
# its first "seeAlso" URL equals the normalized "rightsUri" or "rights", or
# its name equals "rights".
#
# Side effect: when "rightsUri" is blank, the "rightsURI" key is removed from
# `hsh` and its value stored under "rightsUri".
#
# @param hsh [Hash] reads "rightsIdentifier", "rightsUri"/"rightsURI", "rights"
# @return [Hash] "id" and "url" of the matching license; otherwise the
#   lower-cased "rightsIdentifier" and the rights URI (nil values dropped)
def hsh_to_spdx(hsh)
  licenses_file = File.expand_path("../../resources/spdx/licenses.json",
                                   __dir__)
  spdx_licenses = JSON.load(File.read(licenses_file)).fetch("licenses")
  hsh["rightsUri"] = hsh.delete("rightsURI") if hsh["rightsUri"].blank?

  match = spdx_licenses.find do |entry|
    next true if entry["licenseId"].casecmp?(hsh["rightsIdentifier"])
    next true if entry["seeAlso"].first == normalize_cc_url(hsh["rightsUri"])
    next true if entry["name"] == hsh["rights"]

    entry["seeAlso"].first == normalize_cc_url(hsh["rights"])
  end

  return { "id" => match["licenseId"], "url" => match["seeAlso"].first }.compact if match

  {
    "id" => hsh["rightsIdentifier"].presence&.downcase,
    "url" => hsh["rightsURI"] || hsh["rightsUri"],
  }.compact
end

#json_feed_url(id = nil) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 1384

# Return the Rogue Scholar API URL for a blog, or for all posts.
#
# @param id [String, nil] blog identifier
# @return [String] the blog URL when an id is given, otherwise the posts URL
def json_feed_url(id = nil)
  if id.present?
    "https://rogue-scholar.org/api/blogs/#{id}"
  else
    "https://rogue-scholar.org/api/posts"
  end
end

#jsonlint(json) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 1181

# Run the JsonLint checks on a JSON string and collect the errors.
#
# @param json [String, nil] JSON text to check
# @return [Array] errors collected by the linter, or ["No JSON provided"]
#   when the input is blank
def jsonlint(json)
  if json.present?
    errors = []
    # check_data is invoked through send; it fills the array passed to it
    JsonLint::Linter.new.send(:check_data, json, errors)
    errors
  else
    ["No JSON provided"]
  end
end

#map_hash_keys(element: nil, mapping: nil) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 846

# Rename the keys of one or more hashes according to a mapping and adjust
# selected values along the way.
#
# Keys missing from the mapping are kept as they are. For each renamed key:
# - an "affiliation" array gets "@type" => "Organization" merged into every
#   hash element;
# - a "type" string is capitalized;
# - any other hash value is converted with to_schema_org;
# - everything else is copied unchanged.
#
# @param element [Hash, Array<Hash>, nil] hash(es) to convert
# @param mapping [Hash] old key => new key
# @return [Hash, Array<Hash>] converted hashes, passed through `unwrap`
def map_hash_keys(element: nil, mapping: nil)
  converted = Array.wrap(element).map do |item|
    renamed = item.map { |key, value| [mapping.fetch(key, key), value] }

    renamed.each_with_object({}) do |(key, value), result|
      result[key] =
        if key == "affiliation" && value.is_a?(Array)
          value.map do |affiliation|
            affiliation.is_a?(Hash) ? affiliation.merge("@type" => "Organization") : affiliation
          end
        elsif key == "type" && value.is_a?(String)
          value.capitalize
        elsif value.is_a?(Hash)
          to_schema_org(value)
        else
          value
        end
    end
  end

  converted.unwrap
end

#name_to_fos(name) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 1243

# Convert a subject name into subject entries, adding an OECD Fields of
# Science (FOS) entry when the name can be mapped to one.
#
# The name is first compared with the FOS labels (with or without a "FOS: "
# prefix). Only when that fails are the Fields of Research (FOR) mappings
# loaded and searched by label, fields before disciplines.
#
# @param name [String] subject name
# @return [Array<Hash>] the sanitized, lower-cased subject, followed by the
#   FOS subject when a mapping was found
def name_to_fos(name)
  fos_file = File.expand_path("../../resources/oecd/fos-mappings.json",
                              __dir__)
  fos_fields = JSON.load(File.read(fos_file)).fetch("fosFields")
  match = fos_fields.find do |field|
    field["fosLabel"] == name || "FOS: " + field["fosLabel"] == name
  end

  unless match
    # fall back to the Fields of Research mappings, which map to FOS labels
    for_file = File.expand_path("../../resources/oecd/for-mappings.json",
                                __dir__)
    fores = JSON.load(File.read(for_file))
    for_fields = fores.fetch("forFields")
    for_disciplines = fores.fetch("forDisciplines")

    match = for_fields.find { |field| field["forLabel"] == name } ||
            for_disciplines.find { |discipline| discipline["forLabel"] == name }
  end

  entries = [{ "subject" => sanitize(name).downcase }]
  return entries unless match

  entries << {
    "subject" => "FOS: " + match["fosLabel"],
    "subjectScheme" => "Fields of Science and Technology (FOS)",
    "schemeUri" => "http://www.oecd.org/science/inno/38235147.pdf",
  }
end

#name_to_spdx(name) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 1190

# Look up an SPDX license by name, license id or URL.
#
# @param name [String] license name, SPDX id, or URL (compared after
#   normalize_cc_url with each license's first "seeAlso" entry)
# @return [Hash] "id" and "url" of the matching license, or
#   { "rights" => name } when nothing matches
def name_to_spdx(name)
  licenses_file = File.expand_path("../../resources/spdx/licenses.json",
                                   __dir__)
  spdx_licenses = JSON.load(File.read(licenses_file)).fetch("licenses")

  match = spdx_licenses.find do |entry|
    next true if entry["name"] == name
    next true if entry["licenseId"] == name

    entry["seeAlso"].first == normalize_cc_url(name)
  end
  return { "rights" => name } unless match

  { "id" => match["licenseId"], "url" => match["seeAlso"].first }.compact
end

#normalize_cc_url(id) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 612

# Normalize a license URL to https and map it through NORMALIZED_LICENSES.
#
# @param id [String, nil] license URL
# @return [String, nil] the mapped URL when the table has an entry for the
#   normalized URL, otherwise the normalized URL itself
def normalize_cc_url(id)
  https_url = normalize_url(id, https: true)
  NORMALIZED_LICENSES.fetch(https_url) { https_url }
end

#normalize_id(id, options = {}) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 573

# Normalize an identifier: a DOI when it is one, otherwise a cleaned-up
# http(s) URL.
#
# @param id [String, nil] identifier to normalize
# @param options [Hash] passed on to normalize_doi
# @return [String, nil] normalized DOI or URL; nil for blank input, for
#   anything that is not an http(s) URL with a host, and for unparsable URIs
def normalize_id(id, options = {})
  return nil if id.blank?

  # a valid DOI takes precedence over plain URL handling
  doi = normalize_doi(id, options)
  return doi if doi.present?

  uri = Addressable::URI.parse(id)
  http_uri = uri && uri.host && %w[http https].include?(uri.scheme)

  http_uri ? PostRank::URI.clean(id) : nil
rescue Addressable::URI::InvalidURIError
  nil
end

#normalize_issn(input, options = {}) ⇒ `Object`

Pick the electronic ISSN if there are multiple ISSNs, and format the ISSN as xxxx-xxxx.

# File 'lib/commonmeta/utils.rb', line 627

# Extract an ISSN from a string, hash or array and format it as xxxx-xxxx.
#
# - String: used directly, but only when options[:content] is not given.
# - Hash: the value stored under the content key.
# - Array: the entry whose "media_type" is "electronic", or else the first
#   entry; its value under the content key is used.
#
# @param input [String, Hash, Array<Hash>, nil]
# @param options [Hash] :content - key holding the ISSN (default "__content__")
# @return [String, nil] the ISSN when it is 9 characters long, a hyphenated
#   copy when it is 8 characters long, otherwise nil
def normalize_issn(input, options = {})
  key = options[:content] || "__content__"

  issn =
    if input.blank?
      nil
    else
      case input
      when String
        options[:content].nil? ? input : nil
      when Hash
        input.fetch(key, nil)
      when Array
        preferred = input.find { |entry| entry["media_type"] == "electronic" } || input.first
        preferred.fetch(key, nil)
      end
    end

  length = issn.to_s.length
  if length == 9
    issn
  elsif length == 8
    # insert the hyphen between the two four-character halves
    issn[0..3] + "-" + issn[4..7]
  end
end

#normalize_licenses(licenses) ⇒ `Object`

find Creative Commons or OSI license in licenses array, normalize url and name

# File 'lib/commonmeta/utils.rb', line 650

# Find a Creative Commons or OSI license in the licenses array and return
# its normalized URL.
#
# Normalization forces https, strips any subdomain from the host and
#   - for creativecommons.org: drops a trailing "legalcode" segment and
#     makes sure the path ends with "/";
#   - for opensource.org: removes "-license", ".php" and ".html", fixes the
#     case of well-known license names and rewrites a trailing version as
#     "<name>-<major>.<minor>".
#
# @param licenses [Hash, Array<Hash>, nil] license hashes with a "url" key
# @return [String, Array, Hash, nil] the normalized URL of the first
#   Creative Commons / OSI license; the unchanged input when there is none;
#   nil when a URL cannot be parsed
def normalize_licenses(licenses)
  standard_licenses = Array.wrap(licenses).map do |l|
    URI.parse(l["url"])
  end.select { |li| li.host && li.host[/(creativecommons\.org|opensource\.org)$/] }
  return licenses unless standard_licenses.present?

  # The code below works on a single URI, which was never assigned before
  # (NameError on `uri`): normalize the first standard license found.
  uri = standard_licenses.first

  # use HTTPS
  uri.scheme = "https"

  # use host name without subdomain
  uri.host = Array(/(creativecommons\.org|opensource\.org)/.match(uri.host)).last

  # normalize URLs
  if uri.host == "creativecommons.org"
    segments = uri.path.split("/")
    uri.path = segments[0..-2].join("/") if segments.last == "legalcode"
    # assign a new string instead of appending to the parsed path in place
    uri.path = uri.path + "/" unless uri.path.end_with?("/")
  else
    uri.path = uri.path.gsub(/(-license|\.php|\.html)/, "")
    uri.path = uri.path.sub(/(mit|afl|apl|osl|gpl|ecl)/) { |match| match.upcase }
    uri.path = uri.path.sub(/(artistic|apache)/) { |match| match.titleize }
    uri.path = uri.path.sub(/([^0-9-]+)(-)?([1-9])?(\.)?([0-9])?$/) do
      m = Regexp.last_match
      text = m[1]

      if m[3].present?
        # a missing minor version defaults to 0, e.g. "GPL-3" becomes "GPL-3.0"
        version = [m[3], m[5].presence || "0"].join(".")
        [text, version].join("-")
      else
        text
      end
    end
  end

  uri.to_s
rescue URI::InvalidURIError
  nil
end

#normalize_orcid(orcid) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 617

# Turn an ORCID iD (bare or as a URL) into a https://orcid.org/ URL.
#
# @param orcid [String, nil] ORCID iD, checked with validate_orcid
# @return [String, nil] the ORCID URL, or nil when validation fails
def normalize_orcid(orcid)
  valid_orcid = validate_orcid(orcid)

  valid_orcid.present? ? "https://orcid.org/" + Addressable::URI.encode(valid_orcid) : nil
end

#normalize_url(id, options = {}) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 590

# Normalize an http, https or ftp URL.
#
# @param id [String, nil] URL to normalize
# @param options [Hash] :https - when truthy, the scheme is set to https
# @return [String, nil] the normalized URL; the input itself when it starts
#   with "info"; nil for blank input, unsupported schemes, URLs without a
#   host and unparsable URIs
def normalize_url(id, options = {})
  return nil if id.blank?
  # info URIs are passed through untouched
  return id if id.to_s.start_with?("info")

  uri = Addressable::URI.parse(id)
  supported = uri && uri.host && %w[http https ftp].include?(uri.scheme)
  return nil unless supported

  uri.scheme = "https" if options[:https]
  uri.path = PostRank::URI.clean(uri.path)

  uri.to_s
rescue Addressable::URI::InvalidURIError
  nil
end

#orcid_as_url(orcid) ⇒ `Object`



537
538
539

# File 'lib/commonmeta/utils.rb', line 537

# Prefix an ORCID iD with https://orcid.org/.
#
# @param orcid [String, nil] ORCID iD
# @return [String, nil] the ORCID URL, or nil for blank input
def orcid_as_url(orcid)
  return nil unless orcid.present?

  "https://orcid.org/#{orcid}"
end

#orcid_from_url(url) ⇒ `Object`



533
534
535

# File 'lib/commonmeta/utils.rb', line 533

# Extract the ORCID iD from an http(s)://orcid.org/ URL.
#
# The scheme group is non-capturing, so the only capture (and therefore the
# last element of the match array) is the part after "orcid.org/". The
# previous pattern began with a literal ":" (a mistyped "(?:") and so never
# matched a real ORCID URL.
#
# @param url [String, nil] ORCID URL
# @return [String, nil] everything after "orcid.org/", or nil when the URL
#   does not match
def orcid_from_url(url)
  Array(%r{\A(?:http|https)://orcid\.org/(.+)}.match(url)).last
end

#parse_attributes(element, options = {}) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 560

# Pull the text content out of a string, hash or array.
#
# - String: HTML-unescaped and returned, but only when options[:content] is
#   not given (otherwise the result is nil).
# - Hash: the value stored under the content key.
# - Array: for each element, the hash value under the content key (or the
#   element itself when it is not a hash); duplicates are removed.
#
# @param element [String, Hash, Array, nil]
# @param options [Hash] :content - key to read (default "__content__");
#   :first - return only the first array value
# @return [Object, nil] the extracted value(s); nil for any other input type
def parse_attributes(element, options = {})
  content = options[:content] || "__content__"

  case element
  when String
    options[:content].nil? ? CGI.unescapeHTML(element) : nil
  when Hash
    element.fetch(CGI.unescapeHTML(content), nil)
  when Array
    values = element.map do |item|
      item.is_a?(Hash) ? item.fetch(CGI.unescapeHTML(content), nil) : item
    end.uniq

    options[:first] ? values.first : values.unwrap
  end
end

#rogue_scholar_api_url(id, _options = {}) ⇒ `Object`



1102
1103
1104

# File 'lib/commonmeta/utils.rb', line 1102

# Return the Rogue Scholar API URL of a single post.
#
# @param id [#to_s] post identifier
# @param _options [Hash] unused
# @return [String] "https://rogue-scholar.org/api/posts/<id>"
def rogue_scholar_api_url(id, _options = {})
  "https://rogue-scholar.org/api/posts/" + id.to_s
end

#sanitize(text, options = {}) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 922

# Scrub HTML from text, keeping only the allowed tags.
#
# - String: scrubbed with Loofah and the whitelist scrubber, then squished.
# - Hash: the value under the content key is sanitized (with default options).
# - Array: every element (or its content value, for hashes) is sanitized
#   with default options and duplicates are removed.
#
# Side effect: sets options[:tags] on the passed-in hash when it is missing.
#
# @param text [String, Hash, Array, nil]
# @param options [Hash] :tags - allowed tag names; :content - key to read
#   from hashes (default "__content__"); :first - return only the first
#   array value
# @return [String, Array, nil] sanitized text; nil for any other input type
def sanitize(text, options = {})
  options[:tags] ||= Set.new(%w[strong em b i code pre sub sup br])
  content = options[:content] || "__content__"
  custom_scrubber = Commonmeta::WhitelistScrubber.new(options)

  case text
  when String
    # squish removes excessive internal whitespace
    Loofah.scrub_fragment(text, custom_scrubber).to_s.squish
  when Hash
    sanitize(text.fetch(content, nil))
  when Array
    cleaned = text.map do |item|
      item.is_a?(Hash) ? sanitize(item.fetch(content, nil)) : sanitize(item)
    end.uniq

    options[:first] ? cleaned.first : cleaned.unwrap
  end
end

#spdx_to_hsh(hsh) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 1222

# Expand a license hash ("id", "url") into a rights entry using the SPDX
# license list.
#
# @param hsh [Hash, nil] license hash; "id" is compared case-insensitively
#   with the SPDX license ids
# @return [Array<Hash>, nil] a one-element array with the rights entry built
#   from the SPDX record, or from the input "id"/"url" when no record
#   matches; nil when the input is blank or not a hash
def spdx_to_hsh(hsh)
  return nil if hsh.blank? || !hsh.is_a?(Hash)

  licenses_file = File.expand_path("../../resources/spdx/licenses.json",
                                   __dir__)
  spdx_licenses = JSON.load(File.read(licenses_file)).fetch("licenses")
  match = spdx_licenses.find { |entry| entry["licenseId"].casecmp?(hsh["id"]) }

  return [{ "rightsIdentifier" => hsh["id"], "rightsURI" => hsh["url"] }.compact] unless match

  rights = {
    "rightsIdentifier" => match["licenseId"].downcase,
    "rightsUri" => match["seeAlso"].first,
    "rights" => match["name"],
    "rightsIdentifierScheme" => "SPDX",
    "schemeUri" => "https://spdx.org/licenses/",
  }
  [rights.compact]
end

#strip_milliseconds(iso8601_time) ⇒ `Object`

Strip milliseconds if there is a time, as they interfere with EDTF parsing; keep dates unchanged.

# File 'lib/commonmeta/utils.rb', line 1074

# Reduce a timestamp to a form without milliseconds or UTC offset.
#
# The first matching rule wins:
# - contains a space: everything before the first space is returned;
# - contains ".":     everything before the first "." plus "Z";
# - contains "+":     everything before the first "+" plus "Z";
# - otherwise the input is returned unchanged.
#
# @param iso8601_time [String, nil]
# @return [String, nil]
def strip_milliseconds(iso8601_time)
  text = iso8601_time.to_s

  if text.include?(" ")
    iso8601_time.split(" ").first
  elsif text.include?(".")
    iso8601_time.split(".").first + "Z"
  elsif text.include?("+")
    iso8601_time.split("+").first + "Z"
  else
    iso8601_time
  end
end

#to_csl(element) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 902

# Convert contributor hashes into CSL name hashes.
#
# "familyName" and "givenName" become "family" and "given"; "literal" is set
# from "name" when there is no family name. Keys that do not belong in the
# CSL output and nil values are removed.
#
# The conversion works on a copy of each hash, so the caller's hashes are
# left untouched (the previous version wrote the CSL keys into them).
#
# @param element [Hash, Array<Hash>, nil] contributor(s)
# @return [Array<Hash>, nil] CSL names, or nil when there are none
def to_csl(element)
  Array.wrap(element).map do |a|
    # merge returns a new hash; new keys are appended in the same order as before
    csl = a.merge("family" => a["familyName"], "given" => a["givenName"])
    csl["literal"] = a["name"] unless a["familyName"].present?
    csl.except("nameType", "type", "@type", "id", "@id", "name", "familyName", "givenName",
               "affiliation", "contributorType").compact
  end.presence
end

#to_datacite(element, options = {}) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 688

# Rewrite the keys of one or more hashes with `dasherize`.
#
# @param element [Hash, Array<Hash>, nil] hash(es) to convert
# @param options [Hash] :first - when truthy, the result is passed through
#   `unwrap` instead of `presence`
# @return [Hash, Array<Hash>, nil] the converted hashes
def to_datacite(element, options = {})
  converted = Array.wrap(element).map do |entry|
    entry.map { |key, value| [key.dasherize, value] }.to_h
  end
  return converted.unwrap if options[:first]

  converted.presence
end

#to_identifier(identifier) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 872

# Convert a related-identifier hash into a schema.org PropertyValue.
#
# @param identifier [Hash] reads "relatedIdentifierType" and "relatedIdentifier"
# @return [Hash] with "@type", "propertyID" and "value" (nil values are kept)
def to_identifier(identifier)
  property_value = { "@type" => "PropertyValue" }
  property_value["propertyID"] = identifier["relatedIdentifierType"]
  property_value["value"] = identifier["relatedIdentifier"]
  property_value
end

#to_ris(element) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 912

# Format contributor names for RIS output.
#
# @param element [Hash, Array<Hash>, nil] contributor(s)
# @return [String, Array<String>] "familyName, givenName" when a family name
#   is present, otherwise the "name" value; passed through `unwrap`
def to_ris(element)
  names = Array.wrap(element).map do |person|
    next person["name"] unless person["familyName"].present?

    [person["familyName"], person["givenName"]].join(", ")
  end

  names.unwrap
end

#to_schema_org(element) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 703

# Convert hash(es) to schema.org naming: "type" => "@type", "id" => "@id",
# "title" => "name". The conversion itself is done by map_hash_keys.
#
# @param element [Hash, Array<Hash>, nil]
# @return [Hash, Array<Hash>] the result of map_hash_keys
def to_schema_org(element)
  map_hash_keys(
    element: element,
    mapping: { "type" => "@type", "id" => "@id", "title" => "name" },
  )
end

#to_schema_org_citation(reference) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 766

# Convert a reference hash into a schema.org CreativeWork.
#
# @param reference [Hash, nil] reads "doi", "title" and "publicationYear"
# @return [Hash, nil] the CreativeWork with nil values removed; nil for
#   blank input
def to_schema_org_citation(reference)
  return nil if reference.blank?

  doi = reference["doi"]
  citation = {
    "@type" => "CreativeWork",
    "@id" => (normalize_id(doi) if doi),
    "name" => reference["title"],
    "datePublished" => reference["publicationYear"],
  }
  citation.compact
end

#to_schema_org_container(element, options = {}) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 709

# Convert a container hash into a schema.org container description.
#
# An identifier of type "ISSN" is emitted as "issn"; any other identifier is
# emitted as "@id". The type is only included when there is an id or a name.
#
# @param element [Hash, nil] reads "identifier", "identifierType", "title"
#   and "type"; may be nil when options[:container_title] is given
# @param options [Hash] :container_title - overrides the element title;
#   :type - overrides the element type
# @return [Hash, nil] the container with nil values removed; nil when the
#   element is not a hash and no container title is given
def to_schema_org_container(element, options = {})
  return nil unless element.is_a?(Hash) || (element.nil? && options[:container_title].present?)

  # The guard above lets a nil element through when a container title is
  # supplied; fall back to an empty hash so the lookups below do not raise.
  element ||= {}

  issn = element["identifier"] if element["identifierType"] == "ISSN"
  id = issn.blank? ? element["identifier"] : nil
  name = options[:container_title] || element["title"]
  type = id || name ? options[:type] || element["type"] : nil

  { "@id" => id, "@type" => type, "name" => name, "issn" => issn }.compact
end

#to_schema_org_funder(funding_references) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 754

# Convert funding references into schema.org Organization hashes.
#
# @param funding_references [Hash, Array<Hash>, nil] reads "funderIdentifier"
#   and "funderName" from each reference
# @return [Hash, Array<Hash>, nil] funder(s) with nil values removed, passed
#   through `unwrap`; nil for blank input
def to_schema_org_funder(funding_references)
  return nil if funding_references.blank?

  funders = Array.wrap(funding_references).map do |reference|
    funder = {
      "@id" => reference["funderIdentifier"],
      "@type" => "Organization",
      "name" => reference["funderName"],
    }
    funder.compact
  end

  funders.unwrap
end

#to_schema_org_identifiers(element, _options = {}) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 720

# Convert alternate identifiers into schema.org PropertyValue hashes.
#
# @param element [Hash, Array<Hash>, nil] reads "alternateIdentifierType"
#   and "alternateIdentifier" from each entry
# @param _options [Hash] unused
# @return [Hash, Array<Hash>] PropertyValue hash(es), passed through `unwrap`
def to_schema_org_identifiers(element, _options = {})
  property_values = Array.wrap(element).map do |alternate|
    property_value = { "@type" => "PropertyValue" }
    property_value["propertyID"] = alternate["alternateIdentifierType"]
    property_value["value"] = alternate["alternateIdentifier"]
    property_value
  end

  property_values.unwrap
end

#to_schema_org_relation(related_identifiers: nil, relation_type: nil) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 730

# Convert the related identifiers of a given relation type into schema.org
# objects.
#
# The relation type "References" also selects "Cites" and "Documents". An
# ISSN with relation type "IsPartOf" becomes a Periodical; everything else
# becomes an object with a normalized "@id" and a "@type" looked up in
# DC_TO_SO_TRANSLATIONS (default "CreativeWork").
#
# @param related_identifiers [Hash, Array<Hash>, nil]
# @param relation_type [String, nil]
# @return [Hash, Array<Hash>, nil] converted relation(s), passed through
#   `unwrap`; nil when either argument is blank
def to_schema_org_relation(related_identifiers: nil, relation_type: nil)
  return nil unless related_identifiers.present? && relation_type.present?

  accepted_types = relation_type == "References" ? %w[References Cites Documents] : [relation_type]
  selected = Array.wrap(related_identifiers).select do |related|
    accepted_types.include?(related["relationType"])
  end

  relations = selected.map do |related|
    periodical = related["relatedIdentifierType"] == "ISSN" && related["relationType"] == "IsPartOf"

    if periodical
      { "@type" => "Periodical", "issn" => related["relatedIdentifier"] }.compact
    else
      {
        "@id" => normalize_id(related["relatedIdentifier"]),
        "@type" => DC_TO_SO_TRANSLATIONS[related["resourceTypeGeneral"]] || "CreativeWork",
      }.compact
    end
  end

  relations.unwrap
end

#to_schema_org_spatial_coverage(geo_location) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 777

# Convert geo locations into schema.org Place hashes.
#
# Each geo location can contribute several places, in this order: one for
# its point (GeoCoordinates), one for its box (GeoShape with "box") and one
# for its polygon (GeoShape with "polygon"). A location that only has a
# place name yields a single GeoCoordinates entry carrying the address.
#
# @param geo_location [Hash, Array<Hash>, nil] reads "geoLocationPlace",
#   "geoLocationPoint", "geoLocationBox" and "geoLocationPolygon"
# @return [Hash, Array<Hash>, nil] place(s), passed through `unwrap`; nil
#   for blank input
def to_schema_org_spatial_coverage(geo_location)
  return nil if geo_location.blank?

  places = []

  Array.wrap(geo_location).each do |gl|
    point = gl.fetch("geoLocationPoint", nil)
    box = gl.fetch("geoLocationBox", nil)
    polygon = gl.fetch("geoLocationPolygon", nil)

    if point
      places << {
        "@type" => "Place",
        "geo" => {
          "@type" => "GeoCoordinates",
          "address" => gl["geoLocationPlace"],
          "latitude" => gl.dig("geoLocationPoint", "pointLatitude"),
          "longitude" => gl.dig("geoLocationPoint", "pointLongitude"),
        },
      }.compact
    end

    if box
      # the box is serialized as "south west north east", skipping missing bounds
      bounds = [gl.dig("geoLocationBox", "southBoundLatitude"),
                gl.dig("geoLocationBox", "westBoundLongitude"),
                gl.dig("geoLocationBox", "northBoundLatitude"),
                gl.dig("geoLocationBox", "eastBoundLongitude")]
      places << {
        "@type" => "Place",
        "geo" => {
          "@type" => "GeoShape",
          "address" => gl["geoLocationPlace"],
          "box" => bounds.compact.join(" ").presence,
        }.compact,
      }.compact
    end

    if polygon
      # every polygon point becomes a [longitude, latitude] pair
      rings = Array.wrap(gl.dig("geoLocationPolygon")).map do |ring|
        Array.wrap(ring).map do |vertex|
          [vertex.dig("polygonPoint", "pointLongitude"),
           vertex.dig("polygonPoint", "pointLatitude")].compact
        end.compact
      end
      places << {
        "@type" => "Place",
        "geo" => {
          "@type" => "GeoShape",
          "address" => gl["geoLocationPlace"],
          "polygon" => rings.compact.presence,
        },
      }
    end

    # a place name on its own is only emitted when no geometry was given
    place_only = gl.fetch("geoLocationPlace", nil) && !point && !box && !polygon
    next unless place_only

    places << {
      "@type" => "Place",
      "geo" => {
        "@type" => "GeoCoordinates",
        "address" => gl["geoLocationPlace"],
      },
    }.compact
  end

  places.unwrap
end

#validate_orcid(orcid) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 541

# Validate an ORCID iD, given bare or as an orcid.org URL (optionally with a
# "www." or "sandbox." subdomain).
#
# The iD must be four groups of four characters separated by "-" or
# whitespace; the last character may be a digit or "X". The identifier has
# to end right after that final character: the previous pattern
# (`[0-9X]+`) also accepted over-long identifiers.
#
# @param orcid [String, nil]
# @return [String, nil] the iD with whitespace separators replaced by "-",
#   or nil when the input is not a valid ORCID iD
def validate_orcid(orcid)
  pattern = %r{\A(?:(?:http|https)://(?:(?:www|sandbox)?\.)?orcid\.org/)?(\d{4}[[:space:]-]\d{4}[[:space:]-]\d{4}[[:space:]-]\d{3}[0-9X])\z}
  # the capture is nil without a match and never empty with one
  captured = Array(pattern.match(orcid)).last
  captured&.gsub(/[[:space:]]/, "-")
end

#validate_orcid_scheme(orcid_scheme) ⇒ `Object`



546
547
548

# File 'lib/commonmeta/utils.rb', line 546

# Check that a scheme URI starts with http(s)://orcid.org (optionally with a
# "www." prefix).
#
# @param orcid_scheme [String, nil]
# @return [String, nil] "orcid.org" when the scheme URI matches, otherwise nil
def validate_orcid_scheme(orcid_scheme)
  matched = %r{\A(http|https)://(www\.)?(orcid\.org)}.match(orcid_scheme)
  # group 3 is the last capture and holds the host
  matched ? matched[3] : nil
end

#validate_url(str) ⇒ `Object`

# File 'lib/commonmeta/utils.rb', line 550

# Classify a string as a DOI, a URL or an ISSN.
#
# - "DOI": a DOI, bare, with a "doi:" prefix, or as a (dx.)doi.org URL. The
#   dot in "doi.org" is escaped so that only that host is recognized.
# - "URL": any other string starting with http:// or https://.
# - "ISSN": "ISSN " or "eISSN " followed by xxxx-xxxx, where the single
#   final character may be a digit or "X" (the previous pattern accepted
#   any number of trailing check characters).
#
# @param str [String, nil]
# @return [String, nil] "DOI", "URL", "ISSN", or nil when nothing matches
def validate_url(str)
  if %r{\A(?:(http|https)://(dx\.)?doi\.org/)?(doi:)?(10\.\d{4,5}/.+)\z}.match?(str)
    "DOI"
  elsif %r{\A(http|https)://}.match?(str)
    "URL"
  elsif /\A(ISSN|eISSN) (\d{4}-\d{3}[0-9X])\z/.match?(str)
    "ISSN"
  end
end

Module: Commonmeta::Utils

Constant Summary collapse

Instance Method Summary collapse

Instance Method Details

#decode_container_id(id) ⇒ Object

#decode_doi(doi) ⇒ Object

#encode_container_id ⇒ Object

#encode_doi(prefix) ⇒ Object

#find_from_format(id: nil, string: nil, ext: nil, filename: nil) ⇒ Object

#find_from_format_by_ext(string, options = {}) ⇒ Object

#find_from_format_by_filename(filename) ⇒ Object

#find_from_format_by_id(id) ⇒ Object

#find_from_format_by_string(string) ⇒ Object

#from_csl(element) ⇒ Object

#from_datacite(element) ⇒ Object

#from_json_feed(element) ⇒ Object

#from_schema_org(element) ⇒ Object

#get_contributor(contributor, contributor_type) ⇒ Object

#get_date(dates, date_type) ⇒ Object

#get_date_from_date_parts(date_as_parts) ⇒ Object

#get_date_from_parts(year, month = nil, day = nil) ⇒ Object

#get_date_parts(iso8601_time) ⇒ Object

#get_date_parts_from_parts(year, month = nil, day = nil) ⇒ Object

#get_dates_from_date(date) ⇒ Object

#get_datetime_from_iso8601(iso8601_time) ⇒ Object

#get_datetime_from_time(time) ⇒ Object

#get_identifier(identifiers, identifier_type) ⇒ Object

#get_identifier_type(identifier_type) ⇒ Object

#get_iso8601_date(iso8601_time) ⇒ Object

#get_link(links, link_type) ⇒ Object

#get_series_information(str) ⇒ Object

#get_year_month(iso8601_time) ⇒ Object

#get_year_month_day(iso8601_time) ⇒ Object

#github_as_cff_url(url) ⇒ Object

#github_as_codemeta_url(url) ⇒ Object

#github_as_owner_url(url) ⇒ Object

#github_as_release_url(url) ⇒ Object

#github_as_repo_url(url) ⇒ Object

#github_from_url(url) ⇒ Object

#github_owner_from_url(url) ⇒ Object

#github_release_from_url(url) ⇒ Object

#github_repo_from_url(url) ⇒ Object

#hsh_to_fos(hsh) ⇒ Object

#hsh_to_spdx(hsh) ⇒ Object

#json_feed_url(id = nil) ⇒ Object

#jsonlint(json) ⇒ Object

#map_hash_keys(element: nil, mapping: nil) ⇒ Object

#name_to_fos(name) ⇒ Object

#name_to_spdx(name) ⇒ Object

#normalize_cc_url(id) ⇒ Object

#normalize_id(id, options = {}) ⇒ Object

#normalize_issn(input, options = {}) ⇒ Object

#normalize_licenses(licenses) ⇒ Object

#normalize_orcid(orcid) ⇒ Object

#normalize_url(id, options = {}) ⇒ Object

#orcid_as_url(orcid) ⇒ Object

#orcid_from_url(url) ⇒ Object

#parse_attributes(element, options = {}) ⇒ Object

#rogue_scholar_api_url(id, _options = {}) ⇒ Object

#sanitize(text, options = {}) ⇒ Object

#spdx_to_hsh(hsh) ⇒ Object

#strip_milliseconds(iso8601_time) ⇒ Object

#to_csl(element) ⇒ Object

#to_datacite(element, options = {}) ⇒ Object

#to_identifier(identifier) ⇒ Object

#to_ris(element) ⇒ Object

#to_schema_org(element) ⇒ Object

#to_schema_org_citation(reference) ⇒ Object

#to_schema_org_container(element, options = {}) ⇒ Object

#to_schema_org_funder(funding_references) ⇒ Object

#to_schema_org_identifiers(element, _options = {}) ⇒ Object

#to_schema_org_relation(related_identifiers: nil, relation_type: nil) ⇒ Object

#to_schema_org_spatial_coverage(geo_location) ⇒ Object

#validate_orcid(orcid) ⇒ Object

#validate_orcid_scheme(orcid_scheme) ⇒ Object

#validate_url(str) ⇒ Object