Module: Bolognese::Utils
- Included in:
- CLI, MetadataUtils
- Defined in:
- lib/bolognese/utils.rb
Constant Summary collapse
- LICENSE_NAMES =
{ "http://creativecommons.org/publicdomain/zero/1.0/" => "Public Domain (CC0 1.0)", "http://creativecommons.org/licenses/by/3.0/" => "Creative Commons Attribution 3.0 (CC-BY 3.0)", "http://creativecommons.org/licenses/by/4.0/" => "Creative Commons Attribution 4.0 (CC-BY 4.0)", "http://creativecommons.org/licenses/by-nc/4.0/" => "Creative Commons Attribution Noncommercial 4.0 (CC-BY-NC 4.0)", "http://creativecommons.org/licenses/by-sa/4.0/" => "Creative Commons Attribution Share Alike 4.0 (CC-BY-SA 4.0)", "http://creativecommons.org/licenses/by-nc-nd/4.0/" => "Creative Commons Attribution Noncommercial No Derivatives 4.0 (CC-BY-NC-ND 4.0)" }
- DC_TO_SO_TRANSLATIONS =
{ "Audiovisual" => "MediaObject", "Collection" => "Collection", "Dataset" => "Dataset", "Event" => "Event", "Image" => "ImageObject", "InteractiveResource" => nil, "Model" => nil, "PhysicalObject" => nil, "Service" => "Service", "Software" => "SoftwareSourceCode", "Sound" => "AudioObject", "Text" => "ScholarlyArticle", "Workflow" => nil, "Other" => "CreativeWork", # not part of DataCite schema, but used internally "Periodical" => "Periodical", "DataCatalog" => "DataCatalog" }
- DC_TO_CP_TRANSLATIONS =
{ "Audiovisual" => "motion_picture", "Collection" => nil, "Dataset" => "dataset", "Event" => nil, "Image" => "graphic", "InteractiveResource" => nil, "Model" => nil, "PhysicalObject" => nil, "Service" => nil, "Sound" => "song", "Text" => "report", "Workflow" => nil, "Other" => nil }
- CR_TO_CP_TRANSLATIONS =
{ "Proceedings" => nil, "ReferenceBook" => nil, "JournalIssue" => nil, "ProceedingsArticle" => "paper-conference", "Other" => nil, "Dissertation" => "thesis", "Dataset" => "dataset", "EditedBook" => "book", "JournalArticle" => "article-journal", "Journal" => nil, "Report" => "report", "BookSeries" => nil, "ReportSeries" => nil, "BookTrack" => nil, "Standard" => nil, "BookSection" => "chapter", "BookPart" => nil, "Book" => "book", "BookChapter" => "chapter", "StandardSeries" => nil, "Monograph" => "book", "Component" => nil, "ReferenceEntry" => "entry-dictionary", "JournalVolume" => nil, "BookSet" => nil }
- CR_TO_SO_TRANSLATIONS =
{ "Proceedings" => nil, "ReferenceBook" => "Book", "JournalIssue" => "PublicationIssue", "ProceedingsArticle" => nil, "Other" => "CreativeWork", "Dissertation" => "Thesis", "Dataset" => "Dataset", "EditedBook" => "Book", "JournalArticle" => "ScholarlyArticle", "Journal" => nil, "Report" => nil, "BookSeries" => nil, "ReportSeries" => nil, "BookTrack" => nil, "Standard" => nil, "BookSection" => nil, "BookPart" => nil, "Book" => "Book", "BookChapter" => "Chapter", "StandardSeries" => nil, "Monograph" => "Book", "Component" => "CreativeWork", "ReferenceEntry" => nil, "JournalVolume" => "PublicationVolume", "BookSet" => nil, "PostedContent" => "ScholarlyArticle" }
- CR_TO_BIB_TRANSLATIONS =
{ "Proceedings" => "proceedings", "ReferenceBook" => "book", "JournalIssue" => nil, "ProceedingsArticle" => nil, "Other" => nil, "Dissertation" => "phdthesis", "Dataset" => nil, "EditedBook" => "book", "JournalArticle" => "article", "Journal" => nil, "Report" => nil, "BookSeries" => nil, "ReportSeries" => nil, "BookTrack" => nil, "Standard" => nil, "BookSection" => "inbook", "BookPart" => nil, "Book" => "book", "BookChapter" => "inbook", "StandardSeries" => nil, "Monograph" => "book", "Component" => nil, "ReferenceEntry" => nil, "JournalVolume" => nil, "BookSet" => nil, "PostedContent" => "article" }
- BIB_TO_CR_TRANSLATIONS =
{ "proceedings" => "Proceedings", "phdthesis" => "Dissertation", "article" => "JournalArticle", "book" => "Book", "inbook" => "BookChapter" }
- CR_TO_JATS_TRANSLATIONS =
{ "Proceedings" => "working-paper", "ReferenceBook" => "book", "JournalIssue" => "journal", "ProceedingsArticle" => "working-paper", "Other" => nil, "Dissertation" => nil, "Dataset" => "data", "EditedBook" => "book", "JournalArticle" => "journal", "Journal" => "journal", "Report" => "report", "BookSeries" => "book", "ReportSeries" => "report", "BookTrack" => "book", "Standard" => "standard", "BookSection" => "chapter", "BookPart" => "chapter", "Book" => "book", "BookChapter" => "chapter", "StandardSeries" => "standard", "Monograph" => "book", "Component" => nil, "ReferenceEntry" => nil, "JournalVolume" => "journal", "BookSet" => "book" }
- SO_TO_DC_TRANSLATIONS =
{ "Article" => "Text", "AudioObject" => "Sound", "Blog" => "Text", "BlogPosting" => "Text", "Chapter" => "Text", "Collection" => "Collection", "DataCatalog" => "Dataset", "Dataset" => "Dataset", "Event" => "Event", "ImageObject" => "Image", "Movie" => "Audiovisual", "PublicationIssue" => "Text", "ScholarlyArticle" => "Text", "Thesis" => "Text", "Service" => "Service", "SoftwareSourceCode" => "Software", "VideoObject" => "Audiovisual", "WebPage" => "Text", "WebSite" => "Text" }
- SO_TO_JATS_TRANSLATIONS =
{ "Article" => "journal", "AudioObject" => nil, "Blog" => nil, "BlogPosting" => nil, "Book" => "book", "Collection" => nil, "CreativeWork" => nil, "DataCatalog" => "data", "Dataset" => "data", "Event" => nil, "ImageObject" => nil, "Movie" => nil, "PublicationIssue" => "journal", "ScholarlyArticle" => "journal", "Service" => nil, "SoftwareSourceCode" => "software", "VideoObject" => nil, "WebPage" => nil, "WebSite" => "website" }
- SO_TO_CP_TRANSLATIONS =
{ "Article" => "", "AudioObject" => "song", "Blog" => "report", "BlogPosting" => "post-weblog", "Collection" => nil, "CreativeWork" => nil, "DataCatalog" => "dataset", "Dataset" => "dataset", "Event" => nil, "ImageObject" => "graphic", "Movie" => "motion_picture", "PublicationIssue" => nil, "ScholarlyArticle" => "article-journal", "Service" => nil, "Thesis" => "thesis", "VideoObject" => "broadcast", "WebPage" => "webpage", "WebSite" => "webpage" }
- SO_TO_RIS_TRANSLATIONS =
{ "Article" => nil, "AudioObject" => nil, "Blog" => nil, "BlogPosting" => "BLOG", "Collection" => nil, "CreativeWork" => "GEN", "DataCatalog" => "CTLG", "Dataset" => "DATA", "Event" => nil, "ImageObject" => "FIGURE", "Movie" => "MPCT", "PublicationIssue" => nil, "ScholarlyArticle" => "JOUR", "Service" => nil, "SoftwareSourceCode" => "COMP", "VideoObject" => "VIDEO", "WebPage" => "ELEC", "WebSite" => nil }
- CR_TO_RIS_TRANSLATIONS =
{ "Proceedings" => "CONF", "ReferenceBook" => "BOOK", "JournalIssue" => nil, "ProceedingsArticle" => "CPAPER", "Other" => "GEN", "Dissertation" => "THES", "Dataset" => "DATA", "EditedBook" => "BOOK", "JournalArticle" => "JOUR", "Journal" => nil, "Report" => nil, "BookSeries" => nil, "ReportSeries" => nil, "BookTrack" => nil, "Standard" => nil, "BookSection" => "CHAP", "BookPart" => "CHAP", "Book" => "BOOK", "BookChapter" => "CHAP", "StandardSeries" => nil, "Monograph" => "BOOK", "Component" => nil, "ReferenceEntry" => "DICT", "JournalVolume" => nil, "BookSet" => nil }
- DC_TO_RIS_TRANSLATIONS =
{ "Audiovisual" => "MPCT", "Collection" => nil, "Dataset" => "DATA", "Event" => nil, "Image" => "FIGURE", "InteractiveResource" => nil, "Model" => nil, "PhysicalObject" => nil, "Service" => nil, "Software" => "COMP", "Sound" => "SOUND", "Text" => "RPRT", "Workflow" => nil, "Other" => nil }
- SO_TO_BIB_TRANSLATIONS =
{ "Article" => "article", "AudioObject" => "misc", "Thesis" => "phdthesis", "Blog" => "misc", "BlogPosting" => "article", "Collection" => "misc", "CreativeWork" => "misc", "DataCatalog" => "misc", "Dataset" => "misc", "Event" => "misc", "ImageObject" => "misc", "Movie" => "misc", "PublicationIssue" => "misc", "ScholarlyArticle" => "article", "Service" => "misc", "SoftwareSourceCode" => "misc", "VideoObject" => "misc", "WebPage" => "misc", "WebSite" => "misc" }
Instance Method Summary collapse
- #find_from_format(id: nil, string: nil, ext: nil) ⇒ Object
- #find_from_format_by_ext(string, options = {}) ⇒ Object
- #find_from_format_by_id(id) ⇒ Object
- #find_from_format_by_string(string) ⇒ Object
- #from_citeproc(element) ⇒ Object
- #from_datacite_json(element) ⇒ Object
- #from_schema_org(element) ⇒ Object
- #get_contributor(contributor, contributor_type) ⇒ Object
- #get_date(dates, date_type) ⇒ Object
- #get_date_from_date_parts(date_as_parts) ⇒ Object
- #get_date_from_parts(year, month = nil, day = nil) ⇒ Object
- #get_date_parts(iso8601_time) ⇒ Object
- #get_date_parts_from_parts(year, month = nil, day = nil) ⇒ Object
-
#get_datetime_from_iso8601(iso8601_time) ⇒ Object
Parsing of incomplete ISO 8601 timestamps such as 2015-04 is broken in the standard library. Returns nil if the ISO 8601 timestamp is invalid.
- #get_year_month(iso8601_time) ⇒ Object
- #get_year_month_day(iso8601_time) ⇒ Object
- #github_as_codemeta_url(url) ⇒ Object
- #github_as_owner_url(url) ⇒ Object
- #github_as_release_url(url) ⇒ Object
- #github_as_repo_url(url) ⇒ Object
- #github_from_url(url) ⇒ Object
- #github_owner_from_url(url) ⇒ Object
- #github_release_from_url(url) ⇒ Object
- #github_repo_from_url(url) ⇒ Object
- #jsonlint(json) ⇒ Object
- #map_hash_keys(element: nil, mapping: nil) ⇒ Object
- #normalize_id(id, options = {}) ⇒ Object
- #normalize_ids(ids: nil, relation_type: nil) ⇒ Object
-
#normalize_licenses(licenses) ⇒ Object
Finds a Creative Commons or OSI license in the licenses array and normalizes its URL and name.
- #normalize_orcid(orcid) ⇒ Object
- #normalize_url(id) ⇒ Object
- #orcid_as_url(orcid) ⇒ Object
- #orcid_from_url(url) ⇒ Object
- #parse_attributes(element, options = {}) ⇒ Object
- #sanitize(text, options = {}) ⇒ Object
- #to_citeproc(element) ⇒ Object
- #to_datacite_json(element, options = {}) ⇒ Object
- #to_identifier(identifier) ⇒ Object
- #to_ris(element) ⇒ Object
- #to_schema_org(element) ⇒ Object
- #to_schema_org_container(element, options = {}) ⇒ Object
- #to_schema_org_funder(funding_references) ⇒ Object
- #to_schema_org_identifier(element, options = {}) ⇒ Object
- #to_schema_org_relation(related_identifiers: nil, relation_type: nil) ⇒ Object
- #to_schema_org_spatial_coverage(geo_location) ⇒ Object
- #validate_orcid(orcid) ⇒ Object
- #validate_orcid_scheme(orcid_scheme) ⇒ Object
- #validate_url(str) ⇒ Object
Instance Method Details
#find_from_format(id: nil, string: nil, ext: nil) ⇒ Object
325 326 327 328 329 330 331 332 333 334 335 |
# File 'lib/bolognese/utils.rb', line 325

# Picks a format name from whichever hint is present.
# Precedence is id, then ext, then string; "datacite" is the fallback.
#
# @param id [Object, nil] passed to #find_from_format_by_id when present
# @param string [Object, nil] raw metadata passed to the ext/string detectors
# @param ext [Object, nil] forwarded as the :ext option of #find_from_format_by_ext
# @return [Object] whatever the selected detector returns, or "datacite"
def find_from_format(id: nil, string: nil, ext: nil)
  return find_from_format_by_id(id) if id.present?
  return find_from_format_by_ext(string, ext: ext) if ext.present?
  return find_from_format_by_string(string) if string.present?

  "datacite"
end
#find_from_format_by_ext(string, options = {}) ⇒ Object
352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 |
# File 'lib/bolognese/utils.rb', line 352

# Detects the metadata format of +string+, using options[:ext] to decide
# which parser to try.
#
# @param string [String] raw metadata
# @param options [Hash] only :ext (e.g. ".bib", ".ris", ".xml", ".json") is read
# @return [String, nil] one of "bibtex", "ris", "crossref", "datacite",
#   "schema_org", "codemeta", "datacite_json", "crosscite", "citeproc";
#   nil when nothing matches
def find_from_format_by_ext(string, options = {})
  case options[:ext]
  when ".bib"
    "bibtex"
  when ".ris"
    "ris"
  when ".xml"
    if Maremma.from_xml(string).to_h.dig("doi_records", "doi_record", "crossref")
      "crossref"
    elsif Nokogiri::XML(string, nil, 'UTF-8', &:noblanks).collect_namespaces.find { |_k, v| v.start_with?("http://datacite.org/schema/kernel") }
      "datacite"
    end
  when ".json"
    # Parse once and reuse the result for every JSON-based check.
    json = Maremma.from_json(string).to_h

    if json.dig("@context").to_s.start_with?("http://schema.org", "https://schema.org")
      "schema_org"
    elsif json.dig("@context") == "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld"
      "codemeta"
    elsif json.dig("schemaVersion").to_s.start_with?("http://datacite.org/schema/kernel")
      "datacite_json"
    elsif json.dig("types")
      "crosscite"
    elsif json.dig("issued", "date-parts").present?
      "citeproc"
    end
  end
end
#find_from_format_by_id(id) ⇒ Object
337 338 339 340 341 342 343 344 345 346 347 348 349 350 |
# File 'lib/bolognese/utils.rb', line 337

# Detects the source format from an identifier.
#
# The identifier is first passed through #normalize_id. DOIs are resolved to
# a registration agency via #get_doi_ra; only "DataCite" and "Crossref" are
# accepted, and are returned lowercased.
#
# @param id [String] DOI, ORCID iD, GitHub URL or other URL
# @return [String, nil] "datacite", "crossref", "orcid", "codemeta" or
#   "schema_org"; nil for a DOI whose agency is neither DataCite nor Crossref
def find_from_format_by_id(id)
  id = normalize_id(id)

  case id
  # dots in the host names are escaped so they only match a literal "."
  when /\A(?:(http|https):\/(\/)?(dx\.)?(doi\.org|handle\.test\.datacite\.org)\/)?(doi:)?(10\.\d{4,5}\/.+)\z/
    ra = get_doi_ra(id)
    %w(DataCite Crossref).include?(ra) ? ra.downcase : nil
  when /\A(?:(http|https):\/(\/)?orcid\.org\/)?(\d{4}-\d{4}-\d{4}-\d{3}[0-9X]+)\z/
    "orcid"
  when /\A(http|https):\/(\/)?github\.com\/(.+)\z/
    "codemeta"
  else
    "schema_org"
  end
end
#find_from_format_by_string(string) ⇒ Object
374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 |
# File 'lib/bolognese/utils.rb', line 374

# Detects the metadata format by inspecting the content of +string+.
# Checks run in order: Crossref XML, DataCite XML, the JSON-based formats,
# RIS, then BibTeX.
#
# @param string [String] raw metadata
# @return [String, nil] the format name, or nil when nothing matches or
#   BibTeX parsing fails
def find_from_format_by_string(string)
  if Maremma.from_xml(string).to_h.dig("doi_records", "doi_record", "crossref").present?
    return "crossref"
  elsif Nokogiri::XML(string, nil, 'UTF-8', &:noblanks).collect_namespaces.find { |_k, v| v.start_with?("http://datacite.org/schema/kernel") }
    return "datacite"
  end

  # Parse once and reuse the result for every JSON-based check.
  json = Maremma.from_json(string).to_h

  if json.dig("@context").to_s.start_with?("http://schema.org", "https://schema.org")
    "schema_org"
  elsif json.dig("@context") == "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld"
    "codemeta"
  elsif json.dig("schema-version").to_s.start_with?("http://datacite.org/schema/kernel")
    "datacite_json"
  elsif json.dig("types").present?
    "crosscite"
  elsif json.dig("issued", "date-parts").present?
    "citeproc"
  # RIS records begin with the tag "TY", two spaces, a hyphen and a space.
  elsif string.start_with?("TY  - ")
    "ris"
  elsif BibTeX.parse(string).first
    "bibtex"
  end
rescue BibTeX::ParseError
  nil
end
#from_citeproc(element) ⇒ Object
668 669 670 671 672 673 674 675 676 677 678 679 680 681 |
# File 'lib/bolognese/utils.rb', line 668

# Converts citeproc-style name hashes ("given", "family", "literal") into
# hashes with "@type", "name", "givenName" and "familyName".
#
# An entry with a "literal" becomes an "Organization"; any other entry
# becomes a "Person" named "<given> <family>". The input hashes are not
# modified; new hashes are returned.
#
# @param element [Hash, Array<Hash>, nil] one or more citeproc name hashes
# @return [Object] the converted entries after Array#unwrap
def from_citeproc(element)
  Array.wrap(element).map do |a|
    identity =
      if a["literal"].present?
        { "@type" => "Organization", "name" => a["literal"] }
      else
        { "@type" => "Person", "name" => [a["given"], a["family"]].compact.join(" ") }
      end

    a.merge(identity)
     .merge("givenName" => a["given"], "familyName" => a["family"])
     .except("given", "family", "literal")
     .compact
  end.unwrap
end
#from_datacite_json(element) ⇒ Object
539 540 541 542 543 |
# File 'lib/bolognese/utils.rb', line 539

# Rebuilds each given hash with every key passed through String#underscore.
#
# @param element [Hash, Array<Hash>, nil] one or more hashes
# @return [Array<Hash>] one re-keyed hash per input element
def from_datacite_json(element)
  Array.wrap(element).map do |entry|
    entry.each_with_object({}) do |(key, value), renamed|
      renamed[key.underscore] = value
    end
  end
end
#from_schema_org(element) ⇒ Object
641 642 643 644 645 |
# File 'lib/bolognese/utils.rb', line 641

# Renames the keys "@type" and "@id" to "type" and "id" via #map_hash_keys.
#
# @param element [Hash, Array<Hash>, nil] one or more hashes
# @return [Object] the result of #map_hash_keys
def from_schema_org(element)
  map_hash_keys(element: element, mapping: { "@type" => "type", "@id" => "id" })
end
#get_contributor(contributor, contributor_type) ⇒ Object
823 824 825 |
# File 'lib/bolognese/utils.rb', line 823

# Selects the contributors whose "contributorType" equals +contributor_type+.
#
# @param contributor [Array<Hash>, nil] contributor hashes
# @param contributor_type [String] value to match against "contributorType"
# @return [Array<Hash>] matching contributors; empty when +contributor+ is nil
def get_contributor(contributor, contributor_type)
  # a nil list has no contributors, so return [] rather than raising NoMethodError
  return [] if contributor.nil?

  contributor.select { |c| c["contributorType"] == contributor_type }
end
#get_date(dates, date_type) ⇒ Object
818 819 820 821 |
# File 'lib/bolognese/utils.rb', line 818

# Returns the "date" of the first entry whose "dateType" equals +date_type+.
#
# @param dates [Array<Hash>, nil] date hashes with "dateType" and "date" keys
# @param date_type [String] value to match against "dateType"
# @return [Object, nil] the matching "date" value; nil when +dates+ is nil,
#   nothing matches, or the match has no "date" key
def get_date(dates, date_type)
  # a nil list has no dates, so return nil rather than raising NoMethodError
  return nil if dates.nil?

  dd = dates.find { |d| d["dateType"] == date_type } || {}
  dd.fetch("date", nil)
end
#get_date_from_date_parts(date_as_parts) ⇒ Object
776 777 778 779 780 |
# File 'lib/bolognese/utils.rb', line 776

# Builds a date string from a citeproc-style hash of the form
# { "date-parts" => [[year, month, day]] }, delegating the formatting to
# #get_date_from_parts.
#
# @param date_as_parts [Hash, nil] hash holding a "date-parts" array
# @return [String, nil] the formatted date; nil when +date_as_parts+ is nil
#   or has no first "date-parts" entry
def get_date_from_date_parts(date_as_parts)
  return nil if date_as_parts.nil?

  date_parts = date_as_parts.fetch("date-parts", []).first
  # a missing or empty "date-parts" array has no first entry to index into
  return nil if date_parts.nil?

  year, month, day = date_parts[0], date_parts[1], date_parts[2]
  get_date_from_parts(year, month, day)
end
#get_date_from_parts(year, month = nil, day = nil) ⇒ Object
782 783 784 |
# File 'lib/bolognese/utils.rb', line 782

# Joins year, month and day into a dash-separated string. The year is
# zero-padded to four characters, month and day to two; any part that pads
# to "00" is dropped.
#
# @param year [#to_s]
# @param month [#to_s, nil]
# @param day [#to_s, nil]
# @return [String] e.g. "2015-04-03", "2015-04" or "2015"
def get_date_from_parts(year, month = nil, day = nil)
  padded = [[year, 4], [month, 2], [day, 2]].map do |value, width|
    value.to_s.rjust(width, '0')
  end
  padded.reject { |part| part == "00" }.join("-")
end
#get_date_parts(iso8601_time) ⇒ Object
767 768 769 770 771 772 773 774 |
# File 'lib/bolognese/utils.rb', line 767

# Splits a timestamp string into a { 'date-parts' => [[year, month, day]] }
# hash. Year, month and day are read from character positions 0-3, 5-6 and
# 8-9; parts that convert to 0 are omitted.
#
# @param iso8601_time [String, nil]
# @return [Hash] { 'date-parts' => [[...]] }, with an empty inner array for nil
def get_date_parts(iso8601_time)
  parts =
    if iso8601_time.nil?
      []
    else
      [0..3, 5..6, 8..9].map { |span| iso8601_time[span].to_i }.reject(&:zero?)
    end

  { 'date-parts' => [parts] }
end
#get_date_parts_from_parts(year, month = nil, day = nil) ⇒ Object
786 787 788 |
# File 'lib/bolognese/utils.rb', line 786

# Wraps year, month and day (each converted with #to_i) in a
# { 'date-parts' => [[...]] } hash, omitting parts that convert to 0.
#
# @param year [#to_i]
# @param month [#to_i, nil]
# @param day [#to_i, nil]
# @return [Hash]
def get_date_parts_from_parts(year, month = nil, day = nil)
  numbers = [year, month, day].map(&:to_i)
  { 'date-parts' => [numbers.reject(&:zero?)] }
end
#get_datetime_from_iso8601(iso8601_time) ⇒ Object
Parsing of incomplete ISO 8601 timestamps such as 2015-04 is broken in the standard library. Returns nil if the ISO 8601 timestamp is invalid.
812 813 814 815 816 |
# File 'lib/bolognese/utils.rb', line 812

# Parses +iso8601_time+ with ISO8601::DateTime and converts it to a UTC time.
# Any StandardError raised along the way is swallowed and nil is returned.
#
# @param iso8601_time [String]
# @return [Object, nil] the UTC time, or nil when parsing fails
def get_datetime_from_iso8601(iso8601_time)
  begin
    parsed = ISO8601::DateTime.new(iso8601_time)
    parsed.to_time.utc
  rescue StandardError
    nil
  end
end
#get_year_month(iso8601_time) ⇒ Object
790 791 792 793 794 795 796 797 |
# File 'lib/bolognese/utils.rb', line 790

# Extracts [year, month] as integers from character positions 0-3 and 5-6
# of a timestamp string, omitting parts that convert to 0.
#
# @param iso8601_time [String, nil]
# @return [Array<Integer>] empty for nil input
def get_year_month(iso8601_time)
  return [] if iso8601_time.nil?

  [iso8601_time[0..3], iso8601_time[5..6]].map(&:to_i).reject(&:zero?)
end
#get_year_month_day(iso8601_time) ⇒ Object
799 800 801 802 803 804 805 806 807 |
# File 'lib/bolognese/utils.rb', line 799

# Extracts [year, month, day] as integers from character positions 0-3, 5-6
# and 8-9 of a timestamp string, omitting parts that convert to 0.
#
# @param iso8601_time [String, nil]
# @return [Array<Integer>] empty for nil input
def get_year_month_day(iso8601_time)
  return [] if iso8601_time.nil?

  [0..3, 5..6, 8..9].map { |span| iso8601_time[span].to_i }.reject(&:zero?)
end
#github_as_codemeta_url(url) ⇒ Object
757 758 759 760 761 762 763 764 765 |
# File 'lib/bolognese/utils.rb', line 757

# Builds the raw.githubusercontent.com URL of a repository's codemeta.json.
#
# If the GitHub URL already points at a path ending in "codemeta.json", that
# release and path are used. Otherwise, when an owner is present, the file is
# assumed to be at the root of the master branch.
#
# @param url [String] GitHub URL, parsed by #github_from_url
# @return [String, nil] raw content URL, or nil when no owner can be extracted
def github_as_codemeta_url(url)
  github_hash = github_from_url(url)

  if github_hash[:path].to_s.end_with?("codemeta.json")
    "https://raw.githubusercontent.com/#{github_hash[:owner]}/#{github_hash[:repo]}/#{github_hash[:release]}/#{github_hash[:path]}"
  elsif github_hash[:owner].present?
    "https://raw.githubusercontent.com/#{github_hash[:owner]}/#{github_hash[:repo]}/master/codemeta.json"
  end
end
#github_as_owner_url(url) ⇒ Object
742 743 744 745 |
# File 'lib/bolognese/utils.rb', line 742

# Builds the https://github.com/<owner> URL for a GitHub URL.
#
# @param url [String] GitHub URL, parsed by #github_from_url
# @return [String, nil] nil when no owner is present
def github_as_owner_url(url)
  owner = github_from_url(url)[:owner]
  return nil unless owner.present?

  "https://github.com/#{owner}"
end
#github_as_release_url(url) ⇒ Object
752 753 754 755 |
# File 'lib/bolognese/utils.rb', line 752

# Builds the https://github.com/<owner>/<repo>/tree/<release> URL for a
# GitHub URL.
#
# @param url [String] GitHub URL, parsed by #github_from_url
# @return [String, nil] nil when no release is present
def github_as_release_url(url)
  parts = github_from_url(url)
  return nil unless parts[:release].present?

  "https://github.com/#{parts[:owner]}/#{parts[:repo]}/tree/#{parts[:release]}"
end
#github_as_repo_url(url) ⇒ Object
747 748 749 750 |
# File 'lib/bolognese/utils.rb', line 747

# Builds the https://github.com/<owner>/<repo> URL for a GitHub URL.
#
# @param url [String] GitHub URL, parsed by #github_from_url
# @return [String, nil] nil when no repo is present
def github_as_repo_url(url)
  parts = github_from_url(url)
  return nil unless parts[:repo].present?

  "https://github.com/#{parts[:owner]}/#{parts[:repo]}"
end
#github_from_url(url) ⇒ Object
719 720 721 722 723 724 725 726 727 728 |
# File 'lib/bolognese/utils.rb', line 719

# Splits an https://github.com/ URL into its path components.
#
# Path segments are read positionally: [0] owner, [1] repo, [3] release
# (segment [2] is skipped; e.g. "tree" or "blob"), and everything from [4]
# onwards is joined into :path. Absent components are omitted.
#
# @param url [String, nil]
# @return [Hash] subset of { owner:, repo:, release:, path: }; {} when +url+
#   is not an https://github.com/ URL
def github_from_url(url)
  return {} unless /\Ahttps:\/\/github\.com\/(.+)(?:\/)?(.+)?(?:\/tree\/)?(.*)\z/.match(url)

  words = URI.parse(url).path[1..-1].split('/')
  # a path needs a fifth segment; with exactly four (…/tree/<release>) the
  # slice would be empty and produce a spurious "" path
  path = words.length > 4 ? words[4..-1].join("/") : nil

  { owner: words[0],
    repo: words[1],
    release: words[3],
    path: path }.compact
end
#github_owner_from_url(url) ⇒ Object
738 739 740 |
# File 'lib/bolognese/utils.rb', line 738

# Returns the :owner component of a GitHub URL.
#
# @param url [String] GitHub URL, parsed by #github_from_url
# @return [String, nil] nil when the URL yields no owner
def github_owner_from_url(url)
  parts = github_from_url(url)
  parts[:owner]
end
#github_release_from_url(url) ⇒ Object
734 735 736 |
# File 'lib/bolognese/utils.rb', line 734

# Returns the :release component of a GitHub URL.
#
# @param url [String] GitHub URL, parsed by #github_from_url
# @return [String, nil] nil when the URL yields no release
def github_release_from_url(url)
  parts = github_from_url(url)
  parts[:release]
end
#github_repo_from_url(url) ⇒ Object
730 731 732 |
# File 'lib/bolognese/utils.rb', line 730

# Returns the :repo component of a GitHub URL.
#
# @param url [String] GitHub URL, parsed by #github_from_url
# @return [String, nil] nil when the URL yields no repo
def github_repo_from_url(url)
  parts = github_from_url(url)
  parts[:repo]
end
#jsonlint(json) ⇒ Object
827 828 829 830 831 832 833 834 |
# File 'lib/bolognese/utils.rb', line 827

# Runs JsonLint's check_data over +json+ and collects what it reports.
#
# @param json [String, nil] JSON document to check
# @return [Array] ["No JSON provided"] for blank input, otherwise the array
#   that the linter filled in
def jsonlint(json)
  return ["No JSON provided"] unless json.present?

  [].tap do |errors|
    # check_data is invoked through #send, as in the original call
    JsonLint::Linter.new.send(:check_data, json, errors)
  end
end
#map_hash_keys(element: nil, mapping: nil) ⇒ Object
647 648 649 650 651 652 653 654 655 656 657 658 659 |
# File 'lib/bolognese/utils.rb', line 647

# Renames the keys of each given hash according to +mapping+; keys missing
# from the mapping are kept as they are. Values that are hashes are converted
# with #to_schema_org.
#
# NOTE(review): nested hashes always go through #to_schema_org, regardless of
# the +mapping+ passed in — confirm this is intended when the caller is
# #from_schema_org.
#
# @param element [Hash, Array<Hash>, nil] one or more hashes
# @param mapping [Hash] old key => new key
# @return [Object] the re-keyed hashes after Array#unwrap
def map_hash_keys(element: nil, mapping: nil)
  renamed = Array.wrap(element).map do |item|
    item.each_with_object({}) do |(key, value), result|
      result[mapping.fetch(key, key)] = value.is_a?(Hash) ? to_schema_org(value) : value
    end
  end

  renamed.unwrap
end
#normalize_id(id, options = {}) ⇒ Object
440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 |
# File 'lib/bolognese/utils.rb', line 440

# Normalizes an identifier that is either a DOI or an http(s) URL.
#
# @param id [String, nil] identifier to normalize
# @param options [Hash] forwarded unchanged to #normalize_doi
# @return [String, nil] the normalized DOI, or the cleaned URL; nil when +id+
#   is blank, is not an http(s) URI with a host, or cannot be parsed
def normalize_id(id, options = {})
  return nil unless id.present?

  # check for valid DOI
  doi = normalize_doi(id, options)
  return doi if doi.present?

  # check for valid HTTP uri
  uri = Addressable::URI.parse(id)
  return nil unless uri && uri.host && %w(http https).include?(uri.scheme)

  # clean up URL
  PostRank::URI.clean(id)
rescue Addressable::URI::InvalidURIError
  nil
end
#normalize_ids(ids: nil, relation_type: nil) ⇒ Object
482 483 484 485 486 487 488 489 490 491 492 493 |
# File 'lib/bolognese/utils.rb', line 482

# Converts hashes carrying an "@id" (and optionally "@type") into
# related-identifier hashes. Entries with a blank "@id" are skipped.
#
# @param ids [Hash, Array<Hash>, nil] hashes with "@id" / "@type" keys
# @param relation_type [String, nil] stored as "relationType"
# @return [Object] hashes with "relatedIdentifier", "relationType",
#   "relatedIdentifierType" ("DOI" or "URL") and "resourceTypeGeneral"
#   (nil values removed), after Array#unwrap
def normalize_ids(ids: nil, relation_type: nil)
  Array.wrap(ids).select { |idx| idx["@id"].present? }.map do |idx|
    id = normalize_id(idx["@id"])
    # look the DOI up once; it decides both the type and the stored value
    doi = doi_from_url(id)
    related_identifier_type = doi.present? ? "DOI" : "URL"

    { "relatedIdentifier" => doi || id,
      "relationType" => relation_type,
      "relatedIdentifierType" => related_identifier_type,
      "resourceTypeGeneral" => Metadata::SO_TO_DC_TRANSLATIONS[idx["@type"]] }.compact
  end.unwrap
end
#normalize_licenses(licenses) ⇒ Object
Finds a Creative Commons or OSI license in the licenses array and normalizes its URL and name.
496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 |
# File 'lib/bolognese/utils.rb', line 496

# Finds a Creative Commons or OSI license among +licenses+ and returns its
# normalized URL.
#
# Normalization forces the https scheme, strips subdomains from the host,
# removes a trailing "legalcode" segment and ensures a trailing slash for
# creativecommons.org, and tidies the path of opensource.org URLs.
#
# NOTE(review): the previous body used a local +uri+ that was never
# assigned, so it raised NameError whenever a standard license was found.
# The first standard license is used here — confirm that is the intended
# choice and return shape.
#
# @param licenses [Hash, Array<Hash>, nil] license hashes with a "url" key
# @return [Object, String, nil] +licenses+ unchanged when no standard license
#   is found; otherwise the normalized URL; nil for an invalid URL
def normalize_licenses(licenses)
  standard_licenses = Array.wrap(licenses).map { |l| URI.parse(l["url"]) }.select { |li| li.host && li.host[/(creativecommons.org|opensource.org)$/] }
  return licenses unless standard_licenses.present?

  uri = standard_licenses.first

  # use HTTPS
  uri.scheme = "https"

  # use host name without subdomain
  uri.host = Array(/(creativecommons.org|opensource.org)/.match uri.host).last

  # normalize URLs
  if uri.host == "creativecommons.org"
    uri.path = uri.path.split('/')[0..-2].join("/") if uri.path.split('/').last == "legalcode"
    uri.path << '/' unless uri.path.end_with?('/')
  else
    uri.path = uri.path.gsub(/(-license|\.php|\.html)/, '')
    uri.path = uri.path.sub(/(mit|afl|apl|osl|gpl|ecl)/) { |match| match.upcase }
    uri.path = uri.path.sub(/(artistic|apache)/) { |match| match.titleize }
    uri.path = uri.path.sub(/([^0-9\-]+)(-)?([1-9])?(\.)?([0-9])?$/) do
      m = Regexp.last_match
      text = m[1]

      if m[3].present?
        # a missing minor version defaults to "0", e.g. "2" becomes "2.0"
        version = [m[3], m[5].presence || "0"].join(".")
        [text, version].join("-")
      else
        text
      end
    end
  end

  uri.to_s
rescue URI::InvalidURIError
  nil
end
#normalize_orcid(orcid) ⇒ Object
474 475 476 477 478 479 480 |
# File 'lib/bolognese/utils.rb', line 474

# Validates an ORCID iD with #validate_orcid and turns it into an
# http://orcid.org/ URL.
#
# @param orcid [String, nil] bare ORCID iD or ORCID URL
# @return [String, nil] nil when validation yields nothing
def normalize_orcid(orcid)
  validated = validate_orcid(orcid)

  # turn ORCID ID into URL
  validated.present? ? "http://orcid.org/" + Addressable::URI.encode(validated) : nil
end
#normalize_url(id) ⇒ Object
457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 |
# File 'lib/bolognese/utils.rb', line 457

# Normalizes an http, https or ftp URL. Values whose string form starts with
# "info" are returned unchanged.
#
# @param id [String, nil]
# @return [Object, nil] +id+ itself for "info…" values, the cleaned URL for
#   valid URLs, nil otherwise (blank, no host, other scheme, unparsable)
def normalize_url(id)
  return nil unless id.present?

  # handle info URIs
  return id if id.to_s.start_with?("info")

  # check for valid HTTP uri
  uri = Addressable::URI.parse(id)
  acceptable = uri && uri.host && %w(http https ftp).include?(uri.scheme)

  # clean up URL
  acceptable ? PostRank::URI.clean(id) : nil
rescue Addressable::URI::InvalidURIError
  nil
end
#orcid_as_url(orcid) ⇒ Object
402 403 404 |
# File 'lib/bolognese/utils.rb', line 402

# Prefixes an ORCID iD with https://orcid.org/.
#
# @param orcid [String, nil]
# @return [String, nil] nil for blank input
def orcid_as_url(orcid)
  orcid.present? ? "https://orcid.org/#{orcid}" : nil
end
#orcid_from_url(url) ⇒ Object
398 399 400 |
# File 'lib/bolognese/utils.rb', line 398

# Extracts everything after http(s)://orcid.org/ from an ORCID URL.
#
# The previous pattern began with "\A:", which required a literal colon at
# the start of the string, so no ORCID URL could ever match.
#
# @param url [String, nil]
# @return [String, nil] the part after the host; nil when +url+ is not an
#   http(s)://orcid.org/ URL
def orcid_from_url(url)
  Array(/\A(?:http|https):\/\/orcid\.org\/(.+)/.match(url)).last
end
#parse_attributes(element, options = {}) ⇒ Object
425 426 427 428 429 430 431 432 433 434 435 436 437 438 |
# File 'lib/bolognese/utils.rb', line 425

# Extracts the text content from a parsed element.
#
# - String: returned unchanged
# - Hash: the value stored under the content key
# - Array: each Hash member replaced by its content value, duplicates removed;
#   then the first entry when options[:first] is set, otherwise Array#unwrap
# - anything else: nil
#
# @param element [String, Hash, Array, Object]
# @param options [Hash] :content overrides the content key (default
#   "__content__"); :first selects the first array entry
# @return [Object, nil]
def parse_attributes(element, options = {})
  content = options[:content] || "__content__"

  case element
  when String
    element
  when Hash
    element.fetch(content, nil)
  when Array
    values = element.map { |e| e.is_a?(Hash) ? e.fetch(content, nil) : e }.uniq
    options[:first] ? values.first : values.unwrap
  end
end
#sanitize(text, options = {}) ⇒ Object
702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 |
# File 'lib/bolognese/utils.rb', line 702

# Scrubs markup from text with a Bolognese::WhitelistScrubber built from
# +options+, replaces non-breaking spaces with plain spaces and strips the
# result.
#
# - String: scrubbed directly
# - Hash: the value under the content key is sanitized
# - Array: every member is sanitized (Hash members via their content value),
#   duplicates removed; then the first entry when options[:first] is set,
#   otherwise Array#unwrap
# - anything else: nil
#
# NOTE(review): the recursive calls do not pass +options+ on, so nested
# values are scrubbed with the defaults — confirm this is intended.
#
# @param text [String, Hash, Array, Object]
# @param options [Hash] :tags (defaults to a fixed Set of inline tags and is
#   written back into the hash), :content (default "__content__"), :first
# @return [Object, nil]
def sanitize(text, options = {})
  options[:tags] ||= Set.new(%w(strong em b i code pre sub sup br))
  content = options[:content] || "__content__"
  custom_scrubber = Bolognese::WhitelistScrubber.new(options)

  if text.is_a?(String)
    Loofah.scrub_fragment(text, custom_scrubber).to_s.gsub(/\u00a0/, ' ').strip
  elsif text.is_a?(Hash)
    sanitize(text.fetch(content, nil))
  elsif text.is_a?(Array)
    a = text.map { |e| e.is_a?(Hash) ? sanitize(e.fetch(content, nil)) : sanitize(e) }.uniq
    options[:first] ? a.first : a.unwrap
  end
end
#to_citeproc(element) ⇒ Object
683 684 685 686 687 688 689 690 |
# File 'lib/bolognese/utils.rb', line 683

# Converts name hashes ("familyName", "givenName", "name") into citeproc
# form ("family", "given", "literal").
#
# "literal" is set from "name" only when "familyName" is blank. The keys
# "type", "@type", "id", "@id", "name", "familyName" and "givenName" are
# removed, as are nil values. The input hashes are not modified; new hashes
# are returned.
#
# @param element [Hash, Array<Hash>, nil] one or more name hashes
# @return [Array<Hash>, nil] converted entries; nil when there are none
def to_citeproc(element)
  Array.wrap(element).map do |a|
    names = { "family" => a["familyName"], "given" => a["givenName"] }
    names["literal"] = a["name"] unless a["familyName"].present?

    a.merge(names)
     .except("type", "@type", "id", "@id", "name", "familyName", "givenName")
     .compact
  end.presence
end
#to_datacite_json(element, options = {}) ⇒ Object
532 533 534 535 536 537 |
# File 'lib/bolognese/utils.rb', line 532

# Rebuilds each given hash with every key passed through String#dasherize.
#
# @param element [Hash, Array<Hash>, nil] one or more hashes
# @param options [Hash] when :first is set the result goes through
#   Array#unwrap; otherwise a blank result becomes nil
# @return [Object, nil]
def to_datacite_json(element, options = {})
  a = Array.wrap(element).map do |e|
    e.inject({}) { |h, (k, v)| h[k.dasherize] = v; h }
  end

  options[:first] ? a.unwrap : a.presence
end
#to_identifier(identifier) ⇒ Object
661 662 663 664 665 666 |
# File 'lib/bolognese/utils.rb', line 661

# Wraps a related identifier in a "PropertyValue" hash.
#
# @param identifier [Hash] read for "relatedIdentifierType" and
#   "relatedIdentifier"
# @return [Hash] with keys "@type", "propertyID" and "value"
def to_identifier(identifier)
  property_id = identifier["relatedIdentifierType"]
  value = identifier["relatedIdentifier"]

  { "@type" => "PropertyValue", "propertyID" => property_id, "value" => value }
end
#to_ris(element) ⇒ Object
692 693 694 695 696 697 698 699 700 |
# File 'lib/bolognese/utils.rb', line 692

# Formats names for RIS output: "familyName, givenName" when a family name
# is present, otherwise the plain "name".
#
# @param element [Hash, Array<Hash>, nil] one or more name hashes
# @return [Object] the formatted names after Array#unwrap
def to_ris(element)
  names = Array.wrap(element).map do |person|
    next person["name"] unless person["familyName"].present?

    [person["familyName"], person["givenName"]].join(", ")
  end

  names.unwrap
end
#to_schema_org(element) ⇒ Object
545 546 547 548 549 |
# File 'lib/bolognese/utils.rb', line 545

# Renames the keys "type", "id" and "title" to "@type", "@id" and "name"
# via #map_hash_keys.
#
# @param element [Hash, Array<Hash>, nil] one or more hashes
# @return [Object] the result of #map_hash_keys
def to_schema_org(element)
  key_names = { "type" => "@type", "id" => "@id", "title" => "name" }
  map_hash_keys(element: element, mapping: key_names)
end
#to_schema_org_container(element, options = {}) ⇒ Object
551 552 553 554 555 556 557 558 |
# File 'lib/bolognese/utils.rb', line 551

# Builds a container hash ("@id", "@type", "name") for a work.
#
# A container is built when +element+ is a Hash, or when +element+ is nil
# and options[:container_title] is present. In the nil case the element is
# treated as empty, so "@id" is nil and "name" comes from the option.
#
# @param element [Hash, nil] read for "relatedIdentifier" and "title"
# @param options [Hash] :type ("Dataset" selects "DataCatalog", anything
#   else "Periodical") and :container_title (fallback for "name")
# @return [Hash, nil] nil when no container can be built
def to_schema_org_container(element, options = {})
  return nil unless element.is_a?(Hash) || (element.nil? && options[:container_title].present?)

  # the guard lets nil through when a container title is supplied
  element ||= {}

  { "@id" => element["relatedIdentifier"],
    "@type" => options[:type] == "Dataset" ? "DataCatalog" : "Periodical",
    "name" => element["title"] || options[:container_title] }
end
#to_schema_org_funder(funding_references) ⇒ Object
600 601 602 603 604 605 606 607 608 609 |
# File 'lib/bolognese/utils.rb', line 600

# Converts funding references into "Organization" hashes.
#
# @param funding_references [Hash, Array<Hash>, nil] read for
#   "funderIdentifier" and "funderName"
# @return [Object, nil] hashes with "@id", "@type" and "name" (nil values
#   removed) after Array#unwrap; nil for blank input
def to_schema_org_funder(funding_references)
  return nil unless funding_references.present?

  funders = Array.wrap(funding_references).map do |reference|
    { "@id" => reference["funderIdentifier"],
      "@type" => "Organization",
      "name" => reference["funderName"] }.compact
  end

  funders.unwrap
end
#to_schema_org_identifier(element, options = {}) ⇒ Object
560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 |
# File 'lib/bolognese/utils.rb', line 560

# Builds the identifier representation of a work.
#
# The primary identifier becomes a "PropertyValue" whose "propertyID" is
# "doi" when #normalize_doi accepts it and "url" otherwise. When
# options[:alternate_identifiers] is present, an array is returned instead:
# the primary identifier followed by one entry per alternate identifier
# (a bare string for type "url", compared case-insensitively; a
# "PropertyValue" hash for every other type).
#
# @param element [String] the primary identifier
# @param options [Hash] :alternate_identifiers, hashes with
#   "alternateIdentifierType" and "alternateIdentifier"
# @return [Hash, Array]
def to_schema_org_identifier(element, options = {})
  ident = {
    "@type" => "PropertyValue",
    "propertyID" => normalize_doi(element) ? "doi" : "url",
    "value" => element }

  if options[:alternate_identifiers].present?
    [ident] + Array.wrap(options[:alternate_identifiers]).map do |ai|
      if ai["alternateIdentifierType"].to_s.downcase == "url"
        ai["alternateIdentifier"]
      else
        {
          "@type" => "PropertyValue",
          "propertyID" => ai["alternateIdentifierType"],
          "value" => ai["alternateIdentifier"] }
      end
    end
  else
    ident
  end
end
#to_schema_org_relation(related_identifiers: nil, relation_type: nil) ⇒ Object
582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 |
# File 'lib/bolognese/utils.rb', line 582

# Converts the related identifiers of one relation type into hashes with
# "@id"/"@type", or "@type"/"issn" for periodicals.
#
# The relation type "References" also selects "Cites" and "Documents". An
# ISSN with relation "IsPartOf" becomes a "Periodical"; every other entry
# gets a normalized "@id" and a type looked up in DC_TO_SO_TRANSLATIONS,
# defaulting to "CreativeWork".
#
# @param related_identifiers [Hash, Array<Hash>, nil]
# @param relation_type [String, nil]
# @return [Object, nil] matching entries after Array#unwrap; nil when either
#   argument is blank
def to_schema_org_relation(related_identifiers: nil, relation_type: nil)
  return nil unless related_identifiers.present? && relation_type.present?

  relation_type = relation_type == "References" ? ["References", "Cites", "Documents"] : [relation_type]

  Array.wrap(related_identifiers).select { |ri| relation_type.include?(ri["relationType"]) }.map do |r|
    if r["relatedIdentifierType"] == "ISSN" && r["relationType"] == "IsPartOf"
      {
        "@type" => "Periodical",
        "issn" => r["relatedIdentifier"] }.compact
    else
      {
        "@id" => normalize_id(r["relatedIdentifier"]),
        "@type" => DC_TO_SO_TRANSLATIONS[r["resourceTypeGeneral"]] || "CreativeWork" }.compact
    end
  end.unwrap
end
#to_schema_org_spatial_coverage(geo_location) ⇒ Object
611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 |
# File 'lib/bolognese/utils.rb', line 611

# Converts geo locations into "Place" hashes.
#
# A location with a "geoLocationPoint" yields a "GeoCoordinates" geo entry;
# otherwise one with a "geoLocationBox" yields a "GeoShape" whose "box" is
# the south, west, north and east bounds joined by spaces. Locations with
# neither are dropped. nil values are removed from the inner "geo" hash.
#
# @param geo_location [Hash, Array<Hash>, nil]
# @return [Object, nil] the places after Array#unwrap; nil for blank input
def to_schema_org_spatial_coverage(geo_location)
  return nil unless geo_location.present?

  places = Array.wrap(geo_location).map do |gl|
    geo =
      if gl.fetch("geoLocationPoint", nil)
        { "@type" => "GeoCoordinates",
          "address" => gl["geoLocationPlace"],
          "latitude" => gl.dig("geoLocationPoint", "pointLatitude"),
          "longitude" => gl.dig("geoLocationPoint", "pointLongitude") }
      elsif gl.fetch("geoLocationBox", nil)
        bounds = %w(southBoundLatitude westBoundLongitude northBoundLatitude eastBoundLongitude)
        { "@type" => "GeoShape",
          "address" => gl["geoLocationPlace"],
          "box" => bounds.map { |bound| gl.dig("geoLocationBox", bound) }.join(" ") }
      end

    # locations with neither a point nor a box produce nil, dropped below
    next nil if geo.nil?

    { "@type" => "Place", "geo" => geo.compact }
  end

  places.compact.unwrap
end
#validate_orcid(orcid) ⇒ Object
406 407 408 409 |
# File 'lib/bolognese/utils.rb', line 406

# Extracts an ORCID iD from a bare iD or an http(s)://[www.]orcid.org/ URL
# and replaces whitespace separators with hyphens.
#
# @param orcid [String, nil]
# @return [String, nil] nil when +orcid+ does not match
def validate_orcid(orcid)
  matched = /\A(?:(http|https):\/\/(www\.)?orcid\.org\/)?(\d{4}[[:space:]-]\d{4}[[:space:]-]\d{4}[[:space:]-]\d{3}[0-9X]+)\z/.match(orcid)
  bare = Array(matched).last

  bare.present? ? bare.gsub(/[[:space:]]/, "-") : nil
end
#validate_orcid_scheme(orcid_scheme) ⇒ Object
411 412 413 |
# File 'lib/bolognese/utils.rb', line 411

# Checks that a scheme URI starts with http(s)://[www.]orcid.org.
#
# @param orcid_scheme [String, nil]
# @return [String, nil] "orcid.org" on a match, nil otherwise
def validate_orcid_scheme(orcid_scheme)
  matched = /\A(http|https):\/\/(www\.)?(orcid\.org)/.match(orcid_scheme)

  # group 3 is the last capture group, i.e. the host name
  matched ? matched[3] : nil
end
#validate_url(str) ⇒ Object
415 416 417 418 419 420 421 422 423 |
# File 'lib/bolognese/utils.rb', line 415

# Classifies a string as a DOI, a URL or an ISSN. Patterns are tried in that
# order, so a doi.org URL is reported as "DOI".
#
# @param str [String, nil]
# @return [String, nil] "DOI", "URL", "ISSN", or nil when nothing matches
def validate_url(str)
  patterns = {
    "DOI" => /\A(?:(http|https):\/\/(dx\.)?doi.org\/)?(doi:)?(10\.\d{4,5}\/.+)\z/,
    "URL" => /\A(http|https):\/\//,
    "ISSN" => /\A(ISSN|eISSN) (\d{4}-\d{3}[0-9X]+)\z/
  }

  label, _pattern = patterns.find { |_name, pattern| pattern.match(str) }
  label
end