Module: Bolognese::Utils
- Included in:
- CLI, MetadataUtils
- Defined in:
- lib/bolognese/utils.rb
Constant Summary collapse
- LICENSE_NAMES =
{ "http://creativecommons.org/publicdomain/zero/1.0/" => "Public Domain (CC0 1.0)", "http://creativecommons.org/licenses/by/3.0/" => "Creative Commons Attribution 3.0 (CC-BY 3.0)", "http://creativecommons.org/licenses/by/4.0/" => "Creative Commons Attribution 4.0 (CC-BY 4.0)", "http://creativecommons.org/licenses/by-nc/4.0/" => "Creative Commons Attribution Noncommercial 4.0 (CC-BY-NC 4.0)", "http://creativecommons.org/licenses/by-sa/4.0/" => "Creative Commons Attribution Share Alike 4.0 (CC-BY-SA 4.0)", "http://creativecommons.org/licenses/by-nc-nd/4.0/" => "Creative Commons Attribution Noncommercial No Derivatives 4.0 (CC-BY-NC-ND 4.0)" }
- DC_TO_SO_TRANSLATIONS =
{ "Audiovisual" => "MediaObject", "Collection" => "Collection", "Dataset" => "Dataset", "Event" => "Event", "Image" => "ImageObject", "InteractiveResource" => nil, "Model" => nil, "PhysicalObject" => nil, "Service" => "Service", "Software" => "SoftwareSourceCode", "Sound" => "AudioObject", "Text" => "ScholarlyArticle", "Workflow" => nil, "Other" => "CreativeWork" }
- DC_TO_CP_TRANSLATIONS =
{ "Audiovisual" => "motion_picture", "Collection" => nil, "Dataset" => "dataset", "Event" => nil, "Image" => "graphic", "InteractiveResource" => nil, "Model" => nil, "PhysicalObject" => nil, "Service" => nil, "Sound" => "song", "Text" => "report", "Workflow" => nil, "Other" => nil }
- CR_TO_CP_TRANSLATIONS =
{ "Proceedings" => nil, "ReferenceBook" => nil, "JournalIssue" => nil, "ProceedingsArticle" => "paper-conference", "Other" => nil, "Dissertation" => "thesis", "Dataset" => "dataset", "EditedBook" => "book", "JournalArticle" => "article-journal", "Journal" => nil, "Report" => "report", "BookSeries" => nil, "ReportSeries" => nil, "BookTrack" => nil, "Standard" => nil, "BookSection" => "chapter", "BookPart" => nil, "Book" => "book", "BookChapter" => "chapter", "StandardSeries" => nil, "Monograph" => "book", "Component" => nil, "ReferenceEntry" => "entry-dictionary", "JournalVolume" => nil, "BookSet" => nil }
- CR_TO_SO_TRANSLATIONS =
{ "Proceedings" => nil, "ReferenceBook" => "Book", "JournalIssue" => "PublicationIssue", "ProceedingsArticle" => nil, "Other" => "CreativeWork", "Dissertation" => "Thesis", "Dataset" => "Dataset", "EditedBook" => "Book", "JournalArticle" => "ScholarlyArticle", "Journal" => nil, "Report" => nil, "BookSeries" => nil, "ReportSeries" => nil, "BookTrack" => nil, "Standard" => nil, "BookSection" => nil, "BookPart" => nil, "Book" => "Book", "BookChapter" => "Chapter", "StandardSeries" => nil, "Monograph" => "Book", "Component" => "CreativeWork", "ReferenceEntry" => nil, "JournalVolume" => "PublicationVolume", "BookSet" => nil, "PostedContent" => "ScholarlyArticle" }
- CR_TO_BIB_TRANSLATIONS =
{ "Proceedings" => "proceedings", "ReferenceBook" => "book", "JournalIssue" => nil, "ProceedingsArticle" => nil, "Other" => nil, "Dissertation" => "phdthesis", "Dataset" => nil, "EditedBook" => "book", "JournalArticle" => "article", "Journal" => nil, "Report" => nil, "BookSeries" => nil, "ReportSeries" => nil, "BookTrack" => nil, "Standard" => nil, "BookSection" => "inbook", "BookPart" => nil, "Book" => "book", "BookChapter" => "inbook", "StandardSeries" => nil, "Monograph" => "book", "Component" => nil, "ReferenceEntry" => nil, "JournalVolume" => nil, "BookSet" => nil, "PostedContent" => "article" }
- BIB_TO_CR_TRANSLATIONS =
{ "proceedings" => "Proceedings", "phdthesis" => "Dissertation", "article" => "JournalArticle", "book" => "Book", "inbook" => "BookChapter" }
- CR_TO_JATS_TRANSLATIONS =
{ "Proceedings" => "working-paper", "ReferenceBook" => "book", "JournalIssue" => "journal", "ProceedingsArticle" => "working-paper", "Other" => nil, "Dissertation" => nil, "Dataset" => "data", "EditedBook" => "book", "JournalArticle" => "journal", "Journal" => "journal", "Report" => "report", "BookSeries" => "book", "ReportSeries" => "report", "BookTrack" => "book", "Standard" => "standard", "BookSection" => "chapter", "BookPart" => "chapter", "Book" => "book", "BookChapter" => "chapter", "StandardSeries" => "standard", "Monograph" => "book", "Component" => nil, "ReferenceEntry" => nil, "JournalVolume" => "journal", "BookSet" => "book" }
- SO_TO_DC_TRANSLATIONS =
{ "Article" => "Text", "AudioObject" => "Sound", "Blog" => "Text", "BlogPosting" => "Text", "Chapter" => "Text", "Collection" => "Collection", "CreativeWork" => "Other", "DataCatalog" => "Dataset", "Dataset" => "Dataset", "Event" => "Event", "ImageObject" => "Image", "Movie" => "Audiovisual", "PublicationIssue" => "Text", "ScholarlyArticle" => "Text", "Thesis" => "Text", "Service" => "Service", "SoftwareSourceCode" => "Software", "VideoObject" => "Audiovisual", "WebPage" => "Text", "WebSite" => "Text" }
- SO_TO_JATS_TRANSLATIONS =
{ "Article" => "journal", "AudioObject" => nil, "Blog" => nil, "BlogPosting" => nil, "Book" => "book", "Collection" => nil, "CreativeWork" => nil, "DataCatalog" => "data", "Dataset" => "data", "Event" => nil, "ImageObject" => nil, "Movie" => nil, "PublicationIssue" => "journal", "ScholarlyArticle" => "journal", "Service" => nil, "SoftwareSourceCode" => "software", "VideoObject" => nil, "WebPage" => nil, "WebSite" => "website" }
# Maps schema.org types to citeproc (CSL) item types. A nil value marks a type
# without a mapping in this table. "Article" previously mapped to an empty
# string, which is truthy in Ruby and therefore defeated `|| default` fallbacks.
SO_TO_CP_TRANSLATIONS = {
  "Article" => "article-newspaper",
  "AudioObject" => "song",
  "Blog" => "report",
  "BlogPosting" => "post-weblog",
  "Collection" => nil,
  "CreativeWork" => nil,
  "DataCatalog" => "dataset",
  "Dataset" => "dataset",
  "Event" => nil,
  "ImageObject" => "graphic",
  "Movie" => "motion_picture",
  "PublicationIssue" => nil,
  "ScholarlyArticle" => "article-journal",
  "Service" => nil,
  "Thesis" => "thesis",
  "VideoObject" => "broadcast",
  "WebPage" => "webpage",
  "WebSite" => "webpage"
}
- SO_TO_RIS_TRANSLATIONS =
{ "Article" => nil, "AudioObject" => nil, "Blog" => nil, "BlogPosting" => "BLOG", "Collection" => nil, "CreativeWork" => "GEN", "DataCatalog" => "CTLG", "Dataset" => "DATA", "Event" => nil, "ImageObject" => "FIGURE", "Movie" => "MPCT", "PublicationIssue" => nil, "ScholarlyArticle" => "JOUR", "Service" => nil, "SoftwareSourceCode" => "COMP", "VideoObject" => "VIDEO", "WebPage" => "ELEC", "WebSite" => nil }
- CR_TO_RIS_TRANSLATIONS =
{ "Proceedings" => "CONF", "ReferenceBook" => "BOOK", "JournalIssue" => nil, "ProceedingsArticle" => "CPAPER", "Other" => "GEN", "Dissertation" => "THES", "Dataset" => "DATA", "EditedBook" => "BOOK", "JournalArticle" => "JOUR", "Journal" => nil, "Report" => nil, "BookSeries" => nil, "ReportSeries" => nil, "BookTrack" => nil, "Standard" => nil, "BookSection" => "CHAP", "BookPart" => "CHAP", "Book" => "BOOK", "BookChapter" => "CHAP", "StandardSeries" => nil, "Monograph" => "BOOK", "Component" => nil, "ReferenceEntry" => "DICT", "JournalVolume" => nil, "BookSet" => nil }
- DC_TO_RIS_TRANSLATIONS =
{ "Audiovisual" => "MPCT", "Collection" => nil, "Dataset" => "DATA", "Event" => nil, "Image" => "FIGURE", "InteractiveResource" => nil, "Model" => nil, "PhysicalObject" => nil, "Service" => nil, "Software" => "COMP", "Sound" => "SOUND", "Text" => "RPRT", "Workflow" => nil, "Other" => nil }
- SO_TO_BIB_TRANSLATIONS =
{ "Article" => "article", "AudioObject" => "misc", "Thesis" => "phdthesis", "Blog" => "misc", "BlogPosting" => "article", "Collection" => "misc", "CreativeWork" => "misc", "DataCatalog" => "misc", "Dataset" => "misc", "Event" => "misc", "ImageObject" => "misc", "Movie" => "misc", "PublicationIssue" => "misc", "ScholarlyArticle" => "article", "Service" => "misc", "SoftwareSourceCode" => "misc", "VideoObject" => "misc", "WebPage" => "misc", "WebSite" => "misc" }
Instance Method Summary collapse
- #find_from_format(id: nil, string: nil, ext: nil) ⇒ Object
- #find_from_format_by_ext(string, options = {}) ⇒ Object
- #find_from_format_by_id(id) ⇒ Object
- #find_from_format_by_string(string) ⇒ Object
- #from_citeproc(element) ⇒ Object
- #from_schema_org(element) ⇒ Object
- #get_date_from_date_parts(date_as_parts) ⇒ Object
- #get_date_from_parts(year, month = nil, day = nil) ⇒ Object
- #get_date_parts(iso8601_time) ⇒ Object
- #get_date_parts_from_parts(year, month = nil, day = nil) ⇒ Object
-
#get_datetime_from_iso8601(iso8601_time) ⇒ Object
Parsing of incomplete ISO 8601 timestamps such as 2015-04 is broken in the standard library, so ISO8601::DateTime is used instead. Returns nil if the timestamp is not valid ISO 8601.
- #get_year_month(iso8601_time) ⇒ Object
- #get_year_month_day(iso8601_time) ⇒ Object
- #github_as_codemeta_url(url) ⇒ Object
- #github_as_owner_url(url) ⇒ Object
- #github_as_release_url(url) ⇒ Object
- #github_as_repo_url(url) ⇒ Object
- #github_from_url(url) ⇒ Object
- #github_owner_from_url(url) ⇒ Object
- #github_release_from_url(url) ⇒ Object
- #github_repo_from_url(url) ⇒ Object
- #jsonlint(json) ⇒ Object
- #map_hash_keys(element: nil, mapping: nil) ⇒ Object
- #normalize_id(id, options = {}) ⇒ Object
- #normalize_ids(ids: nil) ⇒ Object
-
#normalize_licenses(licenses) ⇒ Object
find Creative Commons or OSI license in licenses array, normalize url and name.
- #normalize_orcid(orcid) ⇒ Object
- #normalize_url(id) ⇒ Object
- #orcid_as_url(orcid) ⇒ Object
- #orcid_from_url(url) ⇒ Object
- #parse_attributes(element, options = {}) ⇒ Object
- #sanitize(text, options = {}) ⇒ Object
- #to_citeproc(element) ⇒ Object
- #to_ris(element) ⇒ Object
- #to_schema_org(element) ⇒ Object
- #to_schema_org_container(element, options = {}) ⇒ Object
- #to_schema_org_identifier(element, options = {}) ⇒ Object
- #validate_orcid(orcid) ⇒ Object
- #validate_orcid_scheme(orcid_scheme) ⇒ Object
- #validate_url(str) ⇒ Object
Instance Method Details
#find_from_format(id: nil, string: nil, ext: nil) ⇒ Object
323 324 325 326 327 328 329 330 331 332 333 |
# File 'lib/bolognese/utils.rb', line 323
# Picks the name of the input format, trying the identifier first, then the
# file extension, then the raw content.
#
# @param id [String, nil] identifier handed to find_from_format_by_id
# @param string [String, nil] raw metadata content
# @param ext [String, nil] file extension handed to find_from_format_by_ext
# @return [String, nil] format name; "datacite" when no argument is present
def find_from_format(id: nil, string: nil, ext: nil)
  return find_from_format_by_id(id) if id.present?
  return find_from_format_by_ext(string, ext: ext) if ext.present?
  return find_from_format_by_string(string) if string.present?

  "datacite"
end
#find_from_format_by_ext(string, options = {}) ⇒ Object
350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 |
# File 'lib/bolognese/utils.rb', line 350
# Determines the metadata format from a file extension, inspecting the content
# for the ambiguous ".xml" and ".json" extensions.
#
# @param string [String] raw file content (only read for ".xml" and ".json")
# @param options [Hash] expects :ext, the file extension including the leading dot
# @return [String, nil] format name, or nil when the extension/content is not recognized
def find_from_format_by_ext(string, options = {})
  case options[:ext]
  when ".bib"
    "bibtex"
  when ".ris"
    "ris"
  when ".xml"
    if Maremma.from_xml(string).to_h.dig("doi_records", "doi_record", "crossref")
      "crossref"
    elsif Nokogiri::XML(string, nil, 'UTF-8', &:noblanks).collect_namespaces.find { |_k, v| v.start_with?("http://datacite.org/schema/kernel") }
      "datacite"
    end
  when ".json"
    # Parse once; every JSON check below inspects the same document.
    json = Maremma.from_json(string).to_h
    if json.dig("@context").to_s.start_with?("http://schema.org", "https://schema.org")
      "schema_org"
    elsif json.dig("@context") == ("https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld")
      "codemeta"
    elsif json.dig("ris_type")
      "crosscite"
    elsif json.dig("schemaVersion").to_s.start_with?("http://datacite.org/schema/kernel")
      "datacite_json"
    elsif json.dig("issued", "date-parts").present?
      "citeproc"
    end
  end
end
#find_from_format_by_id(id) ⇒ Object
335 336 337 338 339 340 341 342 343 344 345 346 347 348 |
# File 'lib/bolognese/utils.rb', line 335
# Determines the metadata format from an identifier (DOI, ORCID, GitHub URL or
# any other URL).
#
# @param id [String] identifier; it is passed through normalize_id first
# @return [String, nil] "datacite"/"crossref" for DOIs of those registration
#   agencies (nil for other agencies), "orcid", "codemeta" for GitHub URLs,
#   otherwise "schema_org"
def find_from_format_by_id(id)
  normalized = normalize_id(id)

  if /\A(?:(http|https):\/(\/)?(dx\.)?(doi.org|handle.test.datacite.org)\/)?(doi:)?(10\.\d{4,5}\/.+)\z/.match(normalized)
    agency = get_doi_ra(normalized)
    return %w(DataCite Crossref).include?(agency) ? agency.downcase : nil
  end

  return "orcid" if /\A(?:(http|https):\/(\/)?orcid\.org\/)?(\d{4}-\d{4}-\d{4}-\d{3}[0-9X]+)\z/.match(normalized)
  return "codemeta" if /\A(http|https):\/(\/)?github\.com\/(.+)\z/.match(normalized)

  "schema_org"
end
#find_from_format_by_string(string) ⇒ Object
372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 |
# File 'lib/bolognese/utils.rb', line 372
# Determines the metadata format by probing the content itself. Checks run in
# a fixed order: XML formats, JSON formats, RIS, then BibTeX.
#
# @param string [String] raw metadata content
# @return [String, nil] format name, or nil when no probe matches
def find_from_format_by_string(string)
  return "crossref" if Maremma.from_xml(string).to_h.dig("doi_records", "doi_record", "crossref").present?
  return "datacite" if Nokogiri::XML(string, nil, 'UTF-8', &:noblanks).collect_namespaces.find { |k, v| v.start_with?("http://datacite.org/schema/kernel") }
  return "crosscite" if Maremma.from_json(string).to_h.dig("ris_type").present?
  return "schema_org" if Maremma.from_json(string).to_h.dig("@context").to_s.start_with?("http://schema.org", "https://schema.org")
  return "codemeta" if Maremma.from_json(string).to_h.dig("@context") == ("https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld")
  return "datacite_json" if Maremma.from_json(string).to_h.dig("schemaVersion").to_s.start_with?("http://datacite.org/schema/kernel")
  return "citeproc" if Maremma.from_json(string).to_h.dig("issued", "date-parts").present?
  return "ris" if string.start_with?("TY - ")

  "bibtex" if BibTeX.parse(string).first
end
#from_citeproc(element) ⇒ Object
579 580 581 582 583 584 585 586 587 588 589 590 591 592 |
# File 'lib/bolognese/utils.rb', line 579
# Converts citeproc name hashes ("given"/"family"/"literal") into hashes keyed
# by "@type", "name", "givenName" and "familyName".
# Note: each input hash is modified in place before the converted copy is built.
#
# @param element [Hash, Array<Hash>, nil] one or more citeproc name hashes
# @return [Object] the converted hashes, passed through `unwrap`
def from_citeproc(element)
  converted = Array.wrap(element).map do |contributor|
    organization = contributor["literal"].present?
    contributor["@type"] = organization ? "Organization" : "Person"
    contributor["name"] =
      if organization
        contributor["literal"]
      else
        [contributor["given"], contributor["family"]].compact.join(" ")
      end
    contributor["givenName"] = contributor["given"]
    contributor["familyName"] = contributor["family"]
    contributor.except("given", "family", "literal").compact
  end
  converted.unwrap
end
#from_schema_org(element) ⇒ Object
559 560 561 562 563 |
# File 'lib/bolognese/utils.rb', line 559
# Renames the "@type" and "@id" keys of the given element to "type" and "id"
# by delegating to map_hash_keys.
#
# @param element [Hash, Array<Hash>, nil] element(s) to convert
# @return [Object] whatever map_hash_keys returns for the element
def from_schema_org(element)
  map_hash_keys(element: element, mapping: { "@type" => "type", "@id" => "id" })
end
#get_date_from_date_parts(date_as_parts) ⇒ Object
687 688 689 690 691 |
# File 'lib/bolognese/utils.rb', line 687
# Builds a date string from a hash holding a "date-parts" array.
# Only the first entry of "date-parts" is used.
#
# @param date_as_parts [Hash] hash with a "date-parts" key, e.g. { "date-parts" => [[2015, 4, 1]] }
# @return [String] the result of get_date_from_parts for year, month and day
def get_date_from_date_parts(date_as_parts)
  parts = date_as_parts.fetch("date-parts", []).first
  get_date_from_parts(parts[0], parts[1], parts[2])
end
#get_date_from_parts(year, month = nil, day = nil) ⇒ Object
693 694 695 |
# File 'lib/bolognese/utils.rb', line 693
# Joins year, month and day into a dash-separated, zero-padded date string.
# Missing month/day (nil or 0) pad to "00" and are dropped from the result.
#
# @param year [#to_s] year, padded to four characters
# @param month [#to_s, nil] month, padded to two characters
# @param day [#to_s, nil] day, padded to two characters
# @return [String] e.g. "2015-04-01", "2015-04" or "2015"
def get_date_from_parts(year, month = nil, day = nil)
  padded = [[year, 4], [month, 2], [day, 2]].map do |value, width|
    value.to_s.rjust(width, '0')
  end
  padded.reject { |segment| segment == "00" }.join("-")
end
#get_date_parts(iso8601_time) ⇒ Object
678 679 680 681 682 683 684 685 |
# File 'lib/bolognese/utils.rb', line 678
# Splits an ISO 8601 date string into a citeproc-style "date-parts" hash.
# Year, month and day are read from fixed character positions; parts that
# convert to 0 (missing) are left out.
#
# @param iso8601_time [String, nil] e.g. "2015-04-01"
# @return [Hash] e.g. { 'date-parts' => [[2015, 4, 1]] }; [[]] for nil input
def get_date_parts(iso8601_time)
  return { 'date-parts' => [[]] } if iso8601_time.nil?

  numbers = [0..3, 5..6, 8..9].map { |range| iso8601_time[range].to_i }
  { 'date-parts' => [numbers.reject(&:zero?)] }
end
#get_date_parts_from_parts(year, month = nil, day = nil) ⇒ Object
697 698 699 |
# File 'lib/bolognese/utils.rb', line 697
# Builds a citeproc-style "date-parts" hash from separate year/month/day
# values. Values converting to 0 are omitted.
#
# @param year [#to_i] year
# @param month [#to_i, nil] month
# @param day [#to_i, nil] day
# @return [Hash] e.g. { 'date-parts' => [[2015, 4]] }
def get_date_parts_from_parts(year, month = nil, day = nil)
  numbers = [year, month, day].map(&:to_i)
  { 'date-parts' => [numbers.reject(&:zero?)] }
end
#get_datetime_from_iso8601(iso8601_time) ⇒ Object
Parsing of incomplete ISO 8601 timestamps such as 2015-04 is broken in the standard library, so ISO8601::DateTime is used instead. Returns nil if the timestamp is not valid ISO 8601.
723 724 725 726 727 |
# File 'lib/bolognese/utils.rb', line 723
# Parses an ISO 8601 timestamp with ISO8601::DateTime and converts it to a UTC
# time. Any StandardError raised while parsing or converting yields nil.
#
# @param iso8601_time [String] timestamp, possibly incomplete (e.g. "2015-04")
# @return [Time, nil] UTC time, or nil when parsing fails
def get_datetime_from_iso8601(iso8601_time)
  begin
    parsed = ISO8601::DateTime.new(iso8601_time)
    parsed.to_time.utc
  rescue
    nil
  end
end
#get_year_month(iso8601_time) ⇒ Object
701 702 703 704 705 706 707 708 |
# File 'lib/bolognese/utils.rb', line 701
# Extracts year and month as integers from an ISO 8601 date string, using
# fixed character positions. Parts that convert to 0 are omitted.
#
# @param iso8601_time [String, nil] e.g. "2015-04-01"
# @return [Array<Integer>] e.g. [2015, 4]; [] for nil input
def get_year_month(iso8601_time)
  return [] if iso8601_time.nil?

  [iso8601_time[0..3], iso8601_time[5..6]].map(&:to_i).reject(&:zero?)
end
#get_year_month_day(iso8601_time) ⇒ Object
710 711 712 713 714 715 716 717 718 |
# File 'lib/bolognese/utils.rb', line 710
# Extracts year, month and day as integers from an ISO 8601 date string, using
# fixed character positions. Parts that convert to 0 are omitted.
#
# @param iso8601_time [String, nil] e.g. "2015-04-01"
# @return [Array<Integer>] e.g. [2015, 4, 1]; [] for nil input
def get_year_month_day(iso8601_time)
  return [] if iso8601_time.nil?

  segments = [iso8601_time[0..3], iso8601_time[5..6], iso8601_time[8..9]]
  segments.map(&:to_i).reject(&:zero?)
end
#github_as_codemeta_url(url) ⇒ Object
668 669 670 671 672 673 674 675 676 |
# File 'lib/bolognese/utils.rb', line 668
# Builds the raw.githubusercontent.com URL of a repository's codemeta.json.
# (The method name was missing from the definition line; restored here.)
#
# @param url [String] GitHub URL, parsed with github_from_url
# @return [String, nil] raw URL using the release and path from the URL when it
#   already points at a codemeta.json file; otherwise the codemeta.json on the
#   master branch; nil when no owner can be extracted
def github_as_codemeta_url(url)
  github_hash = github_from_url(url)

  if github_hash[:path].to_s.end_with?("codemeta.json")
    "https://raw.githubusercontent.com/#{github_hash[:owner]}/#{github_hash[:repo]}/#{github_hash[:release]}/#{github_hash[:path]}"
  elsif github_hash[:owner].present?
    "https://raw.githubusercontent.com/#{github_hash[:owner]}/#{github_hash[:repo]}/master/codemeta.json"
  end
end
#github_as_owner_url(url) ⇒ Object
653 654 655 656 |
# File 'lib/bolognese/utils.rb', line 653
# Builds the GitHub URL of the owner named in the given GitHub URL.
#
# @param url [String] GitHub URL, parsed with github_from_url
# @return [String, nil] "https://github.com/<owner>", or nil without an owner
def github_as_owner_url(url)
  owner = github_from_url(url)[:owner]
  return unless owner.present?

  "https://github.com/#{owner}"
end
#github_as_release_url(url) ⇒ Object
663 664 665 666 |
# File 'lib/bolognese/utils.rb', line 663
# Builds the GitHub "tree" URL for the release named in the given GitHub URL.
#
# @param url [String] GitHub URL, parsed with github_from_url
# @return [String, nil] "https://github.com/<owner>/<repo>/tree/<release>",
#   or nil when the URL carries no release
def github_as_release_url(url)
  parts = github_from_url(url)
  return unless parts[:release].present?

  "https://github.com/#{parts[:owner]}/#{parts[:repo]}/tree/#{parts[:release]}"
end
#github_as_repo_url(url) ⇒ Object
658 659 660 661 |
# File 'lib/bolognese/utils.rb', line 658
# Builds the GitHub repository URL for the given GitHub URL.
#
# @param url [String] GitHub URL, parsed with github_from_url
# @return [String, nil] "https://github.com/<owner>/<repo>", or nil when the
#   URL carries no repository
def github_as_repo_url(url)
  parts = github_from_url(url)
  return unless parts[:repo].present?

  "https://github.com/#{parts[:owner]}/#{parts[:repo]}"
end
#github_from_url(url) ⇒ Object
630 631 632 633 634 635 636 637 638 639 |
# File 'lib/bolognese/utils.rb', line 630
# Splits an https://github.com/ URL into owner, repo, release and path.
# The URL path is split on "/": segment 0 is the owner, 1 the repo, 3 the
# release, and segments from index 4 on form the path. Segment 2 is ignored.
#
# @param url [String, nil] GitHub URL
# @return [Hash] subset of :owner, :repo, :release, :path (nil values removed);
#   {} when the URL does not start with https://github.com/
def github_from_url(url)
  return {} unless /\Ahttps:\/\/github\.com\/(.+)(?:\/)?(.+)?(?:\/tree\/)?(.*)\z/.match(url)

  segments = URI.parse(url).path[1..-1].split('/')
  owner, repo, _kind, release = segments
  # With exactly four segments the path is an empty string, which compact keeps.
  file_path = segments.length > 3 ? segments.drop(4).join("/") : nil

  { owner: owner, repo: repo, release: release, path: file_path }.compact
end
#github_owner_from_url(url) ⇒ Object
649 650 651 |
# File 'lib/bolognese/utils.rb', line 649
# Returns the owner part of a GitHub URL.
#
# @param url [String] GitHub URL, parsed with github_from_url
# @return [String, nil] the owner, or nil when none was found
def github_owner_from_url(url)
  github_from_url(url)[:owner]
end
#github_release_from_url(url) ⇒ Object
645 646 647 |
# File 'lib/bolognese/utils.rb', line 645
# Returns the release part of a GitHub URL.
#
# @param url [String] GitHub URL, parsed with github_from_url
# @return [String, nil] the release, or nil when none was found
def github_release_from_url(url)
  github_from_url(url)[:release]
end
#github_repo_from_url(url) ⇒ Object
641 642 643 |
# File 'lib/bolognese/utils.rb', line 641
# Returns the repository part of a GitHub URL.
#
# @param url [String] GitHub URL, parsed with github_from_url
# @return [String, nil] the repository name, or nil when none was found
def github_repo_from_url(url)
  github_from_url(url)[:repo]
end
#jsonlint(json) ⇒ Object
729 730 731 732 733 734 735 736 |
# File 'lib/bolognese/utils.rb', line 729
# Lints a JSON string with JsonLint and collects the reported errors.
#
# @param json [String, nil] JSON text to check
# @return [Array] collected errors; ["No JSON provided"] when json is blank
def jsonlint(json)
  return ["No JSON provided"] unless json.present?

  [].tap do |errors|
    # check_data is invoked via send, as in the original implementation.
    JsonLint::Linter.new.send(:check_data, json, errors)
  end
end
#map_hash_keys(element: nil, mapping: nil) ⇒ Object
565 566 567 568 569 570 571 572 573 574 575 576 577 |
# File 'lib/bolognese/utils.rb', line 565
# Renames the keys of one or more hashes according to a mapping; keys absent
# from the mapping are kept unchanged.
# NOTE(review): nested Hash values are always converted with to_schema_org,
# regardless of the mapping passed in — confirm this is intended for callers
# such as from_schema_org.
#
# @param element [Hash, Array<Hash>, nil] hash(es) whose keys are renamed
# @param mapping [Hash] old key => new key
# @return [Object] the renamed hash(es), passed through `unwrap`
def map_hash_keys(element: nil, mapping: nil)
  renamed = Array.wrap(element).map do |item|
    item.each_with_object({}) do |(key, value), result|
      result[mapping.fetch(key, key)] = value.is_a?(Hash) ? to_schema_org(value) : value
    end
  end
  renamed.unwrap
end
#normalize_id(id, options = {}) ⇒ Object
436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 |
# File 'lib/bolognese/utils.rb', line 436
# Normalizes an identifier: a valid DOI is returned in normalized DOI form,
# any other http(s) URL is cleaned up, everything else yields nil.
#
# @param id [String, nil] DOI or URL
# @param options [Hash] forwarded unchanged to normalize_doi
# @return [String, nil] normalized DOI or cleaned URL; nil for blank input,
#   non-http(s) URIs and unparseable URIs
def normalize_id(id, options = {})
  return nil unless id.present?

  # check for valid DOI
  doi = normalize_doi(id, options)
  return doi if doi.present?

  # check for valid HTTP uri
  uri = Addressable::URI.parse(id)
  return nil unless uri && uri.host && %w(http https).include?(uri.scheme)

  # clean up URL
  PostRank::URI.clean(id)
rescue Addressable::URI::InvalidURIError
  nil
end
#normalize_ids(ids: nil) ⇒ Object
474 475 476 477 478 479 480 |
# File 'lib/bolognese/utils.rb', line 474
# Converts one or more related-identifier hashes into "id"/"type"/"title"
# hashes, dropping nil values.
#
# @param ids [Hash, Array<Hash>, nil] hashes read via "@id", "@type",
#   "resourceTypeGeneral", "title" and "name"
# @return [Object] the converted hashes, passed through `unwrap`
def normalize_ids(ids: nil)
  normalized = Array.wrap(ids).map do |entry|
    identifier = normalize_id(entry["@id"])
    # Fall back from the explicit type to the DataCite resource type, then to the generic type.
    type = entry["@type"] || Metadata::DC_TO_SO_TRANSLATIONS[entry["resourceTypeGeneral"]] || "CreativeWork"
    title = entry["title"] || entry["name"]

    { "id" => identifier, "type" => type, "title" => title }.compact
  end
  normalized.unwrap
end
#normalize_licenses(licenses) ⇒ Object
find Creative Commons or OSI license in licenses array, normalize url and name
483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 |
# File 'lib/bolognese/utils.rb', line 483
# Finds a Creative Commons or OSI license in the licenses array and returns
# its normalized URL.
#
# @param licenses [Array<Hash>, Hash, nil] license hashes, each read via its "url" key
# @return [String, Object, nil] normalized HTTPS URL of the first license hosted
#   on creativecommons.org or opensource.org; the unmodified +licenses+
#   argument when there is no such license; nil when a URL cannot be parsed
def normalize_licenses(licenses)
  standard_licenses = Array.wrap(licenses).map { |l| URI.parse(l["url"]) }.select { |li| li.host && li.host[/(creativecommons.org|opensource.org)$/] }
  return licenses unless standard_licenses.present?

  # `uri` was previously never assigned, so reaching this point raised
  # NameError. Normalize the first standard license that was found.
  uri = standard_licenses.first

  # use HTTPS
  uri.scheme = "https"

  # use host name without subdomain
  uri.host = Array(/(creativecommons.org|opensource.org)/.match uri.host).last

  # normalize URLs
  if uri.host == "creativecommons.org"
    # drop a trailing "legalcode" segment and make sure the path ends with a slash
    uri.path = uri.path.split('/')[0..-2].join("/") if uri.path.split('/').last == "legalcode"
    uri.path << '/' unless uri.path.end_with?('/')
  else
    uri.path = uri.path.gsub(/(-license|\.php|\.html)/, '')
    uri.path = uri.path.sub(/(mit|afl|apl|osl|gpl|ecl)/) { |match| match.upcase }
    uri.path = uri.path.sub(/(artistic|apache)/) { |match| match.titleize }
    # rewrite a trailing version as "<name>-<major>.<minor>", defaulting minor to 0
    uri.path = uri.path.sub(/([^0-9\-]+)(-)?([1-9])?(\.)?([0-9])?$/) do
      m = Regexp.last_match
      text = m[1]

      if m[3].present?
        version = [m[3], m[5].presence || "0"].join(".")
        [text, version].join("-")
      else
        text
      end
    end
  end

  uri.to_s
rescue URI::InvalidURIError
  nil
end
#normalize_orcid(orcid) ⇒ Object
466 467 468 469 470 471 472 |
# File 'lib/bolognese/utils.rb', line 466
# Turns an ORCID identifier (bare or as URL) into an http://orcid.org/ URL.
#
# @param orcid [String, nil] ORCID identifier, checked with validate_orcid
# @return [String, nil] ORCID URL, or nil when validation fails
def normalize_orcid(orcid)
  validated = validate_orcid(orcid)

  # turn ORCID ID into URL
  validated.present? ? "http://orcid.org/" + Addressable::URI.encode(validated) : nil
end
#normalize_url(id) ⇒ Object
453 454 455 456 457 458 459 460 461 462 463 464 |
# File 'lib/bolognese/utils.rb', line 453
# Cleans up an http(s) URL with PostRank::URI.clean.
#
# @param id [String, nil] URL to normalize
# @return [String, nil] cleaned URL; nil for blank input, URIs without a host,
#   non-http(s) schemes and unparseable URIs
def normalize_url(id)
  return nil unless id.present?

  # check for valid HTTP uri
  parsed = Addressable::URI.parse(id)
  http_uri = parsed && parsed.host && %w(http https).include?(parsed.scheme)

  # clean up URL
  http_uri ? PostRank::URI.clean(id) : nil
rescue Addressable::URI::InvalidURIError
  nil
end
#orcid_as_url(orcid) ⇒ Object
398 399 400 |
# File 'lib/bolognese/utils.rb', line 398
# Prefixes an ORCID identifier with https://orcid.org/.
#
# @param orcid [String, nil] ORCID identifier
# @return [String, nil] ORCID URL, or nil when orcid is blank
def orcid_as_url(orcid)
  return unless orcid.present?

  "https://orcid.org/#{orcid}"
end
#orcid_from_url(url) ⇒ Object
394 395 396 |
# File 'lib/bolognese/utils.rb', line 394
# Extracts the ORCID identifier from an ORCID URL.
#
# @param url [String, nil] URL such as "https://orcid.org/0000-0002-1825-0097"
# @return [String, nil] everything after "orcid.org/", or nil when the input
#   is not an http(s) orcid.org URL
def orcid_from_url(url)
  # The pattern used to start with `\A:`, i.e. it demanded a literal colon
  # before the scheme, so no real URL ever matched. An optional "www." is
  # accepted for consistency with validate_orcid.
  match = /\A(?:http|https):\/\/(?:www\.)?orcid\.org\/(.+)/.match(url.to_s)
  match && match[1]
end
#parse_attributes(element, options = {}) ⇒ Object
421 422 423 424 425 426 427 428 429 430 431 432 433 434 |
# File 'lib/bolognese/utils.rb', line 421
# Extracts the text content from a parsed XML element, which may be a plain
# string, a hash carrying the text under a content key, or an array of either.
#
# @param element [String, Hash, Array, Object] parsed element
# @param options [Hash] :content — key holding the text (default "__content__");
#   :first — for arrays, return only the first value
# @return [Object, nil] the string itself, the hash's content value, the unique
#   array values (first one, or passed through `unwrap`), or nil for other input
def parse_attributes(element, options = {})
  content = options[:content] || "__content__"

  case element
  when String
    element
  when Hash
    element.fetch(content, nil)
  when Array
    values = element.map { |e| e.is_a?(Hash) ? e.fetch(content, nil) : e }.uniq
    options[:first] ? values.first : values.unwrap
  end
end
#sanitize(text, options = {}) ⇒ Object
613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 |
# File 'lib/bolognese/utils.rb', line 613
# Strips markup from text with a Loofah whitelist scrubber, replaces
# non-breaking spaces with regular spaces and trims the result. Hashes and
# arrays are unpacked first.
# NOTE(review): the recursive calls for hashes and array members do not forward
# +options+, so nested values are scrubbed with the default tags — confirm this
# is intended.
#
# @param text [String, Hash, Array, Object] value to sanitize
# @param options [Hash] :tags — Set of allowed tags (a default set is stored
#   into the hash when absent); :content — key holding the text in hashes
#   (default "__content__"); :first — for arrays, return only the first value
# @return [Object, nil] sanitized string, the sanitized unique array values
#   (first one, or passed through `unwrap`), or nil for other input
def sanitize(text, options = {})
  options[:tags] ||= Set.new(%w(strong em b i code pre sub sup br))
  content = options[:content] || "__content__"
  custom_scrubber = Bolognese::WhitelistScrubber.new(options)

  if text.is_a?(String)
    Loofah.scrub_fragment(text, custom_scrubber).to_s.gsub(/\u00a0/, ' ').strip
  elsif text.is_a?(Hash)
    sanitize(text.fetch(content, nil))
  elsif text.is_a?(Array)
    a = text.map { |e| e.is_a?(Hash) ? sanitize(e.fetch(content, nil)) : sanitize(e) }.uniq
    options[:first] ? a.first : a.unwrap
  else
    nil
  end
end
#to_citeproc(element) ⇒ Object
594 595 596 597 598 599 600 601 |
# File 'lib/bolognese/utils.rb', line 594
# Converts name hashes keyed by "familyName"/"givenName"/"name" into citeproc
# name hashes ("family"/"given"/"literal").
# Note: each input hash is modified in place before the converted copy is built.
#
# @param element [Hash, Array<Hash>, nil] one or more name hashes
# @return [Array<Hash>, nil] converted hashes, or nil when there are none
def to_citeproc(element)
  names = Array.wrap(element).map do |person|
    person["family"] = person["familyName"]
    person["given"] = person["givenName"]
    # Without a family name the full name is carried as a literal.
    person["literal"] = person["name"] unless person["familyName"].present?
    person.except("type", "@type", "id", "@id", "name", "familyName", "givenName").compact
  end
  names.presence
end
#to_ris(element) ⇒ Object
603 604 605 606 607 608 609 610 611 |
# File 'lib/bolognese/utils.rb', line 603
# Formats name hashes for RIS output: "Family, Given" when a family name is
# present, otherwise the plain "name" value.
#
# @param element [Hash, Array<Hash>, nil] one or more name hashes
# @return [Object] the formatted names, passed through `unwrap`
def to_ris(element)
  formatted = Array.wrap(element).map do |person|
    next person["name"] unless person["familyName"].present?

    [person["familyName"], person["givenName"]].join(", ")
  end
  formatted.unwrap
end
#to_schema_org(element) ⇒ Object
519 520 521 522 523 |
# File 'lib/bolognese/utils.rb', line 519
# Renames the "type", "id" and "title" keys of the given element to "@type",
# "@id" and "name" by delegating to map_hash_keys.
#
# @param element [Hash, Array<Hash>, nil] element(s) to convert
# @return [Object] whatever map_hash_keys returns for the element
def to_schema_org(element)
  map_hash_keys(
    element: element,
    mapping: { "type" => "@type", "id" => "@id", "title" => "name" }
  )
end
#to_schema_org_container(element, options = {}) ⇒ Object
525 526 527 528 529 530 531 532 533 534 535 |
# File 'lib/bolognese/utils.rb', line 525
# Builds a schema.org-style container hash from a container element and/or a
# container title.
# Note: a Hash passed as +element+ is modified in place ("type" is always
# overwritten, "title" is filled in when missing).
#
# @param element [Hash, nil] container hash
# @param options [Hash] :type — "Dataset" selects "DataCatalog", anything else
#   "Periodical"; :container_title — fallback title
# @return [Object, nil] result of map_hash_keys; nil unless element is a Hash,
#   or element is nil and a container title is given
def to_schema_org_container(element, options = {})
  return nil unless (element.is_a?(Hash) || (element.nil? && options[:container_title].present?))

  mapping = { "type" => "@type", "id" => "@id", "title" => "name" }

  element ||= {}
  element["type"] = (options[:type] == "Dataset") ? "DataCatalog" : "Periodical"
  element["title"] ||= options[:container_title]

  map_hash_keys(element: element, mapping: mapping)
end
#to_schema_org_identifier(element, options = {}) ⇒ Object
537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 |
# File 'lib/bolognese/utils.rb', line 537
# Wraps an identifier in a schema.org PropertyValue hash, optionally followed
# by alternate identifiers.
#
# @param element [String] identifier; propertyID is "doi" when normalize_doi
#   accepts it, otherwise "url"
# @param options [Hash] :alternate_identifier — hash(es) read via "type" and
#   "name"; entries of type "url" contribute their bare "name" value
# @return [Hash, Array] the PropertyValue hash alone, or an array starting with
#   it when alternate identifiers are present
def to_schema_org_identifier(element, options = {})
  ident = {
    "@type" => "PropertyValue",
    "propertyID" => normalize_doi(element) ? "doi" : "url",
    "value" => element
  }

  return ident unless options[:alternate_identifier].present?

  alternates = Array.wrap(options[:alternate_identifier]).map do |ai|
    if ai["type"].to_s.downcase == "url"
      ai["name"]
    else
      {
        "@type" => "PropertyValue",
        "propertyID" => ai["type"],
        "value" => ai["name"]
      }
    end
  end

  [ident] + alternates
end
#validate_orcid(orcid) ⇒ Object
402 403 404 405 |
# File 'lib/bolognese/utils.rb', line 402
# Validates an ORCID identifier, given bare or as an orcid.org URL, and
# returns it with whitespace separators replaced by dashes.
#
# @param orcid [String, nil] ORCID identifier or URL
# @return [String, nil] dash-separated identifier, or nil when the input does
#   not match the ORCID pattern
def validate_orcid(orcid)
  pattern = /\A(?:(http|https):\/\/(www\.)?orcid\.org\/)?(\d{4}[[:space:]-]\d{4}[[:space:]-]\d{4}[[:space:]-]\d{3}[0-9X]+)\z/
  # The identifier is the last capture group; a match always yields a non-empty string.
  identifier = Array(pattern.match(orcid)).last
  identifier&.gsub(/[[:space:]]/, "-")
end
#validate_orcid_scheme(orcid_scheme) ⇒ Object
407 408 409 |
# File 'lib/bolognese/utils.rb', line 407
# Checks that a scheme URI points at orcid.org.
#
# @param orcid_scheme [String, nil] scheme URI, e.g. "http://orcid.org"
# @return [String, nil] "orcid.org" when the URI starts with http(s)://orcid.org
#   (optionally with "www."), otherwise nil
def validate_orcid_scheme(orcid_scheme)
  match = /\A(http|https):\/\/(www\.)?(orcid\.org)/.match(orcid_scheme)
  match && match[3]
end
#validate_url(str) ⇒ Object
411 412 413 414 415 416 417 418 419 |
# File 'lib/bolognese/utils.rb', line 411
# Classifies a string as a DOI, a URL or an ISSN. DOIs are checked first, so a
# doi.org URL is reported as "DOI".
#
# @param str [String, nil] string to classify
# @return [String, nil] "DOI", "URL", "ISSN", or nil when nothing matches
def validate_url(str)
  return "DOI" if /\A(?:(http|https):\/\/(dx\.)?doi.org\/)?(doi:)?(10\.\d{4,5}\/.+)\z/.match(str)
  return "URL" if /\A(http|https):\/\//.match(str)
  return "ISSN" if /\A(ISSN|eISSN) (\d{4}-\d{3}[0-9X]+)\z/.match(str)

  nil
end