Module: RelatonIso::Scrapper
- Defined in:
- lib/relaton_iso/scrapper.rb
Overview
Scrapper: fetches pages from the ISO website and builds bibliographic items from them.
Constant Summary collapse
# Base URL of the ISO website.
# NOTE(review): the extracted listing shows the text
# "rubocop:disable Metrics/ModuleLength" next to DOMAIN; it presumably belongs
# to the enclosing module line rather than to this constant - confirm.
DOMAIN = "https://www.iso.org"

# Maps a document-type abbreviation to the corresponding type name.
# The commented-out entries are kept exactly as they appear in the listing.
TYPES = {
  "TS" => "technical-specification",
  "TR" => "technical-report",
  "PAS" => "publicly-available-specification",
  # "AWI" => "approvedWorkItem",
  # "CD" => "committeeDraft",
  # "FDIS" => "finalDraftInternationalStandard",
  # "NP" => "newProposal",
  # "DIS" => "draftInternationalStandard",
  # "WD" => "workingDraft",
  # "R" => "recommendation",
  "Guide" => "guide",
}.freeze
# Maps a two-digit code to a stage abbreviation. The "60" entry is itself a
# hash keyed by a second two-digit code.
# NOTE(review): presumably these are stage / substage codes as produced by
# stage_code - confirm against the lookup site.
STGABBR = {
  "00" => "NWIP",
  "10" => "AWI",
  "20" => "WD",
  "30" => "CD",
  "40" => "DIS",
  "50" => "FDIS",
  "60" => {
    "00" => "PRF",
    "60" => "FINAL",
  },
}.freeze
# Publisher details keyed by organization abbreviation. Each entry holds the
# organization's full name (:name) and its website host (:url).
PUBLISHERS = {
  "IEC" => { name: "International Electrotechnical Commission", url: "www.iec.ch" },
  "ISO" => { name: "International Organization for Standardization", url: "www.iso.org" },
  "IEEE" => { name: "Institute of Electrical and Electronics Engineers", url: "www.ieee.org" },
  "SAE" => { name: "SAE International", url: "www.sae.org" },
  # The name previously carried a stray leading space, which would end up in
  # any output built from it.
  "CIE" => { name: "International Commission on Illumination", url: "cie.co.at" },
  "ASME" => { name: "American Society of Mechanical Engineers", url: "www.asme.org" },
}.freeze
Class Method Summary collapse
-
.fetch_relaton_docids(doc, pubid) ⇒ Array<RelatonBib::DocumentIdentifier>
Create document ids.
-
.parse_page(hit, lang = nil) ⇒ RelatonIsoBib::IsoBibliographicItem
Parse page.
Class Method Details
.fetch_relaton_docids(doc, pubid) ⇒ Array<RelatonBib::DocumentIdentifier>
Create document ids.
100 101 102 103 104 105 106 |
# File 'lib/relaton_iso/scrapper.rb', line 100
#
# Create document ids.
#
# Stores the stage code of +doc+ (converted with +to_f+) on +pubid+ as
# +urn_stage+, then wraps +pubid+ in two identifiers: a primary one of type
# "ISO" and one of type "URN".
#
# @param doc the page document handed to +stage_code+
# @param pubid the publication identifier; must respond to +urn_stage=+
# @return [Array<RelatonBib::DocumentIdentifier>] the ISO identifier followed
#   by the URN identifier (built as RelatonIso::DocumentIdentifier instances)
def fetch_relaton_docids(doc, pubid)
  pubid.urn_stage = stage_code(doc).to_f

  iso_id = RelatonIso::DocumentIdentifier.new(id: pubid, type: "ISO", primary: true)
  urn_id = RelatonIso::DocumentIdentifier.new(id: pubid, type: "URN")
  [iso_id, urn_id]
end
.parse_page(hit, lang = nil) ⇒ RelatonIsoBib::IsoBibliographicItem
Parse page.
56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
# File 'lib/relaton_iso/scrapper.rb', line 56
#
# Parse page.
#
# Fetches the page referenced by +hit+, reads the edition number from it and,
# when one is found, stores it on +hit.pubid+. The remaining attributes are
# collected through the fetch_* helpers and passed to the bibliographic item
# constructor.
#
# @param hit the search hit; must respond to +hit+ (indexed with +:path+ and
#   +:title+) and to +pubid+
# @param lang language selector forwarded to +fetch_titles_abstract+
#   (defaults to nil)
# @return [RelatonIsoBib::IsoBibliographicItem] the item built from the page
def parse_page(hit, lang = nil) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
  # path = "/contents/data/standard#{hit_data['splitPath']}/"
  # "#{hit_data['csnumber']}.html"
  page_path = hit.hit[:path].sub("/sites/isoorg", "")
  doc, url = get_page "#{page_path}.html"

  # Fetch edition: the first run of digits in the text of the last child of
  # the element that contains the "Edition" label.
  edition_node = doc&.xpath("//strong[contains(text(), 'Edition')]/..")&.children&.last
  edition = edition_node&.text&.match(/\d+/)&.to_s
  hit.pubid.edition = edition if edition

  titles, abstract, langs = fetch_titles_abstract(doc, lang)

  # Entries are evaluated top to bottom, so the helpers run in the same order
  # as when they were written directly as keyword arguments.
  attributes = {
    fetched: Date.today.to_s,
    docid: fetch_relaton_docids(doc, hit.pubid),
    docnumber: fetch_docnumber(hit.pubid),
    edition: edition,
    language: langs.map { |entry| entry[:lang] },
    script: langs.map { |entry| script(entry[:lang]) }.uniq,
    title: titles,
    doctype: fetch_type(hit.hit[:title]),
    docstatus: fetch_status(doc),
    ics: fetch_ics(doc),
    date: fetch_dates(doc, hit.hit[:title]),
    contributor: fetch_contributors(hit.hit[:title]),
    editorialgroup: fetch_workgroup(doc),
    abstract: abstract,
    copyright: fetch_copyright(doc),
    link: fetch_link(doc, url),
    relation: fetch_relations(doc),
    place: ["Geneva"],
    structuredidentifier: fetch_structuredidentifier(hit.pubid),
  }
  RelatonIsoBib::IsoBibliographicItem.new(**attributes)
end