Module: RelatonIso::Scrapper

Defined in:
lib/relaton_iso/scrapper.rb

Overview

Scrapper: builds bibliographic items from pages on the ISO website.

Constant Summary collapse

# Scheme and host of the ISO website.
DOMAIN = "https://www.iso.org"
# Document-type abbreviations mapped to hyphenated document-type names.
# The commented-out entries are disabled mappings kept for reference.
TYPES = {
  "TS" => "technical-specification",
  "TR" => "technical-report",
  "PAS" => "publicly-available-specification",
  # "AWI" => "approvedWorkItem",
  # "CD" => "committeeDraft",
  # "FDIS" => "finalDraftInternationalStandard",
  # "NP" => "newProposal",
  # "DIS" => "draftInternationalStandard",
  # "WD" => "workingDraft",
  # "R" => "recommendation",
  "Guide" => "guide",
}.freeze
# Two-digit codes mapped to stage abbreviations.
# The "60" entry is itself a hash keyed by a second two-digit code
# (NOTE(review): presumably the substage - confirm against the caller).
STGABBR = {
  "00" => "NWIP",
  "10" => "AWI",
  "20" => "WD",
  "30" => "CD",
  "40" => "DIS",
  "50" => "FDIS",
  "60" => {
    "00" => "PRF",
    "60" => "FINAL",
  },
}.freeze
# Publisher abbreviations mapped to the organization's full name and
# website host (no scheme).
PUBLISHERS =
{
  "IEC" => { name: "International Electrotechnical Commission",
             url: "www.iec.ch" },
  "ISO" => { name: "International Organization for Standardization",
             url: "www.iso.org" },
  "IEEE" => { name: "Institute of Electrical and Electronics Engineers",
              url: "www.ieee.org" },
  "SAE" => { name: "SAE International", url: "www.sae.org" },
  # The name previously started with a stray space, which would end up in
  # anything built from this entry.
  "CIE" => { name: "International Commission on Illumination",
             url: "cie.co.at" },
  "ASME" => { name: "American Society of Mechanical Engineers",
              url: "www.asme.org" },
}.freeze

Class Method Summary collapse

Class Method Details

.parse_page(hit_data, lang = nil) ⇒ RelatonIsoBib::IsoBibliographicItem

Parse page.



57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# File 'lib/relaton_iso/scrapper.rb', line 57

# Fetches the page for a search hit and assembles a bibliographic item
# from it.
#
# @param hit_data [Hash] search hit; its "splitPath", "csnumber" and
#   "docRef" entries are read here
# @param lang [String, nil] handed unchanged to fetch_titles_abstract
#   (NOTE(review): presumably a language filter - confirm)
# @return [RelatonIsoBib::IsoBibliographicItem]
def parse_page(hit_data, lang = nil) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
  split_path = hit_data["splitPath"]
  csnumber = hit_data["csnumber"]
  doc, url = get_page "/contents/data/standard#{split_path}/#{csnumber}.html"

  # Edition: the first run of digits in the text of the last child of the
  # element that contains the "Edition" label. Every step tolerates nil.
  edition_parent = doc&.xpath("//strong[contains(text(), 'Edition')]/..")
  edition_text = edition_parent&.children&.last&.text
  edition = edition_text&.match(/\d+/)&.to_s

  titles, abstract, langs = fetch_titles_abstract(doc, lang)

  # Entries are evaluated top to bottom, so the helper calls run in the
  # order listed.
  attributes = {
    fetched: Date.today.to_s,
    docid: fetch_docid(hit_data, langs),
    docnumber: fetch_docnumber(doc),
    edition: edition,
    language: langs.map { |entry| entry[:lang] },
    script: langs.map { |entry| script(entry[:lang]) }.uniq,
    title: titles,
    doctype: fetch_type(hit_data["docRef"]),
    docstatus: fetch_status(doc),
    ics: fetch_ics(doc),
    date: fetch_dates(doc, hit_data["docRef"]),
    contributor: fetch_contributors(hit_data["docRef"]),
    editorialgroup: fetch_workgroup(doc),
    abstract: abstract,
    copyright: fetch_copyright(hit_data["docRef"], doc),
    link: fetch_link(doc, url),
    relation: fetch_relations(doc),
    place: ["Geneva"],
    structuredidentifier: fetch_structuredidentifier(doc),
  }

  RelatonIsoBib::IsoBibliographicItem.new(**attributes)
end