Class: RelatonIso::Scrapper

Inherits:
Object
  • Object
show all
Defined in:
lib/relaton_iso/scrapper.rb

Overview

Scrapper.

Constant Summary collapse

DOMAIN =

rubocop:disable Metrics/ModuleLength

"https://www.iso.org"
# Maps a document-type code ("TS", "TR", "PAS", ...) to its hyphenated
# long-form type name. "DTS"/"TS" and "DTR"/"TR" share a value, and both
# "ISO" and "IEC" map to "international-standard". The entries that are
# commented out below are not part of the table.
TYPES =
{
  "TS" => "technical-specification",
  "DTS" => "technical-specification",
  "TR" => "technical-report",
  "DTR" => "technical-report",
  "PAS" => "publicly-available-specification",
  # "AWI" => "approvedWorkItem",
  # "CD" => "committeeDraft",
  # "FDIS" => "finalDraftInternationalStandard",
  # "NP" => "newProposal",
  # "DIS" => "draftInternationalStandard",
  # "WD" => "workingDraft",
  # "R" => "recommendation",
  "Guide" => "guide",
  "ISO" => "international-standard",
  "IEC" => "international-standard",
  "IWA" => "international-workshop-agreement",
}.freeze
# Maps a two-digit code to an abbreviation string. The "60" entry is the
# exception: it holds a nested hash keyed by a second two-digit code.
# Only the outer hash is frozen; the nested hash is not.
# NOTE(review): going by the constant name these are presumably ISO
# stage / substage codes — confirm against the callers.
STGABBR =
{
  "00" => "NWIP",
  "10" => "AWI",
  "20" => "WD",
  "30" => "CD",
  "40" => "DIS",
  "50" => "FDIS",
  "60" => { "00" => "PRF", "60" => "FINAL" },
}.freeze
# Publishing organizations keyed by abbreviation. Each entry holds the
# organization's full :name and its website :url (stored without a scheme).
PUBLISHERS = {
  "IEC" => { name: "International Electrotechnical Commission",
             url: "www.iec.ch" },
  "ISO" => { name: "International Organization for Standardization",
             url: "www.iso.org" },
  "IEEE" => { name: "Institute of Electrical and Electronics Engineers",
              url: "www.ieee.org" },
  "SAE" => { name: "SAE International", url: "www.sae.org" },
  # The name previously started with a stray space, which leaked into output.
  "CIE" => { name: "International Commission on Illumination",
             url: "cie.co.at" },
  "ASME" => { name: "American Society of Mechanical Engineers",
              url: "www.asme.org" },
}.freeze

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(lang, errors) ⇒ Scrapper

extend self



53
54
55
56
# File 'lib/relaton_iso/scrapper.rb', line 53

# Store the settings shared by every parsing step.
#
# @param lang [String, nil] language, kept in @lang
# @param errors [Hash] collection of parsing errors, kept in @errors
#   (the same object is retained, not a copy)
def initialize(lang, errors)
  @errors = errors
  @lang = lang
end

Class Method Details

.parse_page(path, lang: nil, errors: {}) ⇒ RelatonIsoBib::IsoBibliographicItem

Parse page.

Parameters:

  • path (String)

    page path

  • lang (String, nil) (defaults to: nil)

    language

  • errors (Hash) (defaults to: {})

    collection of parsing errors

Returns:

  • (RelatonIsoBib::IsoBibliographicItem)


63
64
65
# File 'lib/relaton_iso/scrapper.rb', line 63

# Parse a page with a freshly built scrapper.
#
# @param path [String] page path
# @param lang [String, nil] language
# @param errors [Hash] collection of parsing errors
# @return [RelatonIsoBib::IsoBibliographicItem] whatever #parse returns
def self.parse_page(path, lang: nil, errors: {})
  scrapper = new(lang, errors)
  scrapper.parse(path)
end

Instance Method Details

#edition ⇒ Object



111
112
113
114
115
116
117
# File 'lib/relaton_iso/scrapper.rb', line 111

# Edition number scraped from the page, computed once and then cached
# (a nil result is cached as well).
#
# @return [String, nil] the trailing digits of the edition text ("" when the
#   text does not end in digits), or nil when the node is not found
def edition
  return @edition if instance_variable_defined?(:@edition)

  # Last text node of the div that has a child div reading exactly "Edition".
  node = @doc.at("//div[div[.='Edition']]/text()[last()]")
  # Only an already-truthy flag is overwritten; it becomes true when the node is missing.
  @errors[:edition] = node.nil? if @errors[:edition]
  @edition = node && node.text[/\d+$/].to_s
end

#fetch_relaton_docids ⇒ Array<RelatonBib::DocumentIdentifier>

Create document ids.

Returns:

  • (Array<RelatonBib::DocumentIdentifier>)


124
125
126
127
128
129
130
131
# File 'lib/relaton_iso/scrapper.rb', line 124

# Create document ids.
#
# The pubid's stage is filled in from stage_code first, but only when it is
# not already set. Three identifiers are then built, in this order:
# the primary "ISO" id, the "iso-reference" id (from #isoref) and the "URN" id.
#
# @return [Array<RelatonBib::DocumentIdentifier>]
def fetch_relaton_docids
  pubid.stage ||= Pubid::Iso::Identifier.parse_stage(stage_code)

  primary = DocumentIdentifier.new(id: pubid, type: "ISO", primary: true)
  reference = RelatonBib::DocumentIdentifier.new(id: isoref, type: "iso-reference")
  urn = DocumentIdentifier.new(id: pubid, type: "URN")
  [primary, reference, urn]
end

#id ⇒ Object



93
94
95
96
97
98
99
# File 'lib/relaton_iso/scrapper.rb', line 93

# Document identifier text scraped from the page heading, computed once and
# then cached (a nil result is cached as well).
#
# The text of the first <span> under <h1> is split on " | " and the first
# part is returned with surrounding whitespace removed.
#
# @return [String, nil] the identifier, or nil when the heading span is
#   missing or its text is empty
def id
  return @id if defined?(@id)

  did = @doc.at("//h1/span[1]")
  # Only an already-truthy flag is overwritten; it becomes true when the span is missing.
  @errors[:id] &&= did.nil?
  # An empty heading splits into [], so `first` is nil; `&.` avoids a NoMethodError.
  @id = did && did.text.split(" | ").first&.strip
end

#isoref ⇒ String

Create ISO reference identifier with English language.

Returns:

  • (String)

    English reference identifier



138
139
140
141
# File 'lib/relaton_iso/scrapper.rb', line 138

# Create ISO reference identifier with English language.
#
# A new identifier is created from the pubid's attributes (minus
# :typed_stage) together with language "en", then rendered with the
# :ref_num_short format.
#
# @return [String] English reference identifier
def isoref
  attrs = pubid.to_h.reject { |key, _value| key == :typed_stage }
  english = Pubid::Iso::Identifier.create(language: "en", **attrs)
  english.to_s(format: :ref_num_short)
end

#parse(path) ⇒ Object

rubocop:disable Metrics/AbcSize,Metrics/MethodLength



67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# File 'lib/relaton_iso/scrapper.rb', line 67

# Fetch the page at +path+ and build a bibliographic item from it.
#
# The fetched document and URL are kept in @doc and @url for the fetch_*
# helpers. Attribute values are computed in the order listed below.
#
# @param path [String] page path
# @return [RelatonIsoBib::IsoBibliographicItem]
def parse(path) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  @doc, @url = get_page(path)
  titles, abstract, langs = fetch_titles_abstract

  item_class = RelatonIsoBib::IsoBibliographicItem
  attributes = {
    docid: fetch_relaton_docids,
    docnumber: fetch_docnumber,
    edition: edition,
    language: langs.map { |entry| entry[:lang] },
    script: langs.map { |entry| script(entry[:lang]) }.uniq,
    title: titles,
    doctype: fetch_type,
    docstatus: fetch_status,
    ics: fetch_ics,
    date: fetch_dates,
    contributor: fetch_contributors,
    editorialgroup: fetch_workgroup,
    abstract: abstract,
    copyright: fetch_copyright,
    link: fetch_link(@url),
    relation: fetch_relations,
    place: ["Geneva"],
    structuredidentifier: fetch_structuredidentifier,
  }
  item_class.new(**attributes)
end

#pubid ⇒ Object



101
102
103
104
105
106
107
108
109
# File 'lib/relaton_iso/scrapper.rb', line 101

# Parsed publication identifier for the page, cached after the first
# successful parse.
#
# When the parsed identifier has a base, the edition of its root is filled in
# from #edition unless one is already present. Any StandardError raised along
# the way is reported through Util.error, and that call's result is returned.
#
# @return [Object] the parsed identifier, or the result of Util.error on failure
def pubid
  unless @pubid
    # Assigned before the edition step, so a failure there leaves @pubid set.
    @pubid = Pubid::Iso::Identifier.parse(id)
    @pubid.root.edition ||= edition if @pubid.base
  end
  @pubid
rescue StandardError => e
  Util.error "Failed to parse pubid from #{id}: #{e.message}"
end