Module: RelatonGb::Scrapper

Included in:
GbScrapper, SecScrapper, TScrapper
Defined in:
lib/relaton_gb/scrapper.rb

Overview

Common scraping methods.

Instance Method Summary collapse

Instance Method Details

#fetch_structuredidentifier(docref) ⇒ RelatonIsoBib::StructuredIdentifier

Parameters:

  • docref (String)

Returns:

  • (RelatonIsoBib::StructuredIdentifier)


46
47
48
49
50
51
52
# File 'lib/relaton_gb/scrapper.rb', line 46

# Builds the structured identifier for a document reference.
#
# The project number is the leading stretch of the reference that contains
# no dash (-, –, —) or dot and ends in digits. Digits that directly follow a
# dot after that stretch become the part number; when there is no such part
# the part number is an empty string.
# Example: "GB/T 5606.1-2004" gives project "GB/T 5606" and part "1".
#
# A reference the pattern does not match results in a NoMethodError, because
# the match result is used without a nil check.
#
# @param docref [String] document reference
# @return [RelatonIsoBib::StructuredIdentifier]
def fetch_structuredidentifier(docref)
  project, part = docref.match(/^([^–—.-]*\d+)\.?((?<=\.)\d+|)/).captures
  RelatonIsoBib::StructuredIdentifier.new(
    project_number: project,
    part_number: part,
    prefix: nil,
    id: docref,
    type: "Chinese Standard"
  )
end

#get_contributors(doc, docref) ⇒ Array<Hash>

Parameters:

  • doc (Nokogiri::HTML::Document)
  • docref (String)

Returns:

  • (Array<Hash>)


57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/relaton_gb/scrapper.rb', line 57

# Builds the contributor list for a document: a single publisher
# organization whose name is given in English and in Chinese.
#
# The agency prefix is the first whitespace-free token of the reference.
# For prefixes that do not start with "GB", a trailing "/T" or "/Z" is
# removed before the agency name is looked up.
#
# @param doc [Nokogiri::HTML::Document]
# @param docref [String] document reference
# @return [Array<Hash>] one hash with :entity and :role keys
def get_contributors(doc, docref)
  agencies = %w[en zh].map { |lang| [lang, GbAgencies::Agencies.new(lang, {}, "")] }

  prefix = docref.match(/^[^\s]+/).to_s
  prefix.sub!(%r{/[TZ]$}, "") unless prefix =~ /^GB/

  gbtype = get_gbtype(doc, docref)
  localized_names = agencies.map do |lang, agency|
    { language: lang, content: agency.standard_agency1(gbtype[:scope], prefix, gbtype[:mandate]) }
  end

  publisher = RelatonBib::Organization.new(name: localized_names)
  [{ entity: publisher, role: [{ type: "publisher" }] }]
end

#get_docid(docref) ⇒ Array<RelatonBib::DocumentIdentifier>

Parameters:

  • docref (String)

Returns:

  • (Array<RelatonBib::DocumentIdentifier>)


40
41
42
# File 'lib/relaton_gb/scrapper.rb', line 40

# Wraps the document reference in a single "Chinese Standard" identifier.
#
# @param docref [String] document reference
# @return [Array<RelatonBib::DocumentIdentifier>] one-element array
def get_docid(docref)
  identifier = RelatonBib::DocumentIdentifier.new(id: docref, type: "Chinese Standard")
  [identifier]
end

#get_status(doc, status = nil) ⇒ RelatonBib::DocumentStatus

Parameters:

  • doc (Nokogiri::HTML::Document)
  • status (String, NilClass) (defaults to: nil)

Returns:

  • (RelatonBib::DocumentStatus)


93
94
95
96
97
98
99
100
# File 'lib/relaton_gb/scrapper.rb', line 93

# Translates the Chinese status label of a standard into a document stage.
#
# The label comes from +status+ when given, otherwise from the status cell
# of the page. Labels other than the three listed below (or no label at all)
# produce a status whose stage is nil.
#
# @param doc [Nokogiri::HTML::Document]
# @param status [String, NilClass] status label that overrides the page
# @return [RelatonBib::DocumentStatus]
def get_status(doc, status = nil)
  stage_by_label = {
    "即将实施" => "published",
    "现行" => "activated",
    "废止" => "obsoleted",
  }
  label = status
  label ||= doc.at("//td[contains(., '标准状态')]/span")&.text
  RelatonBib::DocumentStatus.new(stage: stage_by_label[label])
end

#get_titles(doc) ⇒ Array<Hash>

Returns an array of title hashes, each with the keys:

  • :title_intro [String]

  • :title_main [String]

  • :language [String]

  • :script [String]

Parameters:

  • doc (Nokogiri::HTML::Document)

Returns:

  • (Array<Hash>)
    • :title_intro [String]

    • :title_main [String]

    • :language [String]

    • :script [String]



76
77
78
79
80
81
82
83
84
# File 'lib/relaton_gb/scrapper.rb', line 76

# Extracts the document titles from the page.
#
# The Chinese title is always returned first. An English title is appended
# only when the English-title cell exists and contains a run of word or
# whitespace characters; a page without that cell yields just the Chinese
# title instead of failing.
#
# The Chinese title cell is still required: if it is not found, the lookup
# result is used without a nil check and a NoMethodError is raised.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>] hashes with :title_main, :title_intro, :language
#   and :script keys
def get_titles(doc)
  titles = [{ title_main: doc.at("//td[contains(text(), '中文标准名称')]/b").text,
              title_intro: nil, language: "zh", script: "Hans" }]
  # The English title is optional, so tolerate a missing cell here
  # (same safe-navigation approach as the status lookup in get_status).
  title_main = doc.at("//td[contains(text(), '英文标准名称')]")&.text&.match(/[\w\s]+/).to_s
  unless title_main.empty?
    titles << { title_main: title_main, title_intro: nil, language: "en", script: "Latn" }
  end
  titles
end

#get_type ⇒ String



86
87
88
# File 'lib/relaton_gb/scrapper.rb', line 86

# Document type reported for scraped documents.
#
# @return [String] always "standard"
def get_type
  "standard"
end

#scrapped_data(doc, src, hit) ⇒ Hash

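Assembles the scraped bibliographic fields of a document into a single Hash. (The source comment for this method is only a `rubocop:disable Metrics/MethodLength` directive.)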

Parameters:

Returns:

  • (Hash)


17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/relaton_gb/scrapper.rb', line 17

# Collects every scraped attribute of a document into one hash.
#
# Each value is produced by the corresponding helper; the language and
# script are fixed to "zh" / "Hans" and the fetch date is today's date.
#
# @param doc the parsed page, passed through to the helper methods
# @param src the source location, stored as the content of the "src" link
# @param hit object providing +docref+ and +status+ for the document
# @return [Hash]
def scrapped_data(doc, src, hit)
  docref = hit.docref
  source_link = { type: "src", content: src }

  {
    fetched: Date.today.to_s,
    committee: get_committee(doc, docref),
    docid: get_docid(docref),
    title: get_titles(doc),
    contributor: get_contributors(doc, docref),
    doctype: get_type,
    docstatus: get_status(doc, hit.status),
    gbtype: get_gbtype(doc, docref),
    ccs: get_ccs(doc),
    ics: get_ics(doc),
    link: [source_link],
    date: get_dates(doc),
    language: ["zh"],
    script: ["Hans"],
    structuredidentifier: fetch_structuredidentifier(docref),
  }
end