Module: RelatonGb::Scrapper

Included in:
GbScrapper, SecScrapper, TScrapper
Defined in:
lib/relaton_gb/scrapper.rb

Overview

Common scraping methods.

Instance Method Summary collapse

Instance Method Details

#fetch_structuredidentifier(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]') ⇒ RelatonIsoBib::StructuredIdentifier

Parameters:

  • doc (Nokogiri::HTML::Document)
  • xpt (String) (defaults to: '//dt[text()="标准号"]/following-sibling::dd[1]')

Returns:

  • (RelatonIsoBib::StructuredIdentifier)


49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# File 'lib/relaton_gb/scrapper.rb', line 49

# Builds the structured identifier from the element selected by +xpt+.
#
# When the element is missing, every textual field falls back to "?". When the
# element exists but its text does not match the number pattern, only the
# project and part numbers fall back to "?" (previously this case raised
# NoMethodError on the failed match); the raw text is still used as the id.
#
# @param doc [Nokogiri::HTML::Document]
# @param xpt [String] XPath of the element holding the standard number
# @return [RelatonIsoBib::StructuredIdentifier]
def fetch_structuredidentifier(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]')
  ref_text = doc.at(xpt)&.text
  # Capture 1: leading run without dashes/dots that ends in digits (project
  # number). Capture 2: digits right after a dot (part number), or "" if none.
  m = ref_text&.match(/^([^–—.-]*\d+)\.?((?<=\.)\d+|)/)
  RelatonIsoBib::StructuredIdentifier.new(
    project_number: m ? m[1] : "?", part_number: m ? m[2] : "?", prefix: nil,
    id: ref_text || "?", type: "Chinese Standard"
  )
end

#get_contributors(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]') ⇒ Object



66
67
68
69
70
71
72
73
74
75
76
77
# File 'lib/relaton_gb/scrapper.rb', line 66

# Builds the publisher contributor entry for the scraped standard.
#
# The first whitespace-free token of the text selected by +xpt+ is used as the
# standard prefix. For prefixes that do not start with "GB", a trailing "/T"
# or "/Z" is dropped before the agency lookup.
#
# The prefix is cleaned with the non-mutating +sub+: when nothing matches, the
# prefix is +nil.to_s+, which is a frozen string on Ruby >= 2.7, and +sub!+
# would raise FrozenError on it.
#
# @param doc [Nokogiri::HTML::Document] presumably the scraped page; must
#   respond to +xpath+
# @param xpt [String] XPath of the element holding the standard number
# @return [Array<Hash>] a single hash with :entity (RelatonBib::Organization)
#   and :role (["publisher"])
def get_contributors(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]')
  gb_en = GbAgencies::Agencies.new("en", {}, "")
  gb_zh = GbAgencies::Agencies.new("zh", {}, "")
  name = doc.xpath(xpt).text.match(/^[^\s]+/).to_s
  name = name.sub(%r{/[TZ]$}, "") unless name.start_with?("GB")
  gbtype = get_gbtype(doc)
  names = { "en" => gb_en, "zh" => gb_zh }.map do |language, agencies|
    { language: language,
      content: agencies.standard_agency1(gbtype[:scope], name, gbtype[:mandate]) }
  end
  entity = RelatonBib::Organization.new name: names
  [{ entity: entity, role: ["publisher"] }]
end

#get_docid(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]') ⇒ Array<RelatonBib::DocumentIdentifier>

Parameters:

  • doc (Nokogiri::HTML::Document)
  • xpt (String) (defaults to: '//dt[text()="标准号"]/following-sibling::dd[1]')

Returns:

  • (Array<RelatonBib::DocumentIdentifier>)


39
40
41
42
43
44
# File 'lib/relaton_gb/scrapper.rb', line 39

# Extracts the document identifier from the element selected by +xpt+.
#
# @param doc [Nokogiri::HTML::Document]
# @param xpt [String] XPath of the element holding the standard number
# @return [Array<RelatonBib::DocumentIdentifier>] one "Chinese Standard"
#   identifier, or an empty array when the element is not found
def get_docid(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]')
  ref_node = doc.at(xpt)
  if ref_node
    [RelatonBib::DocumentIdentifier.new(id: ref_node.text, type: "Chinese Standard")]
  else
    []
  end
end

#get_status(doc, xpt = ".s-status.label:nth-child(3)") ⇒ RelatonBib::DocumentStatus

Parameters:

  • doc (Nokogiri::HTML::Document)
  • xpt (String) (defaults to: ".s-status.label:nth-child(3)")

Returns:

  • (RelatonBib::DocumentStatus)


102
103
104
105
106
107
108
109
110
111
112
# File 'lib/relaton_gb/scrapper.rb', line 102

# Maps the status label of the page to a document status.
#
# Whitespace is stripped from the label before it is compared. A missing
# label element is treated like an unrecognised label: the stage passed to
# the status is nil (previously a missing element raised NoMethodError).
#
# @param doc [Nokogiri::HTML::Document]
# @param xpt [String] CSS selector of the status label
# @return [RelatonBib::DocumentStatus]
def get_status(doc, xpt = ".s-status.label:nth-child(3)")
  label = doc.at(xpt)&.text.to_s.gsub(/\s/, "")
  stage = case label
          when "即将实施" then "published" # "about to be implemented"
          when "现行" then "activated"     # "current"
          when "废止" then "obsoleted"     # "abolished"
          end
  RelatonBib::DocumentStatus.new stage: stage
end

#get_titles(doc) ⇒ Array<Hash>

Returns an array of title hashes. Each hash has the keys:

  • :title_intro [String]

  • :title_main [String]

  • :language [String]

  • :script [String].

Parameters:

  • doc (Nokogiri::HTML::Document)

Returns:

  • (Array<Hash>)
    • :title_intro [String]

    • :title_main [String]

    • :language [String]

    • :script [String]



85
86
87
88
89
90
91
92
93
# File 'lib/relaton_gb/scrapper.rb', line 85

def get_titles(doc)
  titles = [{ title_main: doc.css("div.page-header h4").text, title_intro: nil,
              language: "zh", script: "Hans" }]
  title_main = doc.css("div.page-header h5").text
  unless title_main.empty?
    titles << { title_main: title_main, title_intro: nil, language: "en", script: "Latn" }
  end
  titles
end

#get_type(_doc) ⇒ Object



95
96
97
# File 'lib/relaton_gb/scrapper.rb', line 95

# Document type reported for scraped standards.
#
# @param _doc [Object] ignored here; presumably the scraped page, accepted so
#   the signature matches the other scraping helpers
# @return [String] always "international-standard"
def get_type(_doc)
  "international-standard"
end

#scrapped_data(doc, src:) ⇒ Hash

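Collects the data scraped from a document page into a single hash. (The source disables RuboCop's Metrics/MethodLength check for this method.)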

Parameters:

  • doc (Nokogiri::HTML::Document)
  • src (String)

    URL of the scraped page

Returns:

  • (Hash)


16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# File 'lib/relaton_gb/scrapper.rb', line 16

# Gathers every scraped field of a document page into one hash.
#
# The helpers are called in the same order as the keys appear in the result.
#
# @param doc [Nokogiri::HTML::Document]
# @param src [String] URL of the scraped page, stored as the "src" link
# @return [Hash] keys :committee, :docid, :title, :contributor, :type,
#   :docstatus, :gbtype, :ccs, :ics, :link, :date, :language, :script and
#   :structuredidentifier
def scrapped_data(doc, src:)
  description = {
    committee: get_committee(doc),
    docid: get_docid(doc),
    title: get_titles(doc),
    contributor: get_contributors(doc),
    type: get_type(doc),
    docstatus: get_status(doc),
  }
  classification = {
    gbtype: get_gbtype(doc),
    ccs: get_ccs(doc),
    ics: get_ics(doc),
  }
  provenance = {
    link: [{ type: "src", content: src }],
    date: get_dates(doc),
    language: ["zh"],
    script: ["Hans"],
    structuredidentifier: fetch_structuredidentifier(doc),
  }
  description.merge(classification).merge(provenance)
end