Module: RelatonGb::GbScrapper

Extended by:
Scrapper
Defined in:
lib/relaton_gb/gb_scrapper.rb

Overview

National standard scrapper.

Constant Summary collapse

# Search endpoint; scrape_page appends the query to it as "?p.p2=<escaped text>".
SEARCH_URL =
"https://openstd.samr.gov.cn/bzgk/gb/std_list"
# URL prefix of a document page; scrape_doc appends a hit's pid to it.
# NOTE(review): this uses plain http while SEARCH_URL uses https on the same
# host - confirm that is intentional.
DOC_URL =
"http://openstd.samr.gov.cn/bzgk/gb/newGbInfo?hcno="

Constants included from Scrapper

Scrapper::STAGES

Class Method Summary collapse

Methods included from Scrapper

fetch_structuredidentifier, get_contributors, get_docid, get_status, get_titles, get_type, org, scrapped_data

Class Method Details

.agentObject



34
35
36
# File 'lib/relaton_gb/gb_scrapper.rb', line 34

# Returns the shared Mechanize client, creating it on first use.
#
# @return [Mechanize] the memoized agent stored in @agent
def agent
  return @agent if @agent

  @agent = Mechanize.new
end

.get_committee(doc, _ref) ⇒ Hash

Returns a Hash with the keys:

  • :type [String]
  • :name [String]

Parameters:

  • doc (Nokogiri::HTML)
  • _ref (String)

Returns:

  • (Hash)
    • :type [String]

    • :name [String]



53
54
55
56
# File 'lib/relaton_gb/gb_scrapper.rb', line 53

# Extracts the committee entry from a standard's page.
#
# @param doc [Nokogiri::HTML] parsed document page
# @param _ref [String] unused
# @return [Hash]
#   * :type [String] always "technical"
#   * :name [String] text of the matched div with CR, LF and tab characters
#     removed, or "" when no matching div is found
def get_committee(doc, _ref)
  # The value is the div that follows the div labelled "归口单位" or "归口部门".
  node = doc.at("//div[contains(., '归口单位') or contains(., '归口部门')]/following-sibling::div")
  # `at` returns nil when nothing matches; fall back to an empty name instead
  # of raising NoMethodError on nil.
  name = node ? node.text.delete("\r\n\t\t") : ""
  { type: "technical", name: name }
end

.scrape_doc(hit) ⇒ RelatonGb::GbBibliographicItem

Parameters:

Returns:



40
41
42
43
44
45
46
# File 'lib/relaton_gb/gb_scrapper.rb', line 40

# Fetches the page of a single hit and builds a bibliographic item from it.
#
# @param hit [#pid] search hit; its pid is appended to DOC_URL
# @return [RelatonGb::GbBibliographicItem]
# @raise [RelatonBib::RequestError] when the agent raises Mechanize::Error
def scrape_doc(hit)
  url = DOC_URL + hit.pid
  page = agent.get(url)
  attrs = scrapped_data(page, url, hit)
  GbBibliographicItem.new(**attrs)
rescue Mechanize::Error => e
  # Re-raise as the library's own request error, keeping the message.
  raise RelatonBib::RequestError, e.message
end

.scrape_page(text) ⇒ RelatonGb::HitCollection

Parameters:

  • text (String)

    code of the standard to search for

Returns:



19
20
21
22
23
24
25
26
27
28
29
30
31
32
# File 'lib/relaton_gb/gb_scrapper.rb', line 19

# Runs a search for the given text and wraps the result rows as hits.
#
# @param text [String] search text; CGI-escaped into the "p.p2" query parameter
# @return [RelatonGb::HitCollection] hits ordered by release_date, descending
# @raise [RelatonBib::RequestError] when the agent raises Mechanize::Error
def scrape_page(text) # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
  page = agent.get("#{SEARCH_URL}?p.p2=#{CGI.escape(text)}")
  rows = page.xpath("//table[contains(@class, 'result_list')]/tbody[2]/tr")
  hits = rows.map do |row|
    link = row.at("./td[2]/a")
    # The pid is the first run of digits / uppercase A-F in the onclick handler.
    pid = link[:onclick].match(/[0-9A-F]+/).to_s
    released = row.at("./td[7]").text
    Hit.new(pid: pid, docref: link.text, scrapper: self, release_date: released)
  end
  HitCollection.new(hits.sort_by(&:release_date).reverse)
rescue Mechanize::Error => e
  # Re-raise as the library's own request error, keeping the message.
  raise RelatonBib::RequestError, e.message
end