Module: RelatonGb::Scrapper

Included in:: GbScrapper, SecScrapper, TScrapper

Defined in:: lib/relaton_gb/scrapper.rb

Overview

Common scraping methods.

Instance Method Summary collapse

#fetch_structuredidentifier(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]') ⇒ RelatonIsoBib::StructuredIdentifier
#get_contributors(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]') ⇒ Object
#get_docid(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]') ⇒ Array<RelatonBib::DocumentIdentifier>
#get_status(doc, xpt = ".s-status.label:nth-child(3)") ⇒ RelatonBib::DocumentStatus
#get_titles(doc) ⇒ Array<Hash>
Returns one hash per title, each with the keys :title_intro [String, nil], :title_main [String], :language [String] and :script [String].
#get_type(_doc) ⇒ Object
#scrapped_data(doc, src:) ⇒ Hash

Collects every scraped field of a standard's page into a single Hash.

Instance Method Details

#fetch_structuredidentifier(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]') ⇒ `RelatonIsoBib::StructuredIdentifier`

Parameters:

doc (Nokogiri::HTML::Document)
xpt (String) (defaults to: '//dt[text()="标准号"]/following-sibling::dd[1]')

Returns:

(RelatonIsoBib::StructuredIdentifier)

# File 'lib/relaton_gb/scrapper.rb', line 49

# Builds the structured identifier of a standard from the node that holds
# its reference number.
#
# When the node is missing, or its text does not contain a parsable
# project number, the project and part numbers fall back to the "?"
# placeholder instead of raising on a nil match.
#
# @param doc [Nokogiri::HTML::Document] parsed page of the standard
# @param xpt [String] XPath of the node holding the reference number
# @return [RelatonIsoBib::StructuredIdentifier]
def fetch_structuredidentifier(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]')
  item_ref = doc.at xpt
  id = item_ref ? item_ref.text : "?"

  # Group 1: leading run of characters containing no dash or dot and ending
  # in digits (the project number). Group 2: the digits that directly follow
  # a dot (the part number), or an empty string when there is no part.
  m = item_ref && id.match(/^([^–—.-]*\d+)\.?((?<=\.)\d+|)/)
  project_number, part_number = m ? [m[1], m[2]] : ["?", "?"]

  # prefix = doc.xpath(xpt).text.match(/^[^\s]+/).to_s
  RelatonIsoBib::StructuredIdentifier.new(
    project_number: project_number, part_number: part_number, prefix: nil,
    id: id, type: "Chinese Standard"
  )
end

#get_contributors(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]') ⇒ `Object`

# File 'lib/relaton_gb/scrapper.rb', line 66

def get_contributors(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]')
  gb_en = GbAgencies::Agencies.new("en", {}, "")
  gb_zh = GbAgencies::Agencies.new("zh", {}, "")
  name = doc.xpath(xpt).text.match(/^[^\s]+/).to_s
  name.sub!(%r{/[TZ]$}, "") unless name =~ /^GB/
  gbtype = get_gbtype(doc)
  entity = RelatonBib::Organization.new name: [
    { language: "en", content: gb_en.standard_agency1(gbtype[:scope], name, gbtype[:mandate]) },
    { language: "zh", content: gb_zh.standard_agency1(gbtype[:scope], name, gbtype[:mandate]) },
  ]
  [{ entity: entity, role: [type: "publisher"] }]
end

#get_docid(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]') ⇒ `Array<RelatonBib::DocumentIdentifier>`

Parameters:

doc (Nokogiri::HTML::Document)
xpt (String) (defaults to: '//dt[text()="标准号"]/following-sibling::dd[1]')

Returns:

(Array<RelatonBib::DocumentIdentifier>)

# File 'lib/relaton_gb/scrapper.rb', line 39

# Extracts the document identifier of a standard.
#
# @param doc [Nokogiri::HTML::Document] parsed page of the standard
# @param xpt [String] XPath of the node holding the reference number
# @return [Array<RelatonBib::DocumentIdentifier>] one "Chinese Standard"
#   identifier, or an empty array when no node matches +xpt+
def get_docid(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]')
  node = doc.at(xpt)
  node ? [RelatonBib::DocumentIdentifier.new(id: node.text, type: "Chinese Standard")] : []
end

#get_status(doc, xpt = ".s-status.label:nth-child(3)") ⇒ `RelatonBib::DocumentStatus`

Parameters:

doc (Nokogiri::HTML::Document)
xpt (String) (defaults to: ".s-status.label:nth-child(3)")

Returns:

(RelatonBib::DocumentStatus)

# File 'lib/relaton_gb/scrapper.rb', line 102

# Translates the status label shown on the page into a document status.
#
# Whitespace is stripped from the label before lookup. A label that is not
# in the table yields a status whose stage is nil.
#
# @param doc [Nokogiri::HTML::Document] parsed page of the standard
# @param xpt [String] CSS selector of the status label
# @return [RelatonBib::DocumentStatus]
def get_status(doc, xpt = ".s-status.label:nth-child(3)")
  stage_by_label = {
    "即将实施" => "published",
    "现行" => "activated",
    "废止" => "obsoleted",
  }
  label = doc.at(xpt).text.gsub(/\s/, "")
  RelatonBib::DocumentStatus.new(stage: stage_by_label[label])
end

#get_titles(doc) ⇒ `Array<Hash>`

Returns an array of hashes, each with the following keys:

:title_intro [String, nil]
:title_main [String]
:language [String]
:script [String]

Parameters:

doc (Nokogiri::HTML::Document)

Returns:

(Array<Hash>) —
- :title_intro [String]
- :title_main [String]
- :language [String]
- :script [String]

# File 'lib/relaton_gb/scrapper.rb', line 85

# Reads the titles from the page header.
#
# The text of the header's h4 is always returned as the Chinese title; the
# text of the h5 is added as the English title only when it is not empty.
#
# @param doc [Nokogiri::HTML::Document] parsed page of the standard
# @return [Array<Hash>] hashes with :title_main, :title_intro (always nil),
#   :language and :script
def get_titles(doc)
  zh_title = doc.css("div.page-header h4").text
  en_title = doc.css("div.page-header h5").text

  zh_entry = { title_main: zh_title, title_intro: nil, language: "zh", script: "Hans" }
  return [zh_entry] if en_title.empty?

  [zh_entry, { title_main: en_title, title_intro: nil, language: "en", script: "Latn" }]
end

#get_type(_doc) ⇒ `Object`



95
96
97

# File 'lib/relaton_gb/scrapper.rb', line 95

# Returns the document type.
#
# The argument is ignored: this implementation returns the same constant
# for every document.
#
# @param _doc [Object] unused
# @return [String] always "international-standard"
def get_type(_doc)
  "international-standard"
end

#scrapped_data(doc, src:) ⇒ `Hash`

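Collects every scraped field of a standard's page into a single Hash, together with the source link and the fixed language ("zh") and script ("Hans").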

Parameters:

doc (Nokogiri::HTML::Document)
src (String) —

url of scrapped page

Returns:

(Hash)

# File 'lib/relaton_gb/scrapper.rb', line 16

# Assembles all scraped attributes of a standard into one hash.
#
# Each field is produced by its own helper, called in the order the keys
# appear below. The source link, language and script are fixed values
# rather than scraped ones.
#
# @param doc [Nokogiri::HTML::Document] parsed page of the standard
# @param src [String] url of the scraped page
# @return [Hash]
def scrapped_data(doc, src:)
  data = {}
  data[:committee] = get_committee(doc)
  data[:docid] = get_docid(doc)
  data[:title] = get_titles(doc)
  data[:contributor] = get_contributors(doc)
  data[:type] = get_type(doc)
  data[:docstatus] = get_status(doc)
  data[:gbtype] = get_gbtype(doc)
  data[:ccs] = get_ccs(doc)
  data[:ics] = get_ics(doc)
  data[:link] = [{ type: "src", content: src }]
  data[:date] = get_dates(doc)
  data[:language] = ["zh"]
  data[:script] = ["Hans"]
  data[:structuredidentifier] = fetch_structuredidentifier(doc)
  data
end

Module: RelatonGb::Scrapper

Overview

Instance Method Summary collapse

Instance Method Details

#fetch_structuredidentifier(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]') ⇒ RelatonIsoBib::StructuredIdentifier

#get_contributors(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]') ⇒ Object

#get_docid(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]') ⇒ Array<RelatonBib::DocumentIdentifier>

#get_status(doc, xpt = ".s-status.label:nth-child(3)") ⇒ RelatonBib::DocumentStatus

#get_titles(doc) ⇒ Array<Hash>

#get_type(_doc) ⇒ Object

#scrapped_data(doc, src:) ⇒ Hash

#fetch_structuredidentifier(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]') ⇒ `RelatonIsoBib::StructuredIdentifier`

#get_contributors(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]') ⇒ `Object`

#get_docid(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]') ⇒ `Array<RelatonBib::DocumentIdentifier>`

#get_status(doc, xpt = ".s-status.label:nth-child(3)") ⇒ `RelatonBib::DocumentStatus`

#get_titles(doc) ⇒ `Array<Hash>`

#get_type(_doc) ⇒ `Object`

#scrapped_data(doc, src:) ⇒ `Hash`