Module: RelatonGb::Scrapper
- Included in:
- GbScrapper, SecScrapper, TScrapper
- Defined in:
- lib/relaton_gb/scrapper.rb
Overview
Common scraping methods.
Instance Method Summary
- #fetch_structuredidentifier(docref) ⇒ RelatonIsoBib::StructuredIdentifier
- #get_contributors(doc, docref) ⇒ Array<Hash>
- #get_docid(docref) ⇒ Array<RelatonBib::DocumentIdentifier>
- #get_status(doc, status = nil) ⇒ RelatonBib::DocumentStatus
- #get_titles(doc) ⇒ Array<RelatonBib::TypedTitleString>
- #get_type ⇒ String
- #org(lang, name, gbtype) ⇒ Hash
- #scrapped_data(doc, src, hit) ⇒ Hash
rubocop:disable Metrics/MethodLength.
Instance Method Details
#fetch_structuredidentifier(docref) ⇒ RelatonIsoBib::StructuredIdentifier
46 47 48 49 50 51 52 |
# File 'lib/relaton_gb/scrapper.rb', line 46
#
# Builds the structured identifier for a document reference.
#
# The project number is the leading part of the reference up to and
# including its first run of digits (stopping at the first ".", "-", en dash
# or em dash). The part number is the run of digits that directly follows a
# "." after the project number, or an empty string when there is none.
#
# @param docref [String] document reference, e.g. "GB/T 20223.1-2006"
# @return [RelatonIsoBib::StructuredIdentifier]
def fetch_structuredidentifier(docref)
  m = docref.match(/^([^–—.-]*\d+)\.?((?<=\.)\d+|)/)
  # A reference without a leading digit run does not match at all; fall back
  # to nil numbers instead of failing with NoMethodError on a nil MatchData.
  project_number, part_number = m&.captures
  RelatonIsoBib::StructuredIdentifier.new(
    project_number: project_number,
    part_number: part_number,
    prefix: nil,
    id: docref,
    type: "Chinese Standard"
  )
end
#get_contributors(doc, docref) ⇒ Array<Hash>
57 58 59 60 61 62 63 64 65 66 |
# File 'lib/relaton_gb/scrapper.rb', line 57
#
# Builds the contributor list (a single publisher organization) for a
# document reference.
#
# The agency code is the leading non-whitespace token of the reference; a
# trailing "/T" or "/Z" is stripped from it unless the code starts with "GB".
# The organization name is looked up in English and Chinese via #org.
#
# @param doc [Object] scraped page, passed through to #get_gbtype
# @param docref [String] document reference
# @return [Array<Hash>] one hash with :entity and :role, or an empty array
#   when no organization name could be resolved
def get_contributors(doc, docref)
  name = docref.match(/^[^\s]+/).to_s
  # Non-mutating sub: when nothing matched, name comes from nil.to_s, which is
  # a frozen string on Ruby >= 2.7 and must not be modified in place.
  name = name.sub(%r{/[TZ]$}, "") unless name.start_with?("GB")
  gbtype = get_gbtype(doc, docref)
  orgs = %w[en zh].map { |lang| org(lang, name, gbtype) }.compact
  return [] if orgs.empty?

  entity = RelatonBib::Organization.new name: orgs
  [{ entity: entity, role: [{ type: "publisher" }] }]
end
#get_docid(docref) ⇒ Array<RelatonBib::DocumentIdentifier>
40 41 42 |
# File 'lib/relaton_gb/scrapper.rb', line 40
#
# Wraps a document reference in a one-element list of document identifiers
# of type "Chinese Standard".
#
# @param docref [String] document reference
# @return [Array<RelatonBib::DocumentIdentifier>]
def get_docid(docref)
  identifier = RelatonBib::DocumentIdentifier.new(type: "Chinese Standard", id: docref)
  [identifier]
end
#get_status(doc, status = nil) ⇒ RelatonBib::DocumentStatus
98 99 100 101 102 103 104 105 |
# File 'lib/relaton_gb/scrapper.rb', line 98
#
# Maps a Chinese status label to a document status.
#
# The label is the explicit +status+ argument when given; otherwise it is
# read from the <span> inside the table cell containing "标准状态" on the
# scraped page. Labels other than the three known ones (and a missing label)
# produce a status whose stage is nil.
#
# @param doc [Object] scraped page; only queried when +status+ is nil
# @param status [String, nil] status label already known to the caller
# @return [RelatonBib::DocumentStatus]
def get_status(doc, status = nil)
  label = status || doc.at("//td[contains(., '标准状态')]/span")&.text
  stage_by_label = {
    "即将实施" => "published", # about to take effect
    "现行" => "activated",     # currently in force
    "废止" => "obsoleted",     # abolished
  }
  RelatonBib::DocumentStatus.new(stage: stage_by_label[label])
end
#get_titles(doc) ⇒ Array<RelatonBib::TypedTitleString>
82 83 84 85 86 87 88 89 |
# File 'lib/relaton_gb/scrapper.rb', line 82
#
# Extracts the document titles from the scraped page.
#
# The Chinese title is the text of the <b> element inside the table cell
# labelled "中文标准名称". The English title is the first run of word
# characters and whitespace in the cell labelled "英文标准名称"; it is
# optional and only appended when present and non-empty.
#
# @param doc [Object] scraped page responding to #at(xpath)
# @return [Array<RelatonBib::TypedTitleString>]
def get_titles(doc)
  tzh = doc.at("//td[contains(text(), '中文标准名称')]/b").text
  titles = RelatonBib::TypedTitleString.from_string tzh, "zh", "Hans"

  # The English-title cell may be missing altogether; treat that the same as
  # an empty English title instead of calling #text on nil.
  en_cell = doc.at("//td[contains(text(), '英文标准名称')]")
  ten = en_cell ? en_cell.text.match(/[\w\s]+/).to_s : ""
  return titles if ten.empty?

  titles + RelatonBib::TypedTitleString.from_string(ten, "en", "Latn")
end
#get_type ⇒ String
91 92 93 |
# File 'lib/relaton_gb/scrapper.rb', line 91
#
# Document type reported for every scraped document.
#
# @return [String] always "standard"
def get_type
  "standard"
end
#org(lang, name, gbtype) ⇒ Hash
72 73 74 75 76 77 78 |
# File 'lib/relaton_gb/scrapper.rb', line 72
#
# Looks up the issuing agency name in one language.
#
# @param lang [String] language code handed to GbAgencies::Agencies
# @param name [String] agency code taken from the document reference
# @param gbtype [Hash] hash read for its :scope and :mandate entries
# @return [Hash, nil] { language:, content: } or nil when the lookup
#   returns nothing
def org(lang, name, gbtype)
  agencies = GbAgencies::Agencies.new(lang, {}, "")
  content = agencies.standard_agency1(gbtype[:scope], name, gbtype[:mandate])
  { language: lang, content: content } if content
end
#scrapped_data(doc, src, hit) ⇒ Hash
rubocop:disable Metrics/MethodLength
17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
# File 'lib/relaton_gb/scrapper.rb', line 17
#
# Assembles the attribute hash describing one scraped document.
#
# Most values come from helper methods of the including scrapper
# (get_committee, get_gbtype, get_ccs, get_ics, get_dates are not defined in
# this section). Language and script are fixed to "zh" / "Hans", and the
# fetch date is today's date.
#
# @param doc [Object] scraped page handed to the helper methods
# @param src [String] source URL, stored as the "src" link
# @param hit [Object] search hit responding to #docref and #status
# @return [Hash]
def scrapped_data(doc, src, hit)
  docref = hit.docref
  {
    fetched: Date.today.to_s,
    committee: get_committee(doc, docref),
    docid: get_docid(docref),
    title: get_titles(doc),
    contributor: get_contributors(doc, docref),
    doctype: get_type,
    docstatus: get_status(doc, hit.status),
    gbtype: get_gbtype(doc, docref),
    ccs: get_ccs(doc),
    ics: get_ics(doc),
    link: [{ type: "src", content: src }],
    date: get_dates(doc),
    language: ["zh"],
    script: ["Hans"],
    structuredidentifier: fetch_structuredidentifier(docref),
  }
end