Module: RelatonGb::Scrapper
- Included in:
- GbScrapper, SecScrapper, TScrapper
- Defined in:
- lib/relaton_gb/scrapper.rb
Overview
Common scraping methods.
Constant Summary collapse
- STAGES =
{ "即将实施" => "published", "现行" => "activated", "废止" => "obsoleted", "被代替" => "replaced" }.freeze
Instance Method Summary collapse
- #fetch_structuredidentifier(docref) ⇒ RelatonIsoBib::StructuredIdentifier
- #get_contributors(doc, docref) ⇒ Array<Hash>
- #get_docid(docref) ⇒ Array<RelatonBib::DocumentIdentifier>
- #get_status(doc, status = nil) ⇒ RelatonBib::DocumentStatus
- #get_titles(doc) ⇒ Array<RelatonBib::TypedTitleString>
- #get_type ⇒ Object
- #org(lang, name, gbtype) ⇒ Hash
-
#scrapped_data(doc, src, hit) ⇒ Hash
Assembles the Hash of scraped document attributes (source comment: rubocop:disable Metrics/MethodLength).
Instance Method Details
#fetch_structuredidentifier(docref) ⇒ RelatonIsoBib::StructuredIdentifier
51 52 53 54 55 56 57 |
# File 'lib/relaton_gb/scrapper.rb', line 51

# Builds the structured identifier for a document reference.
#
# The reference is split with a regular expression: the first capture
# (everything up to and including the first digit run, stopping at a dot,
# hyphen or dash) becomes the project number; a digit run that directly
# follows a dot becomes the part number, which is an empty string when the
# reference has no such part.
#
# @param docref [String] document reference
# @return [RelatonIsoBib::StructuredIdentifier]
# @raise [NoMethodError] if the reference does not match the pattern
def fetch_structuredidentifier(docref)
  project, part = docref.match(/^([^–—.-]*\d+)\.?((?<=\.)\d+|)/).captures
  RelatonIsoBib::StructuredIdentifier.new(
    type: "Chinese Standard",
    id: docref,
    prefix: nil,
    project_number: project,
    part_number: part
  )
end
#get_contributors(doc, docref) ⇒ Array<Hash>
62 63 64 65 66 67 68 69 70 71 |
# File 'lib/relaton_gb/scrapper.rb', line 62

# Builds the publisher contributor entry for a document.
#
# The agency prefix is the leading non-whitespace token of the reference.
# Unless that token starts with "GB", a trailing "/T" or "/Z" is removed
# before the agency names are looked up through #org for English and Chinese.
#
# @param doc [Object] scraped page, passed through to get_gbtype
# @param docref [String] document reference
# @return [Array<Hash>] a single `{ entity:, role: }` entry, or an empty
#   array when no agency name could be resolved
def get_contributors(doc, docref)
  prefix = docref[/^[^\s]+/].to_s
  # Non-mutating `sub`: when the reference has no leading token the prefix
  # comes from `nil.to_s`, which is a frozen string on Ruby 2.7+ and would
  # make `sub!` raise FrozenError.
  prefix = prefix.sub(%r{/[TZ]$}, "") unless prefix.start_with?("GB")
  gbtype = get_gbtype(doc, docref)
  names = %w[en zh].map { |lang| org(lang, prefix, gbtype) }.compact
  return [] if names.empty?

  publisher = RelatonBib::Organization.new(name: names)
  [{ entity: publisher, role: [{ type: "publisher" }] }]
end
#get_docid(docref) ⇒ Array<RelatonBib::DocumentIdentifier>
45 46 47 |
# File 'lib/relaton_gb/scrapper.rb', line 45

# Wraps the document reference in a primary "Chinese Standard" identifier.
#
# @param docref [String] document reference
# @return [Array<RelatonBib::DocumentIdentifier>] one-element array
def get_docid(docref)
  identifier = RelatonBib::DocumentIdentifier.new(
    type: "Chinese Standard", primary: true, id: docref
  )
  [identifier]
end
#get_status(doc, status = nil) ⇒ RelatonBib::DocumentStatus
103 104 105 106 |
# File 'lib/relaton_gb/scrapper.rb', line 103

# Builds the document status from a Chinese status label.
#
# When no label is supplied it is read from the page cell whose text
# contains '标准状态'. The label is translated through STAGES; a label that
# is missing or not listed there results in a nil stage.
#
# @param doc [Object] scraped page responding to `at(xpath)`
# @param status [String, nil] status label, if already known
# @return [RelatonBib::DocumentStatus]
def get_status(doc, status = nil)
  label = status || doc.at("//td[contains(., '标准状态')]/span")&.text
  RelatonBib::DocumentStatus.new(stage: STAGES[label])
end
#get_titles(doc) ⇒ Array<RelatonBib::TypedTitleString>
87 88 89 90 91 92 93 94 |
# File 'lib/relaton_gb/scrapper.rb', line 87

# Extracts the document titles from the scraped page.
#
# The Chinese title is mandatory and read from the bold element of the cell
# whose text contains '中文标准名称'. The English title is optional: it is
# the first run of ASCII word characters and whitespace in the cell whose
# text contains '英文标准名称'. When that cell is absent or yields no such
# run, only the Chinese titles are returned.
#
# @param doc [Object] scraped page responding to `at(xpath)`
# @return [Array<RelatonBib::TypedTitleString>] whatever
#   `TypedTitleString.from_string` builds, Chinese first, then English
# @raise [NoMethodError] if the Chinese title element is not found
def get_titles(doc)
  zh_title = doc.at("//td[contains(text(), '中文标准名称')]/b").text
  titles = RelatonBib::TypedTitleString.from_string(zh_title, "zh", "Hans")

  # A missing English cell is treated like an empty English title instead of
  # raising on nil.
  en_title = doc.at("//td[contains(text(), '英文标准名称')]")&.text.to_s[/[\w\s]+/].to_s
  return titles if en_title.empty?

  titles + RelatonBib::TypedTitleString.from_string(en_title, "en", "Latn")
end
#get_type ⇒ Object
96 97 98 |
# File 'lib/relaton_gb/scrapper.rb', line 96

# Document type reported for every scraped item.
#
# @return [String] always "standard"
def get_type
  "standard"
end
#org(lang, name, gbtype) ⇒ Hash
77 78 79 80 81 82 83 |
# File 'lib/relaton_gb/scrapper.rb', line 77

# Looks up the issuing agency name in one language.
#
# @param lang [String] language code handed to GbAgencies::Agencies
# @param name [String] agency prefix taken from the document reference
# @param gbtype [Hash] hash providing the :scope and :mandate values
# @return [Hash, nil] `{ language:, content: }`, or nil when the agency
#   lookup returns nothing
def org(lang, name, gbtype)
  agency_name = GbAgencies::Agencies.new(lang, {}, "")
                                    .standard_agency1(gbtype[:scope], name, gbtype[:mandate])
  { language: lang, content: agency_name } if agency_name
end
#scrapped_data(doc, src, hit) ⇒ Hash
Assembles the Hash of scraped document attributes (source comment: rubocop:disable Metrics/MethodLength).
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
# File 'lib/relaton_gb/scrapper.rb', line 22

# Gathers every scraped attribute of a document into one Hash.
#
# Each value is delegated to the matching helper; the fetch date is today's
# date, and language/script are fixed to "zh"/"Hans".
#
# @param doc [Object] scraped page handed to the helper methods
# @param src [String] source URL stored as the "src" link
# @param hit [Object] search hit responding to `docref` and `status`
# @return [Hash] attribute hash keyed by symbol
def scrapped_data(doc, src, hit)
  ref = hit.docref
  source_link = { type: "src", content: src }

  {
    fetched: Date.today.to_s,
    committee: get_committee(doc, ref),
    docid: get_docid(ref),
    title: get_titles(doc),
    contributor: get_contributors(doc, ref),
    doctype: get_type,
    docstatus: get_status(doc, hit.status),
    gbtype: get_gbtype(doc, ref),
    ccs: get_ccs(doc),
    ics: get_ics(doc),
    link: [source_link],
    date: get_dates(doc),
    language: ["zh"],
    script: ["Hans"],
    structuredidentifier: fetch_structuredidentifier(ref),
  }
end