Module: RelatonGb::Scrapper
- Included in:
- GbScrapper, SecScrapper, TScrapper
- Defined in:
- lib/relaton_gb/scrapper.rb
Overview
Common scraping methods.
Instance Method Summary collapse
- #fetch_structuredidentifier(docref) ⇒ RelatonIsoBib::StructuredIdentifier
- #get_contributors(doc, docref) ⇒ Array<Hash>
- #get_docid(docref) ⇒ Array<RelatonBib::DocumentIdentifier>
- #get_status(doc, status = nil) ⇒ RelatonBib::DocumentStatus
- #get_titles(doc) ⇒ Array<Hash> — each hash has the keys :title_intro [String], :title_main [String], :language [String] and :script [String].
- #get_type ⇒ Object
- #scrapped_data(doc, src, hit) ⇒ Hash — no description available (the only source comment is the linter directive "rubocop:disable Metrics/MethodLength").
Instance Method Details
#fetch_structuredidentifier(docref) ⇒ RelatonIsoBib::StructuredIdentifier
46 47 48 49 50 51 52 |
# File 'lib/relaton_gb/scrapper.rb', line 46
#
# Splits a document reference into a project number and a part number and
# wraps both in a structured identifier of type "Chinese Standard".
#
# The project number is a run of characters at the start of a line that
# contains no dot, hyphen, en dash or em dash and ends in one or more digits.
# The part number is the digits that follow a dot placed directly after the
# project number, or an empty string when there is no such part.
#
# @param docref [String] document reference
# @return [RelatonIsoBib::StructuredIdentifier]
# @raise [NoMethodError] when +docref+ does not match the pattern
def fetch_structuredidentifier(docref)
  parts = docref.match(/^([^–—.-]*\d+)\.?((?<=\.)\d+|)/)
  attrs = {
    project_number: parts[1],
    part_number: parts[2],
    prefix: nil,
    id: docref,
    type: "Chinese Standard",
  }
  RelatonIsoBib::StructuredIdentifier.new(**attrs)
end
#get_contributors(doc, docref) ⇒ Array<Hash>
57 58 59 60 61 62 63 64 65 66 67 68 |
# File 'lib/relaton_gb/scrapper.rb', line 57
#
# Builds the publisher contributor entry for a standard.
#
# The agency prefix is the first whitespace-free token of +docref+. Unless it
# starts with "GB", a trailing "/T" or "/Z" is removed from it. The agency
# name is then looked up in English and in Chinese using the scope and
# mandate returned by +get_gbtype+ (defined outside this section).
#
# @param doc [Object] parsed page, passed through to +get_gbtype+
# @param docref [String] document reference
# @return [Array<Hash>] a single hash with :entity (RelatonBib::Organization)
#   and :role ([{ type: "publisher" }])
def get_contributors(doc, docref)
  name = docref.match(/^[^\s]+/).to_s
  # Non-mutating +sub+: when nothing matched, +nil.to_s+ is a frozen string on
  # current Rubies and +sub!+ would raise FrozenError.
  name = name.sub(%r{/[TZ]$}, "") unless name.start_with?("GB")
  gbtype = get_gbtype(doc, docref)
  localized_names = %w[en zh].map do |lang|
    agencies = GbAgencies::Agencies.new(lang, {}, "")
    {
      language: lang,
      content: agencies.standard_agency1(gbtype[:scope], name, gbtype[:mandate]),
    }
  end
  entity = RelatonBib::Organization.new name: localized_names
  [{ entity: entity, role: [{ type: "publisher" }] }]
end
#get_docid(docref) ⇒ Array<RelatonBib::DocumentIdentifier>
40 41 42 |
# File 'lib/relaton_gb/scrapper.rb', line 40
#
# Wraps a document reference in a one-element list of document identifiers.
#
# @param docref [String] document reference, used verbatim as the id
# @return [Array<RelatonBib::DocumentIdentifier>] a single identifier of
#   type "Chinese Standard"
def get_docid(docref)
  identifier = RelatonBib::DocumentIdentifier.new(
    id: docref,
    type: "Chinese Standard",
  )
  [identifier]
end
#get_status(doc, status = nil) ⇒ RelatonBib::DocumentStatus
93 94 95 96 97 98 99 100 |
# File 'lib/relaton_gb/scrapper.rb', line 93
#
# Maps the Chinese status label of a standard to a document stage.
#
# The label is +status+ when given; otherwise it is read from the page (the
# text of the span inside the table cell containing '标准状态'). Labels other
# than the three known ones, or a missing label, give a +nil+ stage.
#
# @param doc [#at] parsed page; only queried when +status+ is nil or false
# @param status [String, nil] status label that takes precedence over the page
# @return [RelatonBib::DocumentStatus]
def get_status(doc, status = nil)
  label = status || doc.at("//td[contains(., '标准状态')]/span")&.text
  stage_by_label = {
    "即将实施" => "published",
    "现行" => "activated",
    "废止" => "obsoleted",
  }
  RelatonBib::DocumentStatus.new stage: stage_by_label[label]
end
#get_titles(doc) ⇒ Array<Hash>
Returns an array of hashes, each with the keys:
- :title_intro [String]
- :title_main [String]
- :language [String]
- :script [String]
76 77 78 79 80 81 82 83 84 |
# File 'lib/relaton_gb/scrapper.rb', line 76
#
# Extracts the Chinese title and, when present, the English title of a
# standard from its page.
#
# The Chinese title is the text of the <b> element inside the table cell
# whose text contains '中文标准名称'; it is required. The English title is
# taken from the cell whose text contains '英文标准名称' and is optional: a
# missing cell, or a cell with no ASCII word characters, adds no entry.
#
# @param doc [#at] parsed page; +at(xpath)+ must return a node responding to
#   +text+, or nil when nothing matches
# @return [Array<Hash>] one hash per title with the keys :title_main [String],
#   :title_intro (always nil here), :language [String] and :script [String]
# @raise [NoMethodError] when the Chinese title cell is missing
def get_titles(doc)
  titles = [{
    title_main: doc.at("//td[contains(text(), '中文标准名称')]/b").text,
    title_intro: nil, language: "zh", script: "Hans"
  }]

  en_cell = doc.at("//td[contains(text(), '英文标准名称')]")
  en_text = en_cell ? en_cell.text : ""
  # \w and \s are ASCII-only in Ruby, so the Chinese label is skipped. The
  # match must start on a word character so that whitespace before the label
  # is not mistaken for the title.
  title_main = en_text[/\w[\w\s]*/].to_s.strip
  unless title_main.empty?
    titles << { title_main: title_main, title_intro: nil, language: "en", script: "Latn" }
  end
  titles
end
#get_type ⇒ Object
86 87 88 |
# File 'lib/relaton_gb/scrapper.rb', line 86
#
# Document type reported for every scraped item.
#
# @return [String] always "standard"
def get_type
  "standard"
end
#scrapped_data(doc, src, hit) ⇒ Hash
rubocop:disable Metrics/MethodLength
17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
# File 'lib/relaton_gb/scrapper.rb', line 17
#
# Collects everything scraped for one search hit into a single hash.
#
# The helpers +get_committee+, +get_gbtype+, +get_ccs+, +get_ics+ and
# +get_dates+ are not defined in this section; presumably they are supplied
# by the including scrapper or another part of the library.
#
# @param doc [Object] parsed page of the standard
# @param src [String] page URL, stored as the "src" link
# @param hit [#docref, #status] search hit the page belongs to
# @return [Hash] keys, in order: :fetched, :committee, :docid, :title,
#   :contributor, :doctype, :docstatus, :gbtype, :ccs, :ics, :link, :date,
#   :language, :script, :structuredidentifier
def scrapped_data(doc, src, hit)
  data = {}
  data[:fetched] = Date.today.to_s
  data[:committee] = get_committee(doc, hit.docref)
  data[:docid] = get_docid(hit.docref)
  data[:title] = get_titles(doc)
  data[:contributor] = get_contributors(doc, hit.docref)
  data[:doctype] = get_type
  data[:docstatus] = get_status(doc, hit.status)
  data[:gbtype] = get_gbtype(doc, hit.docref)
  data[:ccs] = get_ccs(doc)
  data[:ics] = get_ics(doc)
  data[:link] = [{ type: "src", content: src }]
  data[:date] = get_dates(doc)
  data[:language] = ["zh"]
  data[:script] = ["Hans"]
  data[:structuredidentifier] = fetch_structuredidentifier(hit.docref)
  data
end