Module: RelatonGb::Scrapper
- Included in:
- GbScrapper, SecScrapper, TScrapper
- Defined in:
- lib/relaton_gb/scrapper.rb
Overview
Common scraping methods.
Instance Method Summary
- #fetch_structuredidentifier(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]') ⇒ RelatonIsoBib::StructuredIdentifier
- #get_contributors(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]') ⇒ Object
- #get_docid(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]') ⇒ Array<RelatonBib::DocumentIdentifier>
- #get_status(doc, xpt = ".s-status.label:nth-child(3)") ⇒ RelatonBib::DocumentStatus
-
#get_titles(doc) ⇒ Array<Hash>
-
Returns one hash per title, each with the keys :title_intro [String, nil], :title_main [String], :language [String] and :script [String].
-
- #get_type(_doc) ⇒ Object
-
#scrapped_data(doc, src:) ⇒ Hash
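Collects every scraped attribute of the page into a single hash (the source comment here is only the directive `rubocop:disable Metrics/MethodLength`).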
Instance Method Details
#fetch_structuredidentifier(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]') ⇒ RelatonIsoBib::StructuredIdentifier
49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
# File 'lib/relaton_gb/scrapper.rb', line 49
#
# Builds the structured identifier from the element holding the standard
# number.
#
# When the element is missing, every identifying field is "?". When the
# element exists but its text does not match the expected number pattern,
# the project and part numbers are "?" while the id keeps the raw text.
#
# @param doc [#at] parsed page; only `doc.at(xpt)` is called on it
# @param xpt [String] XPath of the element that holds the standard number
# @return [RelatonIsoBib::StructuredIdentifier]
def fetch_structuredidentifier(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]')
  item_ref = doc.at xpt
  unless item_ref
    return RelatonIsoBib::StructuredIdentifier.new(
      project_number: "?", part_number: "?", prefix: nil, id: "?",
      type: "Chinese Standard"
    )
  end

  ref = item_ref.text
  # Group 1: leading run without dashes/dots that ends in digits.
  # Group 2: digits directly after a dot, or "" when there is no part.
  m = ref.match(/^([^–—.-]*\d+)\.?((?<=\.)\d+|)/)
  # A reference without any digits does not match; fall back to "?" instead
  # of raising NoMethodError on nil.
  project_number = m ? m[1] : "?"
  part_number = m ? m[2] : "?"
  RelatonIsoBib::StructuredIdentifier.new(
    project_number: project_number, part_number: part_number, prefix: nil,
    id: ref, type: "Chinese Standard"
  )
end
#get_contributors(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]') ⇒ Object
66 67 68 69 70 71 72 73 74 75 76 77 |
# File 'lib/relaton_gb/scrapper.rb', line 66
#
# Builds the publisher contributor entry for the standard.
#
# The agency prefix is the first whitespace-free token of the text found at
# +xpt+. For prefixes that do not start with "GB", a trailing "/T" or "/Z" is
# removed. The prefix, together with the scope and mandate returned by
# +get_gbtype+, is passed to GbAgencies to obtain the agency name in English
# and in Chinese.
#
# @param doc [#xpath] parsed page
# @param xpt [String] XPath of the element that holds the standard number
# @return [Array<Hash>] a single hash with :entity (RelatonBib::Organization)
#   and :role ([{ type: "publisher" }])
def get_contributors(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]')
  gb_en = GbAgencies::Agencies.new("en", {}, "")
  gb_zh = GbAgencies::Agencies.new("zh", {}, "")
  name = doc.xpath(xpt).text.match(/^[^\s]+/).to_s
  # Non-destructive #sub: when nothing matched above, nil.to_s is a frozen ""
  # on Ruby 2.7+, and the in-place #sub! would raise FrozenError.
  name = name.sub(%r{/[TZ]$}, "") unless name.start_with?("GB")
  gbtype = get_gbtype(doc)
  entity = RelatonBib::Organization.new name: [
    { language: "en", content: gb_en.standard_agency1(gbtype[:scope], name, gbtype[:mandate]) },
    { language: "zh", content: gb_zh.standard_agency1(gbtype[:scope], name, gbtype[:mandate]) },
  ]
  [{ entity: entity, role: [type: "publisher"] }]
end
#get_docid(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]') ⇒ Array<RelatonBib::DocumentIdentifier>
39 40 41 42 43 44 |
# File 'lib/relaton_gb/scrapper.rb', line 39
#
# Reads the document identifier from the element located by +xpt+.
#
# @param doc [#at] parsed page
# @param xpt [String] XPath of the element that holds the standard number
# @return [Array<RelatonBib::DocumentIdentifier>] one identifier of type
#   "Chinese Standard", or an empty array when the element is absent
def get_docid(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]')
  node = doc.at(xpt)
  if node
    [RelatonBib::DocumentIdentifier.new(id: node.text, type: "Chinese Standard")]
  else
    []
  end
end
#get_status(doc, xpt = ".s-status.label:nth-child(3)") ⇒ RelatonBib::DocumentStatus
102 103 104 105 106 107 108 109 110 111 112 |
# File 'lib/relaton_gb/scrapper.rb', line 102
#
# Maps the status label on the page to a document status stage.
#
# Whitespace is removed from the label before comparison. The stage is nil
# when the label is not one of the three known values or when the element
# located by +xpt+ is absent.
#
# @param doc [#at] parsed page
# @param xpt [String] CSS selector of the status label
# @return [RelatonBib::DocumentStatus]
def get_status(doc, xpt = ".s-status.label:nth-child(3)")
  node = doc.at(xpt)
  # A page without the status label yields a nil stage instead of raising
  # NoMethodError on nil.
  label = node ? node.text.gsub(/\s/, "") : ""
  stage = case label
          when "即将实施" then "published"
          when "现行" then "activated"
          when "废止" then "obsoleted"
          end
  RelatonBib::DocumentStatus.new stage: stage
end
#get_titles(doc) ⇒ Array<Hash>
Returns one hash per title, each with the keys:
- :title_intro [String, nil]
- :title_main [String]
- :language [String]
- :script [String]
85 86 87 88 89 90 91 92 93 |
# File 'lib/relaton_gb/scrapper.rb', line 85
#
# Extracts the titles from the page header.
#
# The text of "div.page-header h4" is always returned as the Chinese title;
# the text of "div.page-header h5" is added as the English title only when
# it is not empty. :title_intro is always nil.
#
# @param doc [#css] parsed page
# @return [Array<Hash>] hashes with :title_main, :title_intro, :language
#   and :script
def get_titles(doc)
  zh_title = {
    title_main: doc.css("div.page-header h4").text,
    title_intro: nil, language: "zh", script: "Hans"
  }
  en_main = doc.css("div.page-header h5").text
  return [zh_title] if en_main.empty?

  [zh_title, { title_main: en_main, title_intro: nil, language: "en", script: "Latn" }]
end
#get_type(_doc) ⇒ Object
95 96 97 |
# File 'lib/relaton_gb/scrapper.rb', line 95
#
# Document type reported for every scraped standard.
#
# @param _doc [Object] ignored
# @return [String] always "international-standard"
def get_type(_doc)
  "international-standard"
end
#scrapped_data(doc, src:) ⇒ Hash
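Collects every scraped attribute of the page into a single hash (the source comment here is only the directive `rubocop:disable Metrics/MethodLength`).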
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 |
# File 'lib/relaton_gb/scrapper.rb', line 16
#
# Gathers all scraped attributes of the page into one hash by calling the
# individual getters in a fixed order.
#
# @param doc [Object] parsed page handed to every getter
# @param src [Object] source link, stored as the content of the single
#   "src" entry under :link
# @return [Hash] keys :committee, :docid, :title, :contributor, :type,
#   :docstatus, :gbtype, :ccs, :ics, :link, :date, :language, :script and
#   :structuredidentifier
def scrapped_data(doc, src:)
  data = {}
  data[:committee] = get_committee(doc)
  data[:docid] = get_docid(doc)
  data[:title] = get_titles(doc)
  data[:contributor] = get_contributors(doc)
  data[:type] = get_type(doc)
  data[:docstatus] = get_status(doc)
  data[:gbtype] = get_gbtype(doc)
  data[:ccs] = get_ccs(doc)
  data[:ics] = get_ics(doc)
  data[:link] = [{ type: "src", content: src }]
  data[:date] = get_dates(doc)
  data[:language] = ["zh"]
  data[:script] = ["Hans"]
  data[:structuredidentifier] = fetch_structuredidentifier(doc)
  data
end