Module: Gbbib::Scrapper
- Included in:
- GbScrapper, SecScrapper, TScrapper
- Defined in:
- lib/gbbib/scrapper.rb
Overview
Common scraping methods.
Instance Method Summary collapse
-
#get_docid(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]') ⇒ Hash
-
Returns a Hash with the keys :project_number [String] and :part_number [String].
-
-
#get_status(doc, xpt = '.s-status.label:nth-child(3)') ⇒ Hash
-
Returns a Hash with the keys :status [String], :stage [String] and :substage [String].
-
-
#get_titles(doc) ⇒ Array<Hash>
-
Returns an Array of Hashes, each with the keys :title_intro [String], :title_main [String], :language [String] and :script [String].
-
- #get_type(_doc) ⇒ Object
-
#scrapped_data(doc, src:) ⇒ Hash
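Collects the scraped fields of a document into a single Hash. (In the source this method carries a `rubocop:disable Metrics/MethodLength` directive.)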
Instance Method Details
#get_docid(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]') ⇒ Hash
37 38 39 40 41 |
# File 'lib/gbbib/scrapper.rb', line 37
#
# Extracts the numeric parts of the document identifier found at +xpt+.
#
# The identifier text is expected to contain whitespace followed by digits,
# optionally followed by "-" and more digits (e.g. "GB/T 20223-2006").
#
# @param doc [#xpath] parsed page; +xpath(xpt)+ must return an object responding to +text+
# @param xpt [String] XPath locating the element that holds the identifier
# @return [Hash]
#   * :project_number [String, nil] digits following the whitespace
#   * :part_number [String, nil] digits after the dash, or '' when there is no dash
#   Both values are nil when no identifier could be recognised.
def get_docid(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]')
  item_ref = doc.xpath(xpt).text.match(/(?<=\s)(\d+)-?((?<=-)\d+|)/)
  # String#match returns nil when the element is missing or its text has an
  # unexpected shape; report empty fields instead of raising NoMethodError.
  return { project_number: nil, part_number: nil } unless item_ref

  { project_number: item_ref[1], part_number: item_ref[2] }
end
#get_status(doc, xpt = '.s-status.label:nth-child(3)') ⇒ Hash
69 70 71 72 73 74 75 76 |
# File 'lib/gbbib/scrapper.rb', line 69
#
# Maps the status label found at +xpt+ to a status keyword.
#
# Whitespace is stripped from the label text before matching:
#   '即将实施' => 'published', '现行' => 'activated', '废止' => 'obsoleted'.
# Any other label, or a missing label element, yields a nil status.
#
# @param doc [#at] parsed page; +at(xpt)+ returns the label node or nil
# @param xpt [String] selector locating the status label
# @return [Hash]
#   * :status [String, nil]
#   * :stage [String] always ''
#   * :substage [String] always ''
def get_status(doc, xpt = '.s-status.label:nth-child(3)')
  node = doc.at(xpt)
  # A page without the status label must not raise NoMethodError on nil;
  # an empty label simply matches none of the known statuses.
  label = node ? node.text.gsub(/\s/, '') : ''
  status = case label
           when '即将实施' then 'published'
           when '现行' then 'activated'
           when '废止' then 'obsoleted'
           end
  { status: status, stage: '', substage: '' }
end
#get_titles(doc) ⇒ Array<Hash>
49 50 51 52 53 54 55 56 57 58 |
# File 'lib/gbbib/scrapper.rb', line 49 def get_titles(doc) titles = [{ title_intro: doc.css('div.page-header h4').text, title_main: '', language: 'zh', script: 'Hans' }] title_intro = doc.css('div.page-header h5').text unless title_intro.empty? titles << { title_intro: title_intro, title_main: '', language: 'en', script: 'Latn' } end titles end |
#get_type(_doc) ⇒ Object
60 61 62 |
# File 'lib/gbbib/scrapper.rb', line 60
#
# Document type reported by this scraper.
#
# @param _doc [Object] unused; accepted so the signature matches the other getters
# @return [String] always 'standard'
def get_type(_doc)
  'standard'
end
#scrapped_data(doc, src:) ⇒ Hash
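Collects the scraped fields of a document into a single Hash. (In the source this method carries a `rubocop:disable Metrics/MethodLength` directive.)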
15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
# File 'lib/gbbib/scrapper.rb', line 15
#
# Assembles every scraped field of a document into one Hash.
#
# Each +get_*+ helper is called once with +doc+, in the order the keys are
# listed below. get_committee, get_gbtype, get_ccs, get_ics and get_dates are
# not defined in this block; they must be available on the receiver.
#
# @param doc [Object] parsed page handed unchanged to every +get_*+ helper
# @param src [Object] stored as the :content of the single 'src' source entry
# @return [Hash] keys :committee, :docid, :titles, :type, :docstatus, :gbtype,
#   :ccs, :ics, :source, :dates, :language (['zh']) and :script (['Hans'])
def scrapped_data(doc, src:)
  source_entry = { type: 'src', content: src }

  {
    committee: get_committee(doc),
    docid: get_docid(doc),
    titles: get_titles(doc),
    type: get_type(doc),
    docstatus: get_status(doc),
    gbtype: get_gbtype(doc),
    ccs: get_ccs(doc),
    ics: get_ics(doc),
    source: [source_entry],
    dates: get_dates(doc),
    language: ['zh'],
    script: ['Hans']
  }
end