Module: Gbbib::Scrapper

Included in:
GbScrapper, SecScrapper, TScrapper
Defined in:
lib/gbbib/scrapper.rb

Overview

Common scraping methods.

Instance Method Summary collapse

Instance Method Details

#get_docid(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]') ⇒ Hash



37
38
39
40
41
# File 'lib/gbbib/scrapper.rb', line 37

# Extracts the project and part numbers from the document identifier text.
#
# The text of the nodes selected by +xpt+ is searched for the first run of
# digits that is preceded by whitespace (the project number), optionally
# followed by a hyphen and a second run of digits (the part number).
#
# @param doc [#xpath] parsed page; presumably a Nokogiri document — confirm
# @param xpt [String] XPath expression locating the identifier text
# @return [Hash] +:project_number+ and +:part_number+ as strings;
#   +:part_number+ is an empty string when no hyphenated suffix is present,
#   and both values are nil when the text contains no identifier at all
def get_docid(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]')
  item_ref = doc.xpath(xpt)
                .text.match(/(?<=\s)(\d+)-?((?<=-)\d+|)/)
  # String#match returns nil when nothing matches; indexing nil would raise
  # NoMethodError, so report "not found" explicitly instead of crashing.
  return { project_number: nil, part_number: nil } unless item_ref

  { project_number: item_ref[1], part_number: item_ref[2] }
end

#get_status(doc, xpt = '.s-status.label:nth-child(3)') ⇒ Hash



69
70
71
72
73
74
75
76
# File 'lib/gbbib/scrapper.rb', line 69

# Maps the status label shown on the page to a document status.
#
# All whitespace is stripped from the label text before it is compared with
# the known labels. A label that is not recognised, or a page on which the
# label element is missing, yields a nil +:status+.
#
# @param doc [#at] parsed page; presumably a Nokogiri document — confirm
# @param xpt [String] selector locating the status label element
# @return [Hash] +:status+ ('published', 'activated', 'obsoleted' or nil)
#   plus +:stage+ and +:substage+, which are always empty strings
def get_status(doc, xpt = '.s-status.label:nth-child(3)')
  label_node = doc.at(xpt)
  # The lookup may find nothing; treat that like an unrecognised label
  # rather than calling #text on nil.
  label = label_node ? label_node.text.gsub(/\s/, '') : ''
  status = case label
           when '即将实施' then 'published' # "about to be implemented"
           when '现行' then 'activated'     # "currently in force"
           when '废止' then 'obsoleted'     # "abolished"
           end
  { status: status, stage: '', substage: '' }
end

#get_titles(doc) ⇒ Array<Hash>



49
50
51
52
53
54
55
56
57
58
# File 'lib/gbbib/scrapper.rb', line 49

# Collects the titles found in the page header.
#
# The text of the +h4+ heading always produces a 'zh'/'Hans' entry, even when
# it is empty; the text of the +h5+ heading produces an additional 'en'/'Latn'
# entry only when it is non-empty. In both entries the heading text is stored
# under +:title_intro+ and +:title_main+ is left empty.
#
# @param doc [#css] parsed page; presumably a Nokogiri document — confirm
# @return [Array<Hash>] one hash per title with +:title_intro+,
#   +:title_main+, +:language+ and +:script+ keys
def get_titles(doc)
  zh_text = doc.css('div.page-header h4').text
  en_text = doc.css('div.page-header h5').text

  zh_title = {
    title_intro: zh_text, title_main: '', language: 'zh', script: 'Hans'
  }
  return [zh_title] if en_text.empty?

  en_title = {
    title_intro: en_text, title_main: '', language: 'en', script: 'Latn'
  }
  [zh_title, en_title]
end

#get_type(_doc) ⇒ Object



60
61
62
# File 'lib/gbbib/scrapper.rb', line 60

# Reports the document type.
#
# The argument is ignored; the result is always the same string.
#
# @param _doc [Object] unused
# @return [String] always 'standard'
def get_type(_doc)
  document_type = 'standard'
  document_type
end

#scrapped_data(doc, src:) ⇒ Hash

Collects the scraped fields of a document page into a single hash. (rubocop:disable Metrics/MethodLength)



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# File 'lib/gbbib/scrapper.rb', line 15

# Gathers every scraped field of a document page into one hash.
#
# Each +get_*+ helper is called once with +doc+, in the order the keys are
# listed below. The +:language+ and +:script+ entries are fixed values and
# +:source+ wraps +src+ in a single 'src' entry.
#
# @param doc [Object] parsed page handed unchanged to every +get_*+ helper
# @param src [Object] stored as the content of the single source entry;
#   presumably the page URL — confirm against callers
# @return [Hash] keys +:committee+, +:docid+, +:titles+, +:type+,
#   +:docstatus+, +:gbtype+, +:ccs+, +:ics+, +:source+, +:dates+,
#   +:language+ and +:script+
def scrapped_data(doc, src:)
  data = {}
  data[:committee] = get_committee(doc)
  data[:docid] = get_docid(doc)
  data[:titles] = get_titles(doc)
  data[:type] = get_type(doc)
  data[:docstatus] = get_status(doc)
  data[:gbtype] = get_gbtype(doc)
  data[:ccs] = get_ccs(doc)
  data[:ics] = get_ics(doc)
  data[:source] = [{ type: 'src', content: src }]
  data[:dates] = get_dates(doc)
  data[:language] = ['zh']
  data[:script] = ['Hans']
  data
end