Class: Ietf::Data::Importer::Scrapers::IrtfScraper

Inherits: BaseScraper

Inheritance hierarchy: Object → BaseScraper → Ietf::Data::Importer::Scrapers::IrtfScraper

show all

Defined in: lib/ietf/data/importer/scrapers/irtf_scraper.rb

Overview

Scraper for IRTF groups from irtf.org

Constant Summary collapse

BASE_URL = "https://www.irtf.org/groups.html"

Base URL for IRTF website

Instance Method Summary collapse

#extract_from_dropdown(doc) ⇒ Array<Ietf::Data::Importer::Group>

Extract groups from the dropdown menu.
#fetch ⇒ Array<Ietf::Data::Importer::Group>

Fetch all IRTF groups.

Methods inherited from BaseScraper

#fetch_html, #log

Instance Method Details

#extract_from_dropdown(doc) ⇒ `Array<Ietf::Data::Importer::Group>`

Extract groups from the dropdown menu

# File 'lib/ietf/data/importer/scrapers/irtf_scraper.rb', line 87

# Extract research groups from the "Research Groups" dropdown menu.
#
# Finds the `a.dropdown-toggle` element whose text contains
# "Research Groups", looks up `.dropdown-menu` under that element's parent,
# and builds one Group per `a.dropdown-item` link. Links with no href, or
# whose href does not end in "<word>.html", are skipped. If looking up or
# building a single group raises, the error is logged and that link is
# skipped so the remaining links are still processed.
#
# @param doc [#css] parsed page to search (presumably a Nokogiri document —
#   confirm against fetch_html)
# @return [Array<Ietf::Data::Importer::Group>] groups built from the menu;
#   empty when no matching toggle or menu is found
def extract_from_dropdown(doc)
  # Look for the dropdown toggle that labels the research groups menu
  toggle = doc.css('a.dropdown-toggle').find do |el|
    el.text.include?('Research Groups')
  end
  return [] unless toggle

  # The menu itself is looked up under the toggle's parent element
  dropdown_menu = toggle.parent.css('.dropdown-menu')
  return [] unless dropdown_menu.any?

  log "Found dropdown menu with research groups", 1

  groups = []

  dropdown_menu.css('a.dropdown-item').each do |link|
    href = link['href']
    next unless href

    # Extract abbreviation from href (e.g., cfrg.html -> CFRG);
    # skip the link if we can't determine one
    page_match = href.match(/(\w+)\.html$/)
    next unless page_match

    abbreviation = page_match[1].upcase
    name = link.text.strip

    # Only hrefs carrying a real http(s) scheme are already absolute. A bare
    # prefix test on "http" would misclassify relative pages such as
    # "httpfoo.html" and leave them without the site prefix.
    details_url =
      if href =~ %r{\Ahttps?://}i
        href
      elsif href.start_with?('/')
        "https://www.irtf.org#{href}"
      else
        "https://www.irtf.org/#{href}"
      end

    begin
      details = fetch_group_details(details_url)

      groups << Importer::Group.new(
        abbreviation: abbreviation,
        name: name,
        organization: 'irtf',
        type: 'rg',
        area: nil,
        status: 'active', # Assume active since it's in the menu
        description: nil, # Not set from the menu entry
        chairs: details[:chairs],
        mailing_list: details[:mailing_list],
        mailing_list_archive: details[:mailing_list_archive],
        website_url: details_url,
        charter_url: details[:charter_url],
        concluded_date: details[:concluded_date]
      )
    rescue => e
      # Best effort: one failing group must not abort the whole menu
      log "Error fetching details for #{abbreviation} (#{details_url}): #{e.message}", 2
    end
  end

  groups
end

#fetch ⇒ `Array<Ietf::Data::Importer::Group>`

Fetch all IRTF groups

# File 'lib/ietf/data/importer/scrapers/irtf_scraper.rb', line 17

# Fetch all IRTF groups.
#
# Loads the page at BASE_URL and tries, in order, until one strategy yields
# groups: the dropdown menu, the "Active"/"Concluded" sections, a set of
# alternative section titles, and finally every list that contains links.
# Any error raised along the way is logged and whatever was collected up to
# that point is returned.
#
# @return [Array<Ietf::Data::Importer::Group>] the groups found; empty when
#   the page could not be loaded or nothing matched
def fetch
  collected = []
  log "Fetching IRTF groups..."

  begin
    page = fetch_html(BASE_URL)
    return [] unless page

    # Strategy 1: the dropdown menu
    from_menu = extract_from_dropdown(page)
    if from_menu.any?
      log "Found #{from_menu.size} groups in dropdown menu", 1
      return collected.concat(from_menu)
    end

    # Strategy 2: the traditional sections; log the headings first to help
    # debug the page structure
    heading_text = page.css('h3').map { |heading| heading.text }.join(', ')
    log "Found headings on IRTF page: #{heading_text}", 1

    active = extract_groups(page, 'Active Research Groups', 'active')
    log "Found #{active.size} active IRTF groups", 1

    concluded = extract_groups(page, 'Concluded Research Groups', 'concluded')
    log "Found #{concluded.size} concluded IRTF groups", 1

    collected.concat(active).concat(concluded)
    return collected unless collected.empty?

    # Strategy 3: alternative section titles
    log "No groups found with standard selectors, trying alternatives...", 1

    ['Current Research Groups', 'Research Groups', 'IRTF Groups'].each do |title|
      titled = extract_groups(page, title, 'active')
      next unless titled.any?

      log "Found #{titled.size} groups with section title: #{title}", 1
      collected.concat(titled)
    end
    return collected unless collected.empty?

    # Strategy 4: any unordered list that contains links
    log "Using generic list item selector...", 1

    page.css('ul').each do |list|
      next unless list.css('li a').any?

      from_list = extract_groups_from_list(list, 'active')
      next unless from_list.any?

      log "Found #{from_list.size} groups using generic list selector", 1
      collected.concat(from_list)
    end
  rescue => e
    log "Error fetching IRTF groups: #{e.message}", 1
  end

  collected
end

Class: Ietf::Data::Importer::Scrapers::IrtfScraper

Overview

Constant Summary collapse

Instance Method Summary collapse

Methods inherited from BaseScraper

Instance Method Details

#extract_from_dropdown(doc) ⇒ Array<Ietf::Data::Importer::Group>

#fetch ⇒ Array<Ietf::Data::Importer::Group>

#extract_from_dropdown(doc) ⇒ `Array<Ietf::Data::Importer::Group>`

#fetch ⇒ `Array<Ietf::Data::Importer::Group>`