Module: RubyCrawl::Browser::Extraction

Defined in: lib/rubycrawl/browser/extraction.rb

Overview

JavaScript extraction constants, evaluated inside Chromium via page.evaluate(). Ported verbatim from node/src/index.js — logic is unchanged. NOISE_SELECTORS is interpolated directly into EXTRACT_CONTENT_JS (no need to pass as a JS argument as the Node version did).

Constant Summary

# JavaScript snippet that collects page metadata and returns it as one object:
# the document title, selected <meta> values (description, keywords, author,
# Open Graph and Twitter card fields, looked up by either the name or the
# property attribute), the canonical <link> href, the document language and
# the character set. Missing or empty values are reported as null;
# description falls back to og:description.
#
# All constants are IIFEs — Ferrum's page.evaluate() evaluates an expression,
# it does NOT call function definitions. Wrapping as (() => { … })() ensures
# the function is immediately invoked and its return value is captured.
#
# A raw heredoc is used so the JavaScript needs no Ruby escaping and nothing
# in it (template literals, ${name}) is interpreted by Ruby.
EXTRACT_METADATA_JS = <<~'JS'
  (() => {
    const getMeta = (name) => {
      const meta = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
      return meta?.getAttribute("content") || null;
    };
    const getLink = (rel) => {
      const link = document.querySelector(`link[rel="${rel}"]`);
      return link?.getAttribute("href") || null;
    };
    return {
      title:               document.title || null,
      description:         getMeta("description") || getMeta("og:description") || null,
      keywords:            getMeta("keywords"),
      author:              getMeta("author"),
      og_title:            getMeta("og:title"),
      og_description:      getMeta("og:description"),
      og_image:            getMeta("og:image"),
      og_url:              getMeta("og:url"),
      og_type:             getMeta("og:type"),
      twitter_card:        getMeta("twitter:card"),
      twitter_title:       getMeta("twitter:title"),
      twitter_description: getMeta("twitter:description"),
      twitter_image:       getMeta("twitter:image"),
      canonical:           getLink("canonical"),
      lang:                document.documentElement.lang || null,
      charset:             document.characterSet || null,
    };
  })()
JS

# JavaScript snippet (an immediately-invoked arrow function) that maps every
# a[href] element in the document to an object with:
#   url   - the element's href property
#   text  - its textContent, trimmed ("" when there is none)
#   title - the title attribute, or null when absent or empty
#   rel   - the rel attribute, or null when absent or empty
# Written as a raw heredoc so the JavaScript reads without Ruby escaping.
EXTRACT_LINKS_JS = <<~'JS'
  (() => Array.from(document.querySelectorAll("a[href]")).map(link => ({
    url:   link.href,
    text:  (link.textContent || "").trim(),
    title: link.getAttribute("title") || null,
    rel:   link.getAttribute("rel")   || null,
  })))()
JS

# JavaScript snippet (an immediately-invoked arrow function) that returns the
# trimmed innerText of document.body. The optional chaining and the || ""
# fallback make it return an empty string when the document has no body or
# the body has no text.
EXTRACT_RAW_TEXT_JS = <<~'JS'
  (() => (document.body?.innerText || "").trim())()
JS

# Semantic noise selectors — covers standard HTML5 elements and ARIA roles.
# The groups below are flattened and joined with ", " into a single frozen
# CSS selector list, which is interpolated directly into EXTRACT_CONTENT_JS
# as a string literal.
NOISE_SELECTORS = [
  # HTML5 sectioning elements that wrap navigation and page chrome
  %w[nav header footer aside],
  # ARIA roles, each rendered as an attribute selector: [role="…"]
  %w[navigation banner contentinfo complementary dialog tooltip alert]
    .map { |role| %([role="#{role}"]) },
  # Elements explicitly marked as hidden from assistive technology
  ['[aria-hidden="true"]'],
  # Non-content elements
  %w[script style noscript iframe]
].flatten.join(', ').freeze

# Removes semantic noise (nav/header/footer/aside + ARIA roles) and high
# link-density containers, then returns the cleaned HTML as { cleanHtml }.
# DOM mutations are reversed after extraction so the page is unchanged.
#
# How the snippet works:
#   1. Every element under <body> matching NOISE_SELECTORS is detached; its
#      parent and next sibling are recorded so it can be put back.
#   2. Direct children of <body> — and of body's only child when body has
#      exactly one child that is not script/style/noscript/link/meta — are
#      detached when they hold at least 20 characters of text and more than
#      half of that text sits inside <a> elements.
#   3. document.body.innerHTML is captured as cleanHtml.
#   4. Detached elements are re-inserted in reverse removal order.
#
# NOISE_SELECTORS is interpolated by Ruby when this constant is built, so the
# "#{...}" below must NOT be escaped: an escaped interpolation would leave the
# literal text in the script and make it a JavaScript syntax error. to_json
# renders the selector list as a quoted JavaScript string literal.
# NOTE(review): String#to_json comes from the json standard library — confirm
# it is required before this file is loaded.
EXTRACT_CONTENT_JS = <<~JS.freeze
  (() => {
    const noiseSelectors = #{NOISE_SELECTORS.to_json};
    function linkDensity(el) {
      const total = (el.innerText || "").trim().length;
      if (!total) return 1;
      const linked = Array.from(el.querySelectorAll("a"))
        .reduce((sum, a) => sum + (a.innerText || "").trim().length, 0);
      return linked / total;
    }
    const removed = [];
    function stash(el) {
      if (el.parentNode) {
        removed.push({ el, parent: el.parentNode, next: el.nextSibling });
        el.parentNode.removeChild(el);
      }
    }
    document.body.querySelectorAll(noiseSelectors).forEach(stash);
    const blockTags = new Set(["script", "style", "noscript", "link", "meta"]);
    const topChildren = Array.from(document.body.children)
      .filter(el => !blockTags.has(el.tagName.toLowerCase()));
    const roots = topChildren.length === 1
      ? [document.body, topChildren[0]] : [document.body];
    for (const root of roots) {
      for (const el of Array.from(root.children)) {
        const text = (el.innerText || "").trim();
        if (text.length >= 20 && linkDensity(el) > 0.5) stash(el);
      }
    }
    const cleanHtml = document.body.innerHTML;
    removed.reverse().forEach(({ el, parent, next }) => parent.insertBefore(el, next));
    return { cleanHtml };
  })()
JS