Module: RubyCrawl::Browser::Extraction

Defined in:
lib/rubycrawl/browser/extraction.rb

Overview

JavaScript extraction constants, evaluated inside Chromium via page.evaluate(). All constants are IIFEs — Ferrum’s page.evaluate() evaluates an expression; it does NOT call function definitions. Wrapping as (() => { … })() ensures the function is immediately invoked and its return value is captured.

Constant Summary collapse

EXTRACT_METADATA_JS =
"(() => {\n  const getMeta = (name) => {\n    const meta = document.querySelector(`meta[name=\"${name}\"], meta[property=\"${name}\"]`);\n    return meta?.getAttribute(\"content\") || null;\n  };\n  const getLink = (rel) => {\n    const link = document.querySelector(`link[rel=\"${rel}\"]`);\n    return link?.getAttribute(\"href\") || null;\n  };\n  return {\n    title:               document.title || null,\n    description:         getMeta(\"description\") || getMeta(\"og:description\") || null,\n    keywords:            getMeta(\"keywords\"),\n    author:              getMeta(\"author\"),\n    og_title:            getMeta(\"og:title\"),\n    og_description:      getMeta(\"og:description\"),\n    og_image:            getMeta(\"og:image\"),\n    og_url:              getMeta(\"og:url\"),\n    og_type:             getMeta(\"og:type\"),\n    twitter_card:        getMeta(\"twitter:card\"),\n    twitter_title:       getMeta(\"twitter:title\"),\n    twitter_description: getMeta(\"twitter:description\"),\n    twitter_image:       getMeta(\"twitter:image\"),\n    canonical:           getLink(\"canonical\"),\n    lang:                document.documentElement.lang || null,\n    charset:             document.characterSet || null,\n  };\n})()\n"
EXTRACT_LINKS_JS =
"(() => Array.from(document.querySelectorAll(\"a[href]\")).map(link => ({\n  url:   link.href,\n  text:  (link.textContent || \"\").trim(),\n  title: link.getAttribute(\"title\") || null,\n  rel:   link.getAttribute(\"rel\")   || null,\n})))()\n"
EXTRACT_RAW_TEXT_JS =
"(() => (document.body?.innerText || \"\").trim())()\n"
NOISE_SELECTORS =

Semantic noise selectors — used by the heuristic fallback.

[
  'nav', 'header', 'footer', 'aside',
  '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]',
  '[role="complementary"]', '[role="dialog"]', '[role="tooltip"]',
  '[role="alert"]', '[aria-hidden="true"]',
  'script', 'style', 'noscript', 'iframe'
].join(', ').freeze
READABILITY_JS =

Mozilla Readability.js v0.6.0 — vendored source, read once at load time. Embedded inside EXTRACT_CONTENT_JS’s outer IIFE so Readability is defined and used within the same Runtime.evaluate expression (Ferrum evaluates a single expression — separate evaluate calls have separate scopes).

File.read(File.join(__dir__, 'readability.js')).freeze
EXTRACT_CONTENT_JS =

Extracts clean article HTML using Mozilla Readability (primary) with a link-density heuristic as fallback when Readability returns no content. Everything is wrapped in one outer IIFE so page.evaluate gets a single expression and Readability is in scope for the extraction logic. DOM mutations from the fallback path are reversed after extraction.

"(() => {\n  // Mozilla Readability.js v0.6.0 \u2014 defined in this IIFE's scope.\n  #{READABILITY_JS}\n\n  // Primary: Mozilla Readability \u2014 article-quality extraction.\n  let readabilityDebug = null;\n  try {\n    const docClone = document.cloneNode(true);\n    const reader = new Readability(docClone, { charThreshold: 100 });\n    const article = reader.parse();\n    if (article && article.textContent && article.textContent.trim().length > 200) {\n      return { cleanHtml: article.content, extractor: \"readability\" };\n    }\n    readabilityDebug = article ? `returned ${article.textContent?.trim().length ?? 0} text chars (below threshold)` : \"returned null (no article detected)\";\n  } catch (e) {\n    readabilityDebug = `error: ${e.message}`;\n  }\n\n  // Fallback: link-density heuristic (works on nav-heavy / non-article pages).\n  const noiseSelectors = #{NOISE_SELECTORS.to_json};\n  function linkDensity(el) {\n    const total = (el.innerText || \"\").trim().length;\n    if (!total) return 1;\n    const linked = Array.from(el.querySelectorAll(\"a\"))\n      .reduce((sum, a) => sum + (a.innerText || \"\").trim().length, 0);\n    return linked / total;\n  }\n  const removed = [];\n  function stash(el) {\n    if (el.parentNode) {\n      removed.push({ el, parent: el.parentNode, next: el.nextSibling });\n      el.parentNode.removeChild(el);\n    }\n  }\n  document.body.querySelectorAll(noiseSelectors).forEach(stash);\n  const blockTags = new Set([\"script\", \"style\", \"noscript\", \"link\", \"meta\"]);\n  const topChildren = Array.from(document.body.children)\n    .filter(el => !blockTags.has(el.tagName.toLowerCase()));\n  const roots = topChildren.length === 1\n    ? 
[document.body, topChildren[0]] : [document.body];\n  for (const root of roots) {\n    for (const el of Array.from(root.children)) {\n      const text = (el.innerText || \"\").trim();\n      if (text.length >= 20 && linkDensity(el) > 0.5) stash(el);\n    }\n  }\n  const cleanHtml = document.body.innerHTML;\n  removed.reverse().forEach(({ el, parent, next }) => parent.insertBefore(el, next));\n  return { cleanHtml, extractor: \"heuristic\", debug: readabilityDebug };\n})()\n".freeze