Module: RubyCrawl::Browser::Extraction
- Defined in:
- lib/rubycrawl/browser/extraction.rb
Overview
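JavaScript extraction constants, evaluated inside Chromium via page.evaluate(). The EXTRACT_*_JS constants are IIFEs because Ferrum’s page.evaluate() evaluates an expression; it does NOT call function definitions. Wrapping the code as (() => { … })() ensures the function is immediately invoked and its return value is captured. NOISE_SELECTORS and READABILITY_JS are not standalone expressions; they are supporting strings interpolated into EXTRACT_CONTENT_JS.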
Constant Summary
- EXTRACT_METADATA_JS =
"(() => {\n const getMeta = (name) => {\n const meta = document.querySelector(`meta[name=\"${name}\"], meta[property=\"${name}\"]`);\n return meta?.getAttribute(\"content\") || null;\n };\n const getLink = (rel) => {\n const link = document.querySelector(`link[rel=\"${rel}\"]`);\n return link?.getAttribute(\"href\") || null;\n };\n return {\n title: document.title || null,\n description: getMeta(\"description\") || getMeta(\"og:description\") || null,\n keywords: getMeta(\"keywords\"),\n author: getMeta(\"author\"),\n og_title: getMeta(\"og:title\"),\n og_description: getMeta(\"og:description\"),\n og_image: getMeta(\"og:image\"),\n og_url: getMeta(\"og:url\"),\n og_type: getMeta(\"og:type\"),\n twitter_card: getMeta(\"twitter:card\"),\n twitter_title: getMeta(\"twitter:title\"),\n twitter_description: getMeta(\"twitter:description\"),\n twitter_image: getMeta(\"twitter:image\"),\n canonical: getLink(\"canonical\"),\n lang: document.documentElement.lang || null,\n charset: document.characterSet || null,\n };\n})()\n"
- EXTRACT_LINKS_JS =
"(() => Array.from(document.querySelectorAll(\"a[href]\")).map(link => ({\n url: link.href,\n text: (link.textContent || \"\").trim(),\n title: link.getAttribute(\"title\") || null,\n rel: link.getAttribute(\"rel\") || null,\n})))()\n"
- EXTRACT_RAW_TEXT_JS =
"(() => (document.body?.innerText || \"\").trim())()\n"
- NOISE_SELECTORS =
Semantic noise selectors — used by the heuristic fallback.
[ 'nav', 'header', 'footer', 'aside', '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]', '[role="complementary"]', '[role="dialog"]', '[role="tooltip"]', '[role="alert"]', '[aria-hidden="true"]', 'script', 'style', 'noscript', 'iframe' ].join(', ').freeze
- READABILITY_JS =
Mozilla Readability.js v0.6.0 — vendored source, read once at load time. Embedded inside EXTRACT_CONTENT_JS’s outer IIFE so Readability is defined and used within the same Runtime.evaluate expression (Ferrum evaluates a single expression — separate evaluate calls have separate scopes).
File.read(File.join(__dir__, 'readability.js')).freeze
- EXTRACT_CONTENT_JS =
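Extracts clean article HTML using Mozilla Readability (primary, run on a clone of the document) with a link-density heuristic as fallback. The fallback runs when Readability throws, detects no article, or returns an article whose trimmed text is 200 characters or fewer; in that case the result also carries a debug string describing why Readability was not used. Everything is wrapped in one outer IIFE so page.evaluate gets a single expression and Readability is in scope for the extraction logic. DOM mutations from the fallback path are reversed after extraction.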
"(() => {\n // Mozilla Readability.js v0.6.0 \u2014 defined in this IIFE's scope.\n \#{READABILITY_JS}\n\n // Primary: Mozilla Readability \u2014 article-quality extraction.\n let readabilityDebug = null;\n try {\n const docClone = document.cloneNode(true);\n const reader = new Readability(docClone, { charThreshold: 100 });\n const article = reader.parse();\n if (article && article.textContent && article.textContent.trim().length > 200) {\n return { cleanHtml: article.content, extractor: \"readability\" };\n }\n readabilityDebug = article ? `returned ${article.textContent?.trim().length ?? 0} text chars (below threshold)` : \"returned null (no article detected)\";\n } catch (e) {\n readabilityDebug = `error: ${e.message}`;\n }\n\n // Fallback: link-density heuristic (works on nav-heavy / non-article pages).\n const noiseSelectors = \#{NOISE_SELECTORS.to_json};\n function linkDensity(el) {\n const total = (el.innerText || \"\").trim().length;\n if (!total) return 1;\n const linked = Array.from(el.querySelectorAll(\"a\"))\n .reduce((sum, a) => sum + (a.innerText || \"\").trim().length, 0);\n return linked / total;\n }\n const removed = [];\n function stash(el) {\n if (el.parentNode) {\n removed.push({ el, parent: el.parentNode, next: el.nextSibling });\n el.parentNode.removeChild(el);\n }\n }\n document.body.querySelectorAll(noiseSelectors).forEach(stash);\n const blockTags = new Set([\"script\", \"style\", \"noscript\", \"link\", \"meta\"]);\n const topChildren = Array.from(document.body.children)\n .filter(el => !blockTags.has(el.tagName.toLowerCase()));\n const roots = topChildren.length === 1\n ? 
[document.body, topChildren[0]] : [document.body];\n for (const root of roots) {\n for (const el of Array.from(root.children)) {\n const text = (el.innerText || \"\").trim();\n if (text.length >= 20 && linkDensity(el) > 0.5) stash(el);\n }\n }\n const cleanHtml = document.body.innerHTML;\n removed.reverse().forEach(({ el, parent, next }) => parent.insertBefore(el, next));\n return { cleanHtml, extractor: \"heuristic\", debug: readabilityDebug };\n})()\n".freeze