Module: RubyCrawl::MarkdownConverter

Defined in:
lib/rubycrawl/markdown_converter.rb

Overview

Converts HTML to Markdown using the reverse_markdown gem.

Constant Summary collapse

MARKDOWN_URL_PATTERNS =

Patterns matching Markdown image and link syntax whose URL target begins with "/" (root-relative URLs)

[
  # Each pattern captures the bracketed label (including a leading "!" for
  # images) as group 1 and the URL target as group 2. Group 2 only has to
  # begin with "/", so protocol-relative targets ("//host/path") match as well.
  %r{(!\[[^\]]*\])\((/[^)]+)\)}, # ![alt](/path)
  %r{(\[[^\]]*\])\((/[^)]+)\)}   # [text](/path)
].freeze

Class Method Summary collapse

Class Method Details

.convert(html, base_url: nil, **options) ⇒ String

Convert HTML to Markdown with resolved URLs.

Parameters:

  • html (String)

    The HTML content to convert

  • base_url (String, nil) (defaults to: nil)

    Base URL to resolve relative URLs

  • options (Hash)

    Options for conversion

Returns:

  • (String)

    The Markdown content with absolute URLs



22
23
24
25
26
27
28
29
30
31
# File 'lib/rubycrawl/markdown_converter.rb', line 22

# Convert an HTML string to Markdown, optionally rewriting relative URLs.
#
# Caller-supplied options are merged over default_options before being passed
# to ReverseMarkdown.convert.
#
# @param html [String, nil] the HTML content to convert
# @param base_url [String, nil] base URL used to resolve relative URLs;
#   when nil the converted markdown is returned untouched
# @param options [Hash] extra options forwarded to ReverseMarkdown
# @return [String] the Markdown output; '' for nil/empty input or when the
#   reverse_markdown gem cannot be loaded
def convert(html, base_url: nil, **options)
  # Nothing to convert: skip loading the gem entirely.
  return '' if html.nil?
  return '' if html.empty?

  require_reverse_markdown
  rendered = ReverseMarkdown.convert(html, default_options.merge(options))
  return rendered unless base_url

  resolve_relative_urls(rendered, base_url)
rescue LoadError
  # Markdown support is optional: warn and degrade to an empty result.
  warn '[rubycrawl] reverse_markdown gem not installed. Add it to your Gemfile for markdown support.'
  ''
end

.default_options ⇒ Object



56
57
58
59
60
61
62
# File 'lib/rubycrawl/markdown_converter.rb', line 56

# Baseline options passed to ReverseMarkdown.convert. A fresh, unfrozen Hash is
# built on every call, so callers may merge into or mutate the result freely.
#
# @return [Hash] the default conversion options
def default_options
  { unknown_tags: :bypass, github_flavored: true, tag_border: '' }
end

.require_reverse_markdown ⇒ Object



52
53
54
# File 'lib/rubycrawl/markdown_converter.rb', line 52

# Load the reverse_markdown gem on demand rather than at file load time.
# A LoadError from a missing gem propagates to the caller.
#
# @return [Boolean] the result of Kernel#require
def require_reverse_markdown
  require('reverse_markdown')
end

.resolve_relative_urls(markdown, base_url) ⇒ String

Resolve relative URLs in markdown to absolute URLs.

Parameters:

  • markdown (String)

    The markdown content

  • base_url (String)

    The base URL to resolve against

Returns:

  • (String)

    Markdown with absolute URLs



38
39
40
41
42
43
44
45
46
47
48
49
50
# File 'lib/rubycrawl/markdown_converter.rb', line 38

# Resolve root-relative URLs in markdown links and images to absolute URLs.
#
# Only targets matched by MARKDOWN_URL_PATTERNS (those beginning with "/") are
# rewritten. Root-relative targets ("/path") are prefixed with the origin of
# base_url; protocol-relative targets ("//host/path") inherit only its scheme.
#
# @param markdown [String] the markdown content
# @param base_url [String, nil] the base URL to resolve against
# @return [String] markdown with absolute URLs, or the input unchanged when
#   base_url is nil, cannot be parsed, or has no scheme/host
def resolve_relative_urls(markdown, base_url)
  return markdown unless base_url

  base_uri = URI.parse(base_url)
  scheme = base_uri.scheme
  host = base_uri.host
  # A base such as "example.com" parses without scheme/host; prefixing links
  # with it would produce malformed URLs, so leave the markdown untouched.
  return markdown if scheme.nil? || host.nil? || host.empty?

  origin = "#{scheme}://#{host}"
  # Drop the port only when it is the default for *this* scheme, so that
  # e.g. http://host:443 keeps its explicit port.
  port = base_uri.port
  origin += ":#{port}" unless port.nil? || port == base_uri.default_port

  MARKDOWN_URL_PATTERNS.reduce(markdown) do |md, pattern|
    md.gsub(pattern) do
      label = ::Regexp.last_match(1)
      target = ::Regexp.last_match(2)
      # "//host/path" already names its own host; it only needs a scheme.
      absolute = target.start_with?('//') ? "#{scheme}:#{target}" : "#{origin}#{target}"
      "#{label}(#{absolute})"
    end
  end
rescue URI::InvalidURIError
  markdown
end