Module: Telegrama::Formatter

Defined in:
lib/telegrama/formatter.rb

Defined Under Namespace

Classes: MarkdownError, MarkdownTokenizer

Constant Summary collapse

MARKDOWN_SPECIAL_CHARS =

Characters that need special escaping in Telegram’s MarkdownV2 format

%w[_ * [ ] ( ) ~ ` > # + - = | { } . !].freeze
ALWAYS_ESCAPE_CHARS =

Characters that should always be escaped in Telegram messages, even when Markdown is enabled

%w[. !].freeze
MARKDOWN_FORMAT_CHARS =

Characters used for Markdown formatting that need special handling

%w[* _].freeze

Class Method Summary collapse

Class Method Details

.apply_prefix_suffix(text) ⇒ String

Apply configured prefix and suffix to the message



60
61
62
63
64
65
66
67
68
69
# File 'lib/telegrama/formatter.rb', line 60

# Wrap a message with the configured prefix and suffix.
#
# Reads +message_prefix+ and +message_suffix+ from +Telegrama.configuration+;
# either one is skipped when it is nil or false. The argument itself is never
# modified: the method works on a copy.
#
# @param text [String] the message body
# @return [String] the message with any configured prefix/suffix attached
def self.apply_prefix_suffix(text)
  lead  = Telegrama.configuration.message_prefix
  trail = Telegrama.configuration.message_suffix

  wrapped = text.dup
  wrapped = "#{lead}#{wrapped}" if lead
  wrapped = "#{wrapped}#{trail}" if trail
  wrapped
end

.escape_html(text) ⇒ String

Escape HTML special characters



526
527
528
529
530
531
# File 'lib/telegrama/formatter.rb', line 526

# Escape the characters that are significant in HTML: <, > and &.
#
# The substitution is a single pass over the input, so an ampersand that is
# introduced by a replacement (e.g. the one in "&lt;") is never escaped again.
#
# @param text [String] raw text
# @return [String] a new string with <, > and & replaced by their entities
def self.escape_html(text)
  # A regexp literal without interpolation is built once when the code is
  # compiled, so there is nothing to memoise and no shared class variable
  # is needed.
  text.gsub(/[<>&]/, '<' => '&lt;', '>' => '&gt;', '&' => '&amp;')
end

.escape_markdown_aggressive(text) ⇒ String

Fall back to an aggressive approach that escapes everything



456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
# File 'lib/telegrama/formatter.rb', line 456

# Fallback escaper: put a backslash in front of every backslash and every
# character listed in MARKDOWN_SPECIAL_CHARS, without regard for formatting.
#
# This may break intended formatting, but the intent is that the message can
# still be delivered.
#
# @param text [String] raw text
# @return [String] a new string with every special character escaped
def self.escape_markdown_aggressive(text)
  # One pass over the input, so backslashes added by escaping are never
  # themselves re-escaped.
  pattern = Regexp.union('\\', *MARKDOWN_SPECIAL_CHARS)

  # The block form is required here. With a replacement *string*, gsub
  # interprets backslash sequences: '\\\\' collapses to a single backslash
  # (so backslashes were never escaped) and a backslash followed by a
  # backtick means "text before the match" (so backticks duplicated the
  # preceding text instead of being escaped).
  text.gsub(pattern) { |char| "\\#{char}" }
end

.escape_markdown_v2(text) ⇒ String

The main entry point for MarkdownV2 escaping



74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# File 'lib/telegrama/formatter.rb', line 74

# Entry point for MarkdownV2 escaping.
#
# nil and empty input are returned untouched. When the text carries the
# standard "Sent via Telegrama" footer and splits into exactly two pieces
# around the "\n--\n" separator, only the first piece is run through
# tokenize_and_format; the separator and footer are appended verbatim so the
# dashes stay unescaped. Everything else is handed to tokenize_and_format
# as a whole.
#
# @param text [String, nil] text to escape
# @return [String, nil] escaped text (or the input itself when nil/empty)
def self.escape_markdown_v2(text)
  return text if text.nil? || text.empty?

  separator = "\n--\n"
  if text.include?("\n--\nSent via Telegrama")
    segments = text.split(separator)
    # Only the simple body + footer shape gets the special treatment.
    return "#{tokenize_and_format(segments.first)}\n--\n#{segments.last}" if segments.length == 2
  end

  tokenize_and_format(text)
end

.format(text, options = {}) ⇒ String

Main formatting entry point - processes text according to configuration and options



17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/telegrama/formatter.rb', line 17

# Main formatting pipeline.
#
# Steps, in order:
# 1. merge the configured formatting options with the per-call +options+
#    (per-call values win);
# 2. coerce +text+ to a String and attach the configured prefix/suffix;
# 3. HTML-escape when +:escape_html+ is set;
# 4. obfuscate e-mail addresses when +:obfuscate_emails+ is set (done before
#    Markdown escaping so the result is not escaped twice);
# 5. MarkdownV2-escape when +:escape_markdown+ is set; on MarkdownError the
#    failure is logged (logging errors are swallowed), Markdown syntax is
#    stripped, and +Thread.current[:telegrama_parse_mode_override]+ is set
#    to nil;
# 6. truncate when +:truncate+ is set.
#
# @param text [#to_s] message to format
# @param options [Hash] overrides for the configured formatting options
# @return [String] the formatted message
def self.format(text, options = {})
  opts = (Telegrama.configuration.formatting_options || {}).merge(options)

  message = apply_prefix_suffix(text.to_s)
  message = escape_html(message) if opts[:escape_html]
  message = obfuscate_emails(message) if opts[:obfuscate_emails]

  if opts[:escape_markdown]
    begin
      message = escape_markdown_v2(message)
    rescue MarkdownError => e
      begin
        Telegrama.log_error("Markdown formatting failed: #{e.message}. Falling back to plain text.")
      rescue => _log_error
        # A broken logger must not prevent the plain-text fallback.
      end
      # Deliver as plain text: drop Markdown syntax and clear the
      # thread-local parse mode override.
      message = strip_markdown(message)
      Thread.current[:telegrama_parse_mode_override] = nil
    end
  end

  # Truncation is deliberately the final step.
  opts[:truncate] ? truncate(message, opts[:truncate]) : message
end

.html_to_telegram_markdown(html) ⇒ String

Convert HTML to Telegram MarkdownV2 format



483
484
485
486
487
488
489
490
491
492
493
494
# File 'lib/telegrama/formatter.rb', line 483

# Convert a small subset of HTML into Telegram MarkdownV2.
#
# Handled tags: <p> / </p> (become newlines), <strong> (bold), <em> (italic),
# <code> (inline code) and <a href="..."> (inline link). This is a simplified
# converter; other markup is left as-is. The converted text is then passed
# through escape_markdown_v2.
#
# @param html [String] HTML fragment
# @return [String] the result of escape_markdown_v2 on the converted text
def self.html_to_telegram_markdown(html)
  # Applied top to bottom; each entry is [pattern, replacement].
  conversions = [
    [/<\/?p>/, "\n"],
    [/<strong>(.*?)<\/strong>/, "*\\1*"],
    [/<em>(.*?)<\/em>/, "_\\1_"],
    [/<code>(.*?)<\/code>/, "`\\1`"],
    [/<a href="(.*?)">(.*?)<\/a>/, "[\\2](\\1)"]
  ]

  markdown = conversions.reduce(html) do |converted, (pattern, replacement)|
    converted.gsub(pattern, replacement)
  end

  escape_markdown_v2(markdown)
end

.obfuscate_emails(text) ⇒ String

Obfuscate email addresses in text



499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
# File 'lib/telegrama/formatter.rb', line 499

# Obfuscate every e-mail address found in the text.
#
# The local part (before the @) is shortened: when it is longer than four
# characters it becomes its first three characters, "...", and its last
# character; otherwise it becomes its first character followed by "...".
# The domain is kept unchanged.
#
# @param text [String] text that may contain e-mail addresses
# @return [String] a new string with each address obfuscated
def self.obfuscate_emails(text)
  # Each address is rewritten in place during a single scan. Going through
  # numbered placeholder tokens is unsafe: "..._1" is a prefix of "..._10",
  # so texts with more than ten addresses got the wrong replacement.
  text.gsub(/\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/) do |email|
    # The pattern allows exactly one "@" per match.
    local, domain = email.split('@')
    masked_local = local.length > 4 ? "#{local[0..2]}...#{local[-1]}" : "#{local[0]}..."
    "#{masked_local}@#{domain}"
  end
end

.strip_markdown(text) ⇒ String

Strip all markdown formatting for plain text delivery



475
476
477
478
# File 'lib/telegrama/formatter.rb', line 475

# Remove Markdown syntax so the text can be sent as plain text.
#
# Deletes every *, _, ~ and backtick character, and deletes inline links of
# the form [label](target) in their entirety (label and target included).
#
# @param text [String] text that may contain Markdown
# @return [String] a new string with the matched syntax removed
def self.strip_markdown(text)
  markdown_syntax = /[*_~`]|\[.*?\]\(.*?\)/
  text.gsub(markdown_syntax, '')
end

.tokenize_and_format(text) ⇒ String

Tokenize and format the text using a state machine approach



94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# File 'lib/telegrama/formatter.rb', line 94

# Escape complete inline links, then run the text through MarkdownTokenizer.
#
# Only complete links of the form [label](url), with a non-empty label and
# url, are rewritten here; anything else is left for the tokenizer. Inside
# the label every listed special character (parentheses included) gets a
# leading backslash; inside the url the same set minus parentheses is
# escaped, since parentheses delimit the url.
#
# @param text [String] text to process
# @return [String] whatever MarkdownTokenizer#process returns for the
#   link-escaped text
def self.tokenize_and_format(text)
  with_escaped_links = text.gsub(/\[([^\]]+)\]\(([^)]+)\)/) do
    # Grab both captures before the nested gsub calls overwrite the match data.
    label, url = Regexp.last_match.captures

    escaped_label = label.gsub(/([_*\[\]()~`>#+=|{}.!\\])/) { |ch| "\\#{ch}" }
    escaped_url = url.gsub(/([_*\[\]~`>#+=|{}.!\\])/) { |ch| "\\#{ch}" }

    "[#{escaped_label}](#{escaped_url})"
  end

  MarkdownTokenizer.new(with_escaped_links).process
end

.truncate(text, max_length) ⇒ String

Truncate text to a maximum length



537
538
539
540
# File 'lib/telegrama/formatter.rb', line 537

# Cut text down to at most +max_length+ characters.
#
# @param text [String] text to shorten
# @param max_length [Integer, nil, false] character limit; nil or false
#   disables truncation
# @return [String] +text+ itself when no limit is given or it already fits,
#   otherwise its first +max_length+ characters
def self.truncate(text, max_length)
  return text unless max_length && text.length > max_length

  text.slice(0, max_length)
end