Module: Telegrama::Formatter

Defined in:
lib/telegrama/formatter.rb

Defined Under Namespace

Classes: MarkdownError, MarkdownTokenizer

Constant Summary collapse

MARKDOWN_SPECIAL_CHARS =

Characters that need special escaping in Telegram’s MarkdownV2 format

%w[_ * [ ] ( ) ~ ` > # + - = | { } . !].freeze
ALWAYS_ESCAPE_CHARS =

Characters that should always be escaped in Telegram messages, even when Markdown is enabled

%w[. !].freeze
MARKDOWN_FORMAT_CHARS =

Characters used for Markdown formatting that need special handling

%w[* _].freeze

Class Method Summary collapse

Class Method Details

.apply_prefix_suffix(text) ⇒ String

Apply configured prefix and suffix to the message

Parameters:

  • text (String)

    The original text

Returns:

  • (String)

    Text with prefix and suffix applied



60
61
62
63
64
65
66
67
68
69
# File 'lib/telegrama/formatter.rb', line 60

# Wraps a message in the configured prefix and suffix.
#
# Reads `message_prefix` and `message_suffix` from `Telegrama.configuration`;
# either one is skipped when it is nil/false.
#
# @param text [String] the original message
# @return [String] a new string with the prefix/suffix applied
def self.apply_prefix_suffix(text)
  config = Telegrama.configuration
  leading = config.message_prefix
  trailing = config.message_suffix

  # Without a prefix we still hand back a copy, never the caller's own object.
  prefixed = leading ? "#{leading}#{text}" : text.dup
  trailing ? "#{prefixed}#{trailing}" : prefixed
end

.escape_html(text) ⇒ String

Escape HTML special characters

Parameters:

  • text (String)

    The text with HTML characters

Returns:

  • (String)

    The text with HTML characters escaped



588
589
590
591
592
593
# File 'lib/telegrama/formatter.rb', line 588

# Escapes the HTML-significant characters `<`, `>` and `&` as entities.
#
# @param text [String] the text that may contain HTML characters
# @return [String] a new string with `<`, `>` and `&` replaced by
#   `&lt;`, `&gt;` and `&amp;`
def self.escape_html(text)
  # A regex literal without interpolation is built once by the parser, so it
  # needs no manual caching (and no shared class-variable state).
  # The hash form of gsub looks each matched character up in one pass, which
  # keeps already-produced entities from being escaped a second time.
  text.gsub(/[<>&]/, '<' => '&lt;', '>' => '&gt;', '&' => '&amp;')
end

.escape_markdown_aggressive(text) ⇒ String

Fall back to an aggressive approach that escapes everything

Parameters:

  • text (String)

    The text to escape

Returns:

  • (String)

    The aggressively escaped text



492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
# File 'lib/telegrama/formatter.rb', line 492

# Fallback escaper: prefixes every backslash and every character listed in
# MARKDOWN_SPECIAL_CHARS with a backslash, regardless of context.
#
# This may destroy intended formatting, but the goal is a message that can
# still be delivered.
#
# @param text [String] the text to escape
# @return [String] a new, aggressively escaped string
def self.escape_markdown_aggressive(text)
  reserved = Regexp.union('\\', *MARKDOWN_SPECIAL_CHARS)

  # The block form is deliberate: in a gsub *replacement string* backslash
  # sequences are interpreted ("\\\\" collapses to one backslash and "\\`"
  # expands to the pre-match text), which silently corrupts the output.
  # A block's return value is inserted verbatim. Doing everything in a single
  # pass also guarantees inserted backslashes are never escaped again.
  text.gsub(reserved) { |char| "\\#{char}" }
end

.escape_markdown_v2(text) ⇒ String

The main entry point for MarkdownV2 escaping

Parameters:

  • text (String)

    The text to escape for MarkdownV2 format

Returns:

  • (String)

    The escaped text



74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# File 'lib/telegrama/formatter.rb', line 74

# Entry point for MarkdownV2 escaping.
#
# nil and empty input are returned untouched. Text carrying the standard
# "Sent via Telegrama" footer has only the part before the "--" separator
# run through tokenize_and_format; everything else is tokenized as a whole.
#
# @param text [String, nil] the text to escape for MarkdownV2
# @return [String, nil] the escaped text
def self.escape_markdown_v2(text)
  return text if text.nil? || text.empty?

  if text.include?("\n--\nSent via Telegrama")
    segments = text.split("\n--\n")

    # Only the simple "body + footer" shape is special-cased; a body that
    # itself contains the separator falls through to the generic path below.
    if segments.size == 2
      body, signature = segments
      return [tokenize_and_format(body), signature].join("\n--\n")
    end
  end

  tokenize_and_format(text)
end

.format(text, options = {}) ⇒ String

Main formatting entry point - processes text according to configuration and options

Parameters:

  • text (String)

    The text to format

  • options (Hash) (defaults to: {})

    Formatting options to override configuration defaults

Returns:

  • (String)

    The formatted text



17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/telegrama/formatter.rb', line 17

# Main formatting pipeline.
#
# Steps, in order: prefix/suffix, optional HTML escaping, optional email
# obfuscation, optional MarkdownV2 escaping (with a plain-text fallback when
# a MarkdownError is raised), and finally optional truncation.
#
# @param text [#to_s] the text to format
# @param options [Hash] per-call overrides merged over
#   `Telegrama.configuration.formatting_options`
# @return [String] the formatted text
def self.format(text, options = {})
  settings = (Telegrama.configuration.formatting_options || {}).merge(options)

  message = apply_prefix_suffix(text.to_s)

  # HTML escaping and email obfuscation both run before Markdown escaping.
  message = escape_html(message) if settings[:escape_html]
  message = obfuscate_emails(message) if settings[:obfuscate_emails]

  if settings[:escape_markdown]
    begin
      message = escape_markdown_v2(message)
    rescue MarkdownError => e
      begin
        Telegrama.log_error("Markdown formatting failed: #{e.message}. Falling back to plain text.")
      rescue => _log_error
        # A failing logger must not prevent the plain-text fallback.
      end

      # Deliver as plain text: drop the markdown syntax and clear the
      # thread-local parse-mode override.
      message = strip_markdown(message)
      Thread.current[:telegrama_parse_mode_override] = nil
    end
  end

  # Truncation is always the last step.
  limit = settings[:truncate]
  limit ? truncate(message, limit) : message
end

.html_to_telegram_markdown(html) ⇒ String

Convert HTML to Telegram MarkdownV2 format

Parameters:

  • html (String)

    The HTML text

Returns:

  • (String)

    The text converted to MarkdownV2 format



545
546
547
548
549
550
551
552
553
554
555
556
# File 'lib/telegrama/formatter.rb', line 545

# Converts a small subset of HTML into Telegram MarkdownV2.
#
# Handles <p>, <strong>, <em>, <code> and <a href="..."> only; this is a
# simplified converter, not a full HTML parser. The converted text is then
# passed through escape_markdown_v2.
#
# @param html [String] the HTML text
# @return [String] the text converted to MarkdownV2
def self.html_to_telegram_markdown(html)
  # Applied top to bottom; each pair is [tag pattern, markdown replacement].
  conversions = [
    [/<\/?p>/, "\n"],
    [/<strong>(.*?)<\/strong>/, "*\\1*"],
    [/<em>(.*?)<\/em>/, "_\\1_"],
    [/<code>(.*?)<\/code>/, "`\\1`"],
    [/<a href="(.*?)">(.*?)<\/a>/, "[\\2](\\1)"]
  ]

  markdown = conversions.reduce(html) do |converted, (tag, replacement)|
    converted.gsub(tag, replacement)
  end

  escape_markdown_v2(markdown)
end

.obfuscate_emails(text) ⇒ String

Obfuscate email addresses in text

Parameters:

  • text (String)

    The text containing email addresses

Returns:

  • (String)

    The text with obfuscated email addresses



561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
# File 'lib/telegrama/formatter.rb', line 561

# Obfuscates the local part of every email address found in the text.
#
# Local parts longer than four characters keep their first three and last
# character ("john.doe@x.com" -> "joh...e@x.com"); shorter ones keep only the
# first character ("bob@x.io" -> "b...@x.io"). The domain is left unchanged.
#
# @param text [String] the text that may contain email addresses
# @return [String] a new string with each address obfuscated
def self.obfuscate_emails(text)
  # Each match is rewritten in place. Going through numbered placeholder
  # tokens instead is unsafe: "..._1" is a prefix of "..._10", so messages
  # with more than ten addresses would be corrupted, as would input that
  # happens to contain the placeholder text itself.
  text.gsub(/\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/) do |email|
    local, domain = email.split('@', 2)
    masked_local = local.length > 4 ? "#{local[0..2]}...#{local[-1]}" : "#{local[0]}..."
    "#{masked_local}@#{domain}"
  end
end

.strip_markdown(text) ⇒ String

Strip all markdown formatting for plain text delivery

Parameters:

  • text (String)

    The text with markdown formatting

Returns:

  • (String)

    The text with markdown formatting removed



511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
# File 'lib/telegrama/formatter.rb', line 511

# Removes markdown syntax so the message can be delivered as plain text.
#
# @param text [String] the text with markdown formatting
# @return [String] a new string with the formatting markers removed
def self.strip_markdown(text)
  # Each rule keeps capture group 1 (the inner content) and drops the
  # surrounding markers. Order matters: double markers are handled before
  # their single-character counterparts.
  unwrap_rules = [
    /\[([^\]]*)\]\([^)]*\)/,    # [text](url) -> text
    /```[a-z]*\n?(.*?)```/m,    # fenced code block
    /`([^`]*)`/,                # inline code
    /\*\*([^*]*)\*\*/,          # **bold**
    /\*([^*]*)\*/,              # *bold*
    /__([^_]*)__/,              # __italic__
    /(?<![\\])_([^_]*)_/,       # _italic_ (not preceded by a backslash)
    /~~([^~]*)~~/,              # ~~strikethrough~~
    /~([^~]*)~/                 # ~strikethrough~
  ]

  unwrapped = unwrap_rules.reduce(text) { |current, rule| current.gsub(rule, '\1') }

  # Finally drop leftover marker runs that follow whitespace or precede
  # whitespace / end of line; markers inside words are left alone.
  unwrapped.gsub(/(?<=\s)[*_~`]+|[*_~`]+(?=\s|$)/, '')
end

.tokenize_and_format(text) ⇒ String

Tokenize and format the text using a state machine approach

Parameters:

  • text (String)

    The text to process

Returns:

  • (String)

    The processed text



94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# File 'lib/telegrama/formatter.rb', line 94

# Pre-escapes complete markdown links, then hands the text to
# MarkdownTokenizer for the rest of the processing.
#
# Only well-formed `[text](url)` links are rewritten here; incomplete links
# are left as they are for the tokenizer to deal with.
#
# @param text [String] the text to process
# @return [String] whatever MarkdownTokenizer#process returns for the text
def self.tokenize_and_format(text)
  escaped_links = text.gsub(/\[([^\]]+)\]\(([^)]+)\)/) do
    # Grab both captures up front: the nested gsub calls below overwrite the
    # last-match data.
    label, target = Regexp.last_match.captures

    # Link text: escape the listed special characters, parentheses included.
    safe_label = label.gsub(/([_*\[\]()~`>#+=|{}.!\\])/) { |char| "\\#{char}" }

    # URL: same idea, but parentheses are left alone since they delimit it.
    safe_target = target.gsub(/([_*\[\]~`>#+=|{}.!\\])/) { |char| "\\#{char}" }

    "[#{safe_label}](#{safe_target})"
  end

  MarkdownTokenizer.new(escaped_links).process
end

.truncate(text, max_length) ⇒ String

Truncate text to a maximum length

Parameters:

  • text (String)

    The text to truncate

  • max_length (Integer, nil)

    The maximum length or nil for no truncation

Returns:

  • (String)

    The truncated text



599
600
601
602
# File 'lib/telegrama/formatter.rb', line 599

# Cuts the text down to at most `max_length` characters.
#
# @param text [String] the text to truncate
# @param max_length [Integer, nil] the maximum length; nil/false disables
#   truncation, and a negative value is treated as 0
# @return [String] the original text when it already fits, otherwise its
#   first `max_length` characters
def self.truncate(text, max_length)
  return text if !max_length || text.length <= max_length

  # String#[] returns nil for a negative length; clamp so a String is always
  # returned.
  text[0, [max_length, 0].max]
end