Module: Telegrama::Formatter

Defined in: lib/telegrama/formatter.rb

Defined Under Namespace

Classes: MarkdownError, MarkdownTokenizer

Constant Summary collapse

MARKDOWN_SPECIAL_CHARS = Characters that need special escaping in Telegram’s MarkdownV2 format

%w[_ * [ ] ( ) ~ ` > # + - = | { } . !].freeze

ALWAYS_ESCAPE_CHARS = Characters that should always be escaped in Telegram messages, even when Markdown is enabled

%w[. !].freeze

MARKDOWN_FORMAT_CHARS = Characters used for Markdown formatting that need special handling

%w[* _].freeze

Class Method Summary collapse

.apply_prefix_suffix(text) ⇒ String

Apply configured prefix and suffix to the message.
.escape_html(text) ⇒ String

Escape HTML special characters.
.escape_markdown_aggressive(text) ⇒ String

Fall back to an aggressive approach that escapes everything.
.escape_markdown_v2(text) ⇒ String

The main entry point for MarkdownV2 escaping.
.format(text, options = {}) ⇒ String

Main formatting entry point - processes text according to configuration and options.
.html_to_telegram_markdown(html) ⇒ String

Convert HTML to Telegram MarkdownV2 format.
.obfuscate_emails(text) ⇒ String

Obfuscate email addresses in text.
.strip_markdown(text) ⇒ String

Strip all markdown formatting for plain text delivery.
.tokenize_and_format(text) ⇒ String

Tokenize and format the text using a state machine approach.
.truncate(text, max_length) ⇒ String

Truncate text to a maximum length.

Class Method Details

.apply_prefix_suffix(text) ⇒ `String`

Apply configured prefix and suffix to the message

Parameters:

text (String) —

The original text

Returns:

(String) —

Text with prefix and suffix applied

# File 'lib/telegrama/formatter.rb', line 60

# Wrap a message in the configured prefix and suffix.
#
# Reads `message_prefix` and `message_suffix` from `Telegrama.configuration`;
# a value that is nil (or false) is skipped.
#
# @param text [String] the original text
# @return [String] the text with prefix and suffix applied; a copy of +text+
#   when neither is configured
def self.apply_prefix_suffix(text)
  leading = Telegrama.configuration.message_prefix
  trailing = Telegrama.configuration.message_suffix

  # Nothing configured: still hand back a copy rather than the caller's object.
  return text.dup unless leading || trailing

  # An unset side contributes an empty string to the interpolation.
  head = leading || ''
  tail = trailing || ''
  "#{head}#{text}#{tail}"
end

.escape_html(text) ⇒ `String`

Escape HTML special characters

Parameters:

text (String) —

The text with HTML characters

Returns:

(String) —

The text with HTML characters escaped

# File 'lib/telegrama/formatter.rb', line 588

# Escape the HTML-significant characters `<`, `>` and `&`.
#
# @param text [String] the text with HTML characters
# @return [String] a new string with `<`, `>` and `&` replaced by
#   `&lt;`, `&gt;` and `&amp;`
def self.escape_html(text)
  # A regexp literal without interpolation is built once when the code is
  # parsed, so caching it in a class variable buys nothing (and class variables
  # are shared across the whole inheritance tree).
  #
  # The hash form of gsub looks up each matched character, so all three
  # characters are handled in one pass and an inserted `&` is never re-escaped.
  text.gsub(/[<>&]/, '<' => '&lt;', '>' => '&gt;', '&' => '&amp;')
end

.escape_markdown_aggressive(text) ⇒ `String`

Fall back to an aggressive approach that escapes everything

Parameters:

text (String) —

The text to escape

Returns:

(String) —

The aggressively escaped text

# File 'lib/telegrama/formatter.rb', line 492

# Fall back to an aggressive approach that escapes everything.
#
# Every backslash and every character listed in MARKDOWN_SPECIAL_CHARS is
# prefixed with a backslash. This may break intended formatting, but is meant
# to produce text that can at least be delivered.
#
# @param text [String] the text to escape
# @return [String] a new, aggressively escaped string
def self.escape_markdown_aggressive(text)
  # Regexp.union quotes each string, so characters such as `[`, `.` or `*`
  # are matched literally.
  escapable = Regexp.union('\\', *MARKDOWN_SPECIAL_CHARS)

  # The block form is deliberate: a replacement *string* is scanned for
  # back-references, so "\\\\" collapses to a single backslash, "\\`" expands
  # to the text before the match and "\\+" to the last capture group. The
  # block's return value is inserted verbatim.
  #
  # A single pass also guarantees that backslashes added here are never
  # escaped a second time.
  text.gsub(escapable) { |char| "\\#{char}" }
end

.escape_markdown_v2(text) ⇒ `String`

The main entry point for MarkdownV2 escaping

Parameters:

text (String) —

The text to escape for MarkdownV2 format

Returns:

(String) —

The escaped text

# File 'lib/telegrama/formatter.rb', line 74

# The main entry point for MarkdownV2 escaping.
#
# Nil and empty input are returned untouched. When the text carries the
# standard "Sent via Telegrama" footer and splits into exactly two pieces
# around the "\n--\n" separator, only the first piece is run through
# tokenize_and_format; the separator and footer are re-attached verbatim so
# the dashes stay unescaped. Anything else is tokenized as a whole.
#
# @param text [String, nil] the text to escape for MarkdownV2 format
# @return [String, nil] the escaped text
def self.escape_markdown_v2(text)
  return text if text.nil? || text.empty?

  # Only bother splitting when the standard footer is actually present.
  segments = text.include?("\n--\nSent via Telegrama") ? text.split("\n--\n") : []

  # More (or fewer) than two pieces means the separator is ambiguous, so the
  # whole text goes through the tokenizer.
  return tokenize_and_format(text) unless segments.size == 2

  body, footer = segments
  "#{tokenize_and_format(body)}\n--\n#{footer}"
end

.format(text, options = {}) ⇒ `String`

Main formatting entry point - processes text according to configuration and options

Parameters:

text (String) —

The text to format
options (Hash) (defaults to: {}) —

Formatting options to override configuration defaults

Returns:

(String) —

The formatted text

# File 'lib/telegrama/formatter.rb', line 17

# Main formatting entry point: runs the text through the formatting pipeline
# according to configuration and per-call options.
#
# Steps, in order: prefix/suffix, HTML escaping (:escape_html), email
# obfuscation (:obfuscate_emails), MarkdownV2 escaping (:escape_markdown) and
# finally truncation (:truncate).
#
# @param text [#to_s] the text to format
# @param options [Hash] formatting options overriding the configured
#   `formatting_options`
# @return [String] the formatted text
def self.format(text, options = {})
  # Runtime options win over the configured defaults.
  opts = (Telegrama.configuration.formatting_options || {}).merge(options)

  formatted = apply_prefix_suffix(text.to_s)
  formatted = escape_html(formatted) if opts[:escape_html]

  # Emails are obfuscated before Markdown escaping so they are not escaped twice.
  formatted = obfuscate_emails(formatted) if opts[:obfuscate_emails]

  if opts[:escape_markdown]
    begin
      formatted = escape_markdown_v2(formatted)
    rescue MarkdownError => e
      begin
        Telegrama.log_error("Markdown formatting failed: #{e.message}. Falling back to plain text.")
      rescue => _log_error
        # Best effort only: a failing logger must not block delivery.
      end

      # Deliver as plain text: drop the Markdown syntax and clear the
      # thread-local parse mode override (how callers consume that key is not
      # visible from this method).
      formatted = strip_markdown(formatted)
      Thread.current[:telegrama_parse_mode_override] = nil
    end
  end

  # Truncation always comes last.
  limit = opts[:truncate]
  limit ? truncate(formatted, limit) : formatted
end

.html_to_telegram_markdown(html) ⇒ `String`

Convert HTML to Telegram MarkdownV2 format

Parameters:

html (String) —

The HTML text

Returns:

(String) —

The text converted to MarkdownV2 format

# File 'lib/telegrama/formatter.rb', line 545

# Convert a small subset of HTML to Telegram MarkdownV2 format.
#
# Handles only bare <p>, <strong>, <em>, <code> and <a href="..."> tags; this
# is a simplified conversion, not a full HTML parser. The converted text is
# then passed through escape_markdown_v2.
#
# @param html [String] the HTML text
# @return [String] the text converted to MarkdownV2 format
def self.html_to_telegram_markdown(html)
  # Applied top to bottom; each entry is [tag pattern, Markdown replacement].
  substitutions = [
    [/<\/?p>/, "\n"],
    [/<strong>(.*?)<\/strong>/, "*\\1*"],
    [/<em>(.*?)<\/em>/, "_\\1_"],
    [/<code>(.*?)<\/code>/, "`\\1`"],
    [/<a href="(.*?)">(.*?)<\/a>/, "[\\2](\\1)"]
  ]

  markdown = substitutions.inject(html) do |converted, (tag_pattern, markdown_form)|
    converted.gsub(tag_pattern, markdown_form)
  end

  # Escape special characters outside of the formatting just produced.
  escape_markdown_v2(markdown)
end

.obfuscate_emails(text) ⇒ `String`

Obfuscate email addresses in text

Parameters:

text (String) —

The text containing email addresses

Returns:

(String) —

The text with obfuscated email addresses

# File 'lib/telegrama/formatter.rb', line 561

# Obfuscate email addresses in text.
#
# The local part is shortened: longer than four characters keeps the first
# three and the last one ("johndoe" -> "joh...e"), otherwise only the first
# character is kept ("bob" -> "b..."). The domain is left as is.
#
# @param text [String] the text containing email addresses
# @return [String] a new string with obfuscated email addresses
def self.obfuscate_emails(text)
  # Each match is replaced directly in one pass. An intermediate numbered
  # placeholder would be ambiguous once there are more than ten addresses
  # ("..._1" is a prefix of "..._10") and would also rewrite placeholder-like
  # text already present in the message.
  #
  # The block form inserts its result verbatim, so nothing in the address is
  # treated as a back-reference.
  text.gsub(/\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/) do |email|
    local, domain = email.split('@', 2)
    masked_local = local.length > 4 ? "#{local[0..2]}...#{local[-1]}" : "#{local[0]}..."
    "#{masked_local}@#{domain}"
  end
end

.strip_markdown(text) ⇒ `String`

Strip all markdown formatting for plain text delivery

Parameters:

text (String) —

The text with markdown formatting

Returns:

(String) —

The text with markdown formatting removed

# File 'lib/telegrama/formatter.rb', line 511

# Strip all markdown formatting for plain text delivery.
#
# Paired markers are removed while the text between them is kept; any
# leftover runs of `*`, `_`, `~` or backticks that touch whitespace or the end
# of a line are then dropped.
#
# @param text [String] the text with markdown formatting
# @return [String] a new string with markdown formatting removed
def self.strip_markdown(text)
  # Order matters: links first, code before emphasis, and each doubled marker
  # before its single-character form.
  unwrappers = [
    /\[([^\]]*)\]\([^)]*\)/,   # [text](url) -> text
    /```[a-z]*\n?(.*?)```/m,   # fenced code block -> its content
    /`([^`]*)`/,               # inline code -> its content
    /\*\*([^*]*)\*\*/,         # **bold**
    /\*([^*]*)\*/,             # *bold*
    /__([^_]*)__/,             # __italic__
    /(?<![\\])_([^_]*)_/,      # _italic_ (opening underscore not backslash-escaped)
    /~~([^~]*)~~/,             # ~~strikethrough~~
    /~([^~]*)~/                # ~strikethrough~
  ]

  plain = unwrappers.inject(text) { |current, pattern| current.gsub(pattern, '\1') }

  # Remove remaining unmatched markers next to whitespace / end of line, but
  # leave ones surrounded by word characters alone.
  plain.gsub(/(?<=\s)[*_~`]+|[*_~`]+(?=\s|$)/, '')
end

.tokenize_and_format(text) ⇒ `String`

Tokenize and format the text using a state machine approach

Parameters:

text (String) —

The text to process

Returns:

(String) —

The processed text

# File 'lib/telegrama/formatter.rb', line 94

# Tokenize and format the text.
#
# Complete Markdown links of the form [text](url) are pre-escaped here; the
# result (including any incomplete links, which are left as they are) is then
# handed to MarkdownTokenizer for processing.
#
# @param text [String] the text to process
# @return [String] the processed text, as returned by MarkdownTokenizer#process
def self.tokenize_and_format(text)
  backslash = ->(char) { "\\#{char}" }

  prepared = text.gsub(/\[([^\]]+)\]\(([^)]+)\)/) do
    # Grab both captures up front: the nested gsub calls overwrite the
    # last-match data.
    label = Regexp.last_match(1)
    target = Regexp.last_match(2)

    # Link text: escape the listed special characters, parentheses included.
    safe_label = label.gsub(/([_*\[\]()~`>#+=|{}.!\\])/, &backslash)

    # URL: same set minus parentheses, which delimit the URL itself.
    safe_target = target.gsub(/([_*\[\]~`>#+=|{}.!\\])/, &backslash)

    "[#{safe_label}](#{safe_target})"
  end

  MarkdownTokenizer.new(prepared).process
end

.truncate(text, max_length) ⇒ `String`

Truncate text to a maximum length

Parameters:

text (String) —

The text to truncate
max_length (Integer, nil) —

The maximum length or nil for no truncation

Returns:

(String) —

The truncated text

# File 'lib/telegrama/formatter.rb', line 599

# Truncate text to a maximum length.
#
# @param text [String] the text to truncate
# @param max_length [Integer, nil] the maximum number of characters, or nil
#   for no truncation; a negative value is treated as zero
# @return [String] the text itself when it already fits (or no limit is
#   given), otherwise its first +max_length+ characters
def self.truncate(text, max_length)
  return text unless max_length && text.length > max_length

  # String#[] returns nil for a negative length; clamp so a String always
  # comes back.
  text[0, [max_length, 0].max]
end

Module: Telegrama::Formatter

Defined Under Namespace

Constant Summary collapse

Class Method Summary collapse

Class Method Details

.apply_prefix_suffix(text) ⇒ String

.escape_html(text) ⇒ String

.escape_markdown_aggressive(text) ⇒ String

.escape_markdown_v2(text) ⇒ String

.format(text, options = {}) ⇒ String

.html_to_telegram_markdown(html) ⇒ String

.obfuscate_emails(text) ⇒ String

.strip_markdown(text) ⇒ String

.tokenize_and_format(text) ⇒ String

.truncate(text, max_length) ⇒ String

.apply_prefix_suffix(text) ⇒ `String`

.escape_html(text) ⇒ `String`

.escape_markdown_aggressive(text) ⇒ `String`

.escape_markdown_v2(text) ⇒ `String`

.format(text, options = {}) ⇒ `String`

.html_to_telegram_markdown(html) ⇒ `String`

.obfuscate_emails(text) ⇒ `String`

.strip_markdown(text) ⇒ `String`

.tokenize_and_format(text) ⇒ `String`

.truncate(text, max_length) ⇒ `String`