Module: Philiprehberger::RegexLib

Defined in:
lib/philiprehberger/regex_lib.rb,
lib/philiprehberger/regex_lib/version.rb,
lib/philiprehberger/regex_lib/patterns.rb

Defined Under Namespace

Classes: Error, Result

Constant Summary collapse

VERSION =
'0.4.0'
EMAIL =

Email address (local@domain.tld) with named captures

%r{\A(?<local>[a-zA-Z0-9.!\#$%&'*+/=?^_`{|}~-]+)@(?<domain>[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*\.[a-zA-Z]{2,})\z}
URL =

HTTP/HTTPS URL with named captures

%r{\A(?<scheme>https?)://(?<host>[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)(?::(?<port>\d{1,5}))?(?<path>/[^\s?#]*)?(?:\?(?<query>[^\s#]*))?(?:#(?<fragment>\S*))?\z}
IPV4 =

IPv4 address (0-255 per octet) with named captures

/\A(?<octet1>25[0-5]|2[0-4]\d|[01]?\d\d?)\.(?<octet2>25[0-5]|2[0-4]\d|[01]?\d\d?)\.(?<octet3>25[0-5]|2[0-4]\d|[01]?\d\d?)\.(?<octet4>25[0-5]|2[0-4]\d|[01]?\d\d?)\z/
IPV6 =

IPv6 address (simplified: full, compressed, and loopback forms)

/\A(?:(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?::[0-9a-fA-F]{1,4}){1,6}|:(?::[0-9a-fA-F]{1,4}){1,7}|::)\z/
UUID =

UUID in 8-4-4-4-12 hex format (described as v4, but the version and variant digits are not checked)

/\A[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\z/
PHONE_E164 =

E.164 phone number (+1234567890, 7-15 digits after +)

/\A\+[1-9]\d{6,14}\z/
DATE_ISO =

ISO 8601 date (YYYY-MM-DD) with named captures

/\A(?<year>\d{4})-(?<month>0[1-9]|1[0-2])-(?<day>0[1-9]|[12]\d|3[01])\z/
TIME_ISO =

ISO 8601 time (HH:MM:SS with optional timezone)

/\A(?:[01]\d|2[0-3]):[0-5]\d:[0-5]\d(?:Z|[+-](?:[01]\d|2[0-3]):[0-5]\d)?\z/
DATETIME_ISO =

ISO 8601 datetime (YYYY-MM-DDTHH:MM:SS with optional timezone)

/\A\d{4}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])T(?:[01]\d|2[0-3]):[0-5]\d:[0-5]\d(?:Z|[+-](?:[01]\d|2[0-3]):[0-5]\d)?\z/
HEX_COLOR =

Hex color (#RGB or #RRGGBB)

/\A#(?:[0-9a-fA-F]{3}|[0-9a-fA-F]{6})\z/
CREDIT_CARD =

Credit card number (nominally 13-19 digits, optional spaces or dashes between groups; note that the pattern as written accepts 10-17 digits)

/\A\d[ -]?\d{3,4}[ -]?\d{3,4}[ -]?\d{3,4}[ -]?\d{0,4}\z/
SSN =

US Social Security Number (XXX-XX-XXXX)

/\A\d{3}-\d{2}-\d{4}\z/
MAC_ADDRESS =

MAC address (AA:BB:CC:DD:EE:FF or AA-BB-CC-DD-EE-FF)

/\A[0-9a-fA-F]{2}(?:[-:][0-9a-fA-F]{2}){5}\z/
SEMANTIC_VERSION =

Semantic version (major.minor.patch with optional pre-release and build metadata)

/\A(?<major>0|[1-9]\d*)\.(?<minor>0|[1-9]\d*)\.(?<patch>0|[1-9]\d*)(?:-(?<prerelease>[0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?(?:\+(?<buildmetadata>[0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?\z/
SLUG =

URL slug (lowercase alphanumeric and hyphens)

/\A[a-z0-9]+(?:-[a-z0-9]+)*\z/
IBAN =

International Bank Account Number with named captures

/\A(?<country>[A-Z]{2})(?<check>\d{2})(?<bban>[A-Z0-9]{1,30})\z/
DOMAIN =

Valid domain name (labels separated by dots, TLD 2+ chars)

/\A[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*\.[a-zA-Z]{2,}\z/
FILE_PATH_UNIX =

Unix file path (/foo/bar/baz.txt)

%r{\A/(?:[^\0/]+/?)*\z}
FILE_PATH_WINDOWS =

Windows file path (C:\foo\bar\baz.txt)

Regexp.new('\A[a-zA-Z]:\\\\(?:[^\x00\\\\/:*?"<>|]+\\\\)*[^\x00\\\\/:*?"<>|]*\z')
JWT =

JSON Web Token (3 base64url segments separated by dots)

/\A[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\z/
BASE64 =

Base64 encoded string (full match)

%r{\A(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?\z}
MARKDOWN_LINK =

Markdown link ([text](url)) with named captures

/\A\[(?<text>[^\]]+)\]\((?<url>[^)]+)\)\z/
HASHTAG =

Social media hashtag (#word)

/\A#[a-zA-Z]\w*\z/
MENTION =

Social media mention (@username)

/\A@[a-zA-Z]\w*\z/
JSON_STRING =

Double-quoted JSON string with escape support

/\A"(?:[^"\\]|\\.)*"\z/
HTML_TAG =

HTML opening/closing tag

%r{\A</?[a-zA-Z][a-zA-Z0-9]*(?:\s+[a-zA-Z][a-zA-Z0-9-]*(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>]+))?)*\s*/?>}
CRON_EXPRESSION =

Cron expression (minute hour day month weekday, with optional /step)

%r{\A(\*|[0-5]?\d)(/\d+)?\s+(\*|[01]?\d|2[0-3])(/\d+)?\s+(\*|[1-9]|[12]\d|3[01])(/\d+)?\s+(\*|[1-9]|1[0-2])(/\d+)?\s+(\*|[0-7])(/\d+)?\z}
CIDR =

CIDR notation (IPv4 address with prefix length 0-32)

%r{\A(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)/(?:3[0-2]|[12]?\d)\z}
PATTERNS =

Map of pattern names to constants for helper methods

{
  email: EMAIL,
  url: URL,
  ipv4: IPV4,
  ipv6: IPV6,
  uuid: UUID,
  phone_e164: PHONE_E164,
  date_iso: DATE_ISO,
  time_iso: TIME_ISO,
  datetime_iso: DATETIME_ISO,
  hex_color: HEX_COLOR,
  credit_card: CREDIT_CARD,
  ssn: SSN,
  mac_address: MAC_ADDRESS,
  semantic_version: SEMANTIC_VERSION,
  slug: SLUG,
  iban: IBAN,
  domain: DOMAIN,
  file_path_unix: FILE_PATH_UNIX,
  file_path_windows: FILE_PATH_WINDOWS,
  jwt: JWT,
  base64: BASE64,
  markdown_link: MARKDOWN_LINK,
  hashtag: HASHTAG,
  mention: MENTION,
  json_string: JSON_STRING,
  html_tag: HTML_TAG,
  cron_expression: CRON_EXPRESSION,
  cidr: CIDR
}.freeze

Class Method Summary collapse

Class Method Details

.combine(*pattern_names, name: nil) ⇒ Regexp

Combine multiple patterns with alternation.

Parameters:

  • pattern_names (Array<Symbol>)

    the pattern names to combine

  • name (Symbol, nil) (defaults to: nil)

    optional name to store the combined pattern

Returns:

  • (Regexp)

    the combined pattern

Raises:

  • (Error)

    if any pattern name is not recognized



78
79
80
81
82
83
84
85
86
87
88
# File 'lib/philiprehberger/regex_lib.rb', line 78

# Build one anchored pattern that accepts a string matching any of the
# named patterns.
#
# Each source pattern has its leading +\A+ and trailing +\z+ removed, is
# wrapped in its own non-capturing group, and the groups are joined with
# +|+ inside a fresh +\A...\z+ pair.
#
# @param pattern_names [Array<Symbol>] names of the patterns to join
# @param name [Symbol, nil] when given, the result is also stored in the
#   custom pattern registry under this key
# @return [Regexp] the combined pattern
# @raise [Error] if any pattern name is not recognized
def self.combine(*pattern_names, name: nil)
  alternatives = pattern_names.map do |pattern_name|
    body = resolve_pattern!(pattern_name).source.delete_prefix('\A').delete_suffix('\z')
    "(?:#{body})"
  end

  union = Regexp.new("\\A(?:#{alternatives.join('|')})\\z")
  @custom_patterns[name] = union if name
  union
end

.extract(pattern_name, string) ⇒ Hash{String => String}, ...

Extract named captures from a string using a named pattern.

Parameters:

  • pattern_name (Symbol)

    the pattern name (e.g. :date_iso, :semantic_version)

  • string (String)

    the string to extract from

Returns:

  • (Hash{String => String}, String, nil)

    hash of named captures, full match, or nil

Raises:

  • (Error)

    if the pattern name is not recognized



44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/philiprehberger/regex_lib.rb', line 44

# Match a string against a named pattern and return what was captured.
#
# @param pattern_name [Symbol] the pattern name (e.g. :date_iso)
# @param string [String] the string to match
# @return [Hash{String => String}, String, nil] the named captures when the
#   pattern defines any, otherwise the full matched text; nil when the
#   string does not match
# @raise [Error] if the pattern name is not recognized
def self.extract(pattern_name, string)
  match_data = resolve_pattern!(pattern_name).match(string)
  return unless match_data

  captures = match_data.named_captures
  captures.empty? ? match_data[0] : captures
end

.extract_all(pattern_name, string) ⇒ Array<String>

Find all matches of a named pattern in a string.

Parameters:

  • pattern_name (Symbol)

    the pattern name (e.g. :email, :url)

  • string (String)

    the string to search

Returns:

  • (Array<String>)

    all matches found in the string

Raises:

  • (Error)

    if the pattern name is not recognized



63
64
65
66
67
68
69
70
# File 'lib/philiprehberger/regex_lib.rb', line 63

# Find every occurrence of a named pattern inside a string.
#
# The pattern's leading +\A+ and trailing +\z+ are removed so it can match
# anywhere in the input. Each result is the full text of one match.
#
# @param pattern_name [Symbol] the pattern name (e.g. :email, :url)
# @param string [String] the string to search
# @return [Array<String>] all matches found in the string, in order
# @raise [Error] if the pattern name is not recognized
def self.extract_all(pattern_name, string)
  pattern = resolve_pattern!(pattern_name)
  source = pattern.source.delete_prefix('\A').delete_suffix('\z')
  unanchored = Regexp.new(source, pattern.options)

  # String#scan without a block returns arrays of capture groups as soon as
  # the pattern contains any group, so the full match is read from the
  # MatchData instead. This keeps the result a flat list of strings for
  # patterns with numbered groups and makes rewriting named groups
  # unnecessary.
  matches = []
  string.scan(unanchored) { matches << Regexp.last_match(0) }
  matches
end

.highlight(pattern_name, string, before: '**', after: '**') ⇒ String

Wrap matches of a named pattern with delimiter strings.

Parameters:

  • pattern_name (Symbol)

    the pattern name

  • string (String)

    the string to search

  • before (String) (defaults to: '**')

    text to insert before each match (default: ‘**’)

  • after (String) (defaults to: '**')

    text to insert after each match (default: ‘**’)

Returns:

  • (String)

    the string with matches wrapped

Raises:

  • (Error)

    if the pattern name is not recognized



209
210
211
212
# File 'lib/philiprehberger/regex_lib.rb', line 209

# Surround every occurrence of a named pattern with marker strings.
#
# @param pattern_name [Symbol] the pattern name
# @param string [String] the string to search
# @param before [String] text inserted in front of each match
# @param after [String] text inserted behind each match
# @return [String] a copy of the string with each match wrapped
# @raise [Error] if the pattern name is not recognized
def self.highlight(pattern_name, string, before: '**', after: '**')
  matcher = unanchored_pattern(resolve_pattern!(pattern_name))

  string.gsub(matcher) do |hit|
    "#{before}#{hit}#{after}"
  end
end

.mask(pattern_name, string, char: '*', keep: 4) ⇒ String

Mask matches of a named pattern, keeping the last N characters visible.

Parameters:

  • pattern_name (Symbol)

    the pattern name

  • string (String)

    the string to search

  • char (String) (defaults to: '*')

    the masking character (default: ‘*’)

  • keep (Integer) (defaults to: 4)

    number of trailing characters to keep visible (default: 4)

Returns:

  • (String)

    the string with matches masked

Raises:

  • (Error)

    if the pattern name is not recognized



190
191
192
193
194
195
196
197
198
199
# File 'lib/philiprehberger/regex_lib.rb', line 190

# Mask every occurrence of a named pattern, leaving only the last +keep+
# characters of each match readable.
#
# A match that is not longer than +keep+ is masked completely, so short
# values are never shown in full. A +keep+ of zero (or a negative value)
# masks the whole match.
#
# @param pattern_name [Symbol] the pattern name
# @param string [String] the string to search
# @param char [String] the masking character
# @param keep [Integer] number of trailing characters to keep visible
# @return [String] a copy of the string with each match masked
# @raise [Error] if the pattern name is not recognized
def self.mask(pattern_name, string, char: '*', keep: 4)
  pat = unanchored_pattern(resolve_pattern!(pattern_name))

  string.gsub(pat) do |match|
    # Slice from an explicit start offset: a negative-index slice breaks for
    # keep == 0, because match[-0..] is match[0..] and would append the whole
    # unmasked match after the mask characters.
    visible = keep.positive? && keep < match.length ? keep : 0
    hidden = match.length - visible
    (char * hidden) + match[hidden..]
  end
end

.match?(pattern_name, string) ⇒ Boolean

Test whether a string matches a named pattern.

Parameters:

  • pattern_name (Symbol)

    the pattern name (e.g. :email, :url, :ipv4)

  • string (String)

    the string to test

Returns:

  • (Boolean)

    true if the string matches the pattern

Raises:

  • (Error)

    if the pattern name is not recognized



33
34
35
36
# File 'lib/philiprehberger/regex_lib.rb', line 33

# Report whether a string satisfies a named pattern.
#
# @param pattern_name [Symbol] the pattern name (e.g. :email, :url, :ipv4)
# @param string [String] the string to test
# @return [Boolean] true if the string matches the pattern
# @raise [Error] if the pattern name is not recognized
def self.match?(pattern_name, string)
  resolve_pattern!(pattern_name).match?(string)
end

.pattern(name) ⇒ Regexp

Look up a pattern by symbol name.

Parameters:

  • name (Symbol)

    the pattern name

Returns:

  • (Regexp)

    the pattern

Raises:

  • (Error)

    if the pattern name is not recognized



95
96
97
# File 'lib/philiprehberger/regex_lib.rb', line 95

# Fetch a pattern by name, checking the built-in table first and the
# custom pattern registry second.
#
# @param name [Symbol] the pattern name
# @return [Regexp] the pattern
# @raise [Error] if the name is in neither table
def self.pattern(name)
  return PATTERNS[name] if PATTERNS.key?(name)
  return @custom_patterns[name] if @custom_patterns.key?(name)

  raise Error, "Unknown pattern: #{name.inspect}"
end

.pattern_names ⇒ Array<Symbol>

List all built-in pattern names.

Returns the symbol names of every built-in pattern, sorted ascending. Custom patterns registered through combine are not included, even though pattern can look them up. A fresh array is returned on each call, so callers may mutate it freely without affecting the underlying pattern registry.

Returns:

  • (Array<Symbol>)

    sorted list of built-in pattern names



107
108
109
# File 'lib/philiprehberger/regex_lib.rb', line 107

# List the names of the built-in patterns in ascending order.
#
# Only the keys of the built-in table are returned, and a new array is
# built on every call.
#
# @return [Array<Symbol>] sorted list of built-in pattern names
def self.pattern_names
  PATTERNS.each_key.sort
end

.replace(pattern_name, string, replacement) ⇒ String

Replace the first match of a named pattern in a string.

Parameters:

  • pattern_name (Symbol)

    the pattern name

  • string (String)

    the string to search

  • replacement (String)

    the replacement text

Returns:

  • (String)

    the string with the first match replaced

Raises:

  • (Error)

    if the pattern name is not recognized



165
166
167
168
# File 'lib/philiprehberger/regex_lib.rb', line 165

# Substitute the first occurrence of a named pattern in a string.
#
# The replacement is handed to String#sub unchanged, so backslash
# sequences in it are interpreted the way String#sub interprets them.
#
# @param pattern_name [Symbol] the pattern name
# @param string [String] the string to search
# @param replacement [String] the replacement text
# @return [String] a copy of the string with the first match replaced
# @raise [Error] if the pattern name is not recognized
def self.replace(pattern_name, string, replacement)
  resolved = resolve_pattern!(pattern_name)
  string.sub(unanchored_pattern(resolved), replacement)
end

.replace_all(pattern_name, string, replacement) ⇒ String

Replace all matches of a named pattern in a string.

Parameters:

  • pattern_name (Symbol)

    the pattern name

  • string (String)

    the string to search

  • replacement (String)

    the replacement text

Returns:

  • (String)

    the string with all matches replaced

Raises:

  • (Error)

    if the pattern name is not recognized



177
178
179
180
# File 'lib/philiprehberger/regex_lib.rb', line 177

# Substitute every occurrence of a named pattern in a string.
#
# The replacement is handed to String#gsub unchanged, so backslash
# sequences in it are interpreted the way String#gsub interprets them.
#
# @param pattern_name [Symbol] the pattern name
# @param string [String] the string to search
# @param replacement [String] the replacement text
# @return [String] a copy of the string with all matches replaced
# @raise [Error] if the pattern name is not recognized
def self.replace_all(pattern_name, string, replacement)
  resolved = resolve_pattern!(pattern_name)
  string.gsub(unanchored_pattern(resolved), replacement)
end

.reset_custom_patterns! ⇒ Object

This method is part of a private API. You should avoid using this method if possible, as it may be removed or be changed in the future.

Reset custom patterns (useful for testing)



216
217
218
# File 'lib/philiprehberger/regex_lib.rb', line 216

# Replace the custom pattern registry with a new, empty hash.
#
# @api private
# @return [Hash] the new empty registry
def self.reset_custom_patterns!
  empty_registry = {}
  @custom_patterns = empty_registry
end

.scan(string) ⇒ Array<Hash>

Scan a string and return all recognized pattern matches.

Parameters:

  • string (String)

    the string to scan

Returns:

  • (Array<Hash>)

    array of hashes with :type, :value, :position



138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# File 'lib/philiprehberger/regex_lib.rb', line 138

# Run every built-in pattern over a string and collect what each one finds.
#
# Each pattern has its leading +\A+ and trailing +\z+ removed so it can
# match anywhere in the input. Zero-length matches are ignored: several
# patterns can match the empty string once unanchored and would otherwise
# report an empty hit at almost every offset.
#
# @param string [String] the string to scan
# @return [Array<Hash>] one hash per match with +:type+ (pattern name),
#   +:value+ (matched text) and +:position+ (a Range from the match's start
#   offset to its end offset), ordered by start offset; matches that start
#   at the same offset keep the order in which they were found
def self.scan(string)
  results = []

  PATTERNS.each do |name, pattern|
    source = pattern.source.delete_prefix('\A').delete_suffix('\z')
    unanchored = Regexp.new(source, pattern.options)

    string.scan(unanchored) do
      match = Regexp.last_match
      next if match[0].empty?

      results << {
        type: name,
        value: match[0],
        # NOTE(review): this range is inclusive although match.end(0) is the
        # offset just past the match; kept as-is for backward compatibility.
        position: match.begin(0)..match.end(0)
      }
    end
  end

  # sort_by is not stable, so the discovery index is used as a tie-breaker.
  results.sort_by.with_index { |result, index| [result[:position].begin, index] }
end

.validate(pattern_name, string) ⇒ Result

Validate a string against a named pattern with specific failure reasons.

Parameters:

  • pattern_name (Symbol)

    the pattern name

  • string (String)

    the string to validate

Returns:

  • (Result)

    result with valid? and error

Raises:

  • (Error)

    if the pattern name is not recognized



117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# File 'lib/philiprehberger/regex_lib.rb', line 117

# Validate a string against a named pattern and report why it failed.
#
# The name is resolved first so that an unknown pattern raises before any
# validation runs. :email, :url, :ipv4 and :semantic_version are handed to
# their dedicated validators; every other name goes to the generic one.
#
# @param pattern_name [Symbol] the pattern name
# @param string [String] the string to validate
# @return [Result] result with valid? and error
# @raise [Error] if the pattern name is not recognized
def self.validate(pattern_name, string)
  resolve_pattern!(pattern_name)

  case pattern_name
  when :email            then validate_email(string)
  when :url              then validate_url(string)
  when :ipv4             then validate_ipv4(string)
  when :semantic_version then validate_semantic_version(string)
  else generic_validate(pattern_name, string)
  end
end