Module: DataRedactor
- Defined in:
- lib/data_redactor.rb,
lib/data_redactor/version.rb,
ext/data_redactor/data_redactor.c
Defined Under Namespace
Classes: InvalidPatternError, UnknownTagError
Constant Summary collapse
- TAGS =
{ credentials: TAG_CREDENTIALS, financial: TAG_FINANCIAL, tax_id: TAG_TAX_ID, national_id: TAG_NATIONAL_ID, contact: TAG_CONTACT, network: TAG_NETWORK, travel: TAG_TRAVEL, other: TAG_OTHER, custom: TAG_CUSTOM }.freeze
- CAPTURE_GROUP_RE =
User-supplied capture groups shift the group indices the boundary wrapper relies on (groups [1] and [3]), so they cannot be combined with boundary: true.
/(?<!\\)\((?!\?:)/.freeze
- RUBY_ONLY_SYNTAX_RE =
Ruby regex syntax that has no POSIX ERE equivalent.
/\\[dDwWsShHbB]|\(\?[<!=]|\(\?<[a-zA-Z]|\(\?[imx]|[*+?]\?/.freeze
- PLACEHOLDER_DEFAULT =
"[REDACTED]"
- VERSION =
"0.5.0"
- PH_MODE_PLAIN =
Placeholder mode constants.
INT2NUM(PLACEHOLDER_MODE_PLAIN)
- PH_MODE_TAGGED =
INT2NUM(PLACEHOLDER_MODE_TAGGED)
- PH_MODE_HASH =
INT2NUM(PLACEHOLDER_MODE_HASH)
- TAG_CREDENTIALS =
Expose tag bitmask values so the Ruby wrapper can build the mask.
INT2NUM(TAG_CREDENTIALS)
- TAG_FINANCIAL =
INT2NUM(TAG_FINANCIAL)
- TAG_TAX_ID =
INT2NUM(TAG_TAX_ID)
- TAG_NATIONAL_ID =
INT2NUM(TAG_NATIONAL_ID)
- TAG_CONTACT =
INT2NUM(TAG_CONTACT)
- TAG_NETWORK =
INT2NUM(TAG_NETWORK)
- TAG_TRAVEL =
INT2NUM(TAG_TRAVEL)
- TAG_OTHER =
INT2NUM(TAG_OTHER)
- TAG_CUSTOM =
INT2NUM(TAG_CUSTOM)
- TAG_ALL =
INT2NUM(TAG_ALL)
Class Method Summary collapse
-
._add_pattern(rb_name, rb_source, rb_tag_bit, rb_boundary) ⇒ Object
DataRedactor._add_pattern(name, source, tag_bit, boundary) -> nil.
-
._clear_custom_patterns ⇒ Object
DataRedactor._clear_custom_patterns -> nil.
-
._custom_patterns ⇒ Object
DataRedactor._custom_patterns -> Array<Hash>.
-
._redact(rb_text, rb_mask, rb_ph_mode, rb_ph_str) ⇒ Object
DataRedactor._redact(text, mask, ph_mode, ph_str) -> String.
-
._remove_pattern(rb_name) ⇒ Object
DataRedactor._remove_pattern(name) -> true/false.
-
._scan(rb_text, rb_mask) ⇒ Object
DataRedactor._scan(text, mask) -> Hash.
-
.add_pattern(name:, regex:, tag: :custom, boundary: false) ⇒ Object
Add (or replace) a custom redaction pattern.
- .bits_for(tag_list) ⇒ Object
- .clear_custom_patterns! ⇒ Object
- .custom_patterns ⇒ Object
- .redact(text, only: nil, except: nil, placeholder: PLACEHOLDER_DEFAULT) ⇒ Object
- .remove_pattern(name) ⇒ Object
-
.resolve_placeholder(placeholder) ⇒ Object
Returns [ph_mode_int, ph_str] for the C layer.
-
.scan(text, only: nil, except: nil) ⇒ Object
Scan text without necessarily redacting it.
- .tags ⇒ Object
Class Method Details
._add_pattern(rb_name, rb_source, rb_tag_bit, rb_boundary) ⇒ Object
DataRedactor._add_pattern(name, source, tag_bit, boundary) -> nil
Compile `source` as POSIX ERE (with the boundary wrapper when boundary=1) and store it under `name`. Replaces any existing pattern with the same name. Raises DataRedactor::InvalidPatternError on regcomp failure.
550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 |
# File 'ext/data_redactor/data_redactor.c', line 550
/*
 * DataRedactor._add_pattern(name, source, tag_bit, boundary) -> nil
 *
 * Compiles `source` as a POSIX extended regular expression (passed through
 * wrap_boundary() first when `boundary` is non-zero) and stores it in the
 * custom pattern table under `name`. An existing entry with the same name is
 * replaced in place; otherwise the entry is appended, growing the table when
 * it is full.
 *
 * Raises:
 *   TypeError                         - name/source is not a String
 *   DataRedactor::InvalidPatternError - regcomp() rejected the pattern
 *   NoMemoryError                     - any allocation failed
 *
 * Every allocation that can fail is performed BEFORE the table is touched, so
 * a failure never leaves a half-initialised entry behind. rb_raise() does not
 * return, so every resource owned at that point is released before calling it.
 */
static VALUE rb_add_pattern(VALUE self, VALUE rb_name, VALUE rb_source,
                            VALUE rb_tag_bit, VALUE rb_boundary) {
    Check_Type(rb_name, T_STRING);
    Check_Type(rb_source, T_STRING);
    const char *name   = StringValueCStr(rb_name);
    const char *source = StringValueCStr(rb_source);
    int tag_bit  = NUM2INT(rb_tag_bit);
    int boundary = NUM2INT(rb_boundary);

    /* Build the pattern string (wrap boundary if requested). */
    char *wrapped = NULL;
    const char *pat_to_compile = source;
    if (boundary) {
        wrapped = wrap_boundary(source);
        if (!wrapped) rb_raise(rb_eNoMemError, "wrap_boundary allocation failed");
        pat_to_compile = wrapped;
    }

    regex_t compiled;
    int ret = regcomp(&compiled, pat_to_compile, REG_EXTENDED);
    free(wrapped); /* only needed for compilation; free(NULL) is a no-op */
    if (ret != 0) {
        char errbuf[256];
        regerror(ret, &compiled, errbuf, sizeof(errbuf));
        /* No regfree() here: after a failed regcomp() the contents of
         * `compiled` are undefined, so there is nothing valid to release. */
        VALUE eClass = rb_const_get(rb_define_module("DataRedactor"),
                                    rb_intern("InvalidPatternError"));
        rb_raise(eClass, "%s", errbuf);
    }

    /* Take private copies first so that an allocation failure cannot corrupt
     * the table or destroy the entry that is about to be replaced. */
    char *name_copy   = strdup(name);
    char *source_copy = strdup(source);
    if (!name_copy || !source_copy) {
        free(name_copy);
        free(source_copy);
        regfree(&compiled);
        rb_raise(rb_eNoMemError, "strdup failed");
    }

    /* Replace existing or append. */
    int idx = find_custom_by_name(name);
    if (idx >= 0) {
        free_custom_at(idx);
    } else {
        if (custom_count >= custom_cap) {
            int new_cap = custom_cap == 0 ? 8 : custom_cap * 2;
            custom_pattern_t *tmp = (custom_pattern_t *)realloc(
                custom_patterns, sizeof(custom_pattern_t) * (size_t)new_cap);
            if (!tmp) {
                /* The old table is still valid and untouched. */
                free(name_copy);
                free(source_copy);
                regfree(&compiled);
                rb_raise(rb_eNoMemError, "custom_patterns realloc failed");
            }
            custom_patterns = tmp;
            custom_cap      = new_cap;
        }
        idx = custom_count++;
    }

    /* Nothing below can fail, so the entry is always fully initialised. */
    custom_patterns[idx].name     = name_copy;
    custom_patterns[idx].source   = source_copy;
    custom_patterns[idx].compiled = compiled;
    custom_patterns[idx].tag      = tag_bit;
    custom_patterns[idx].boundary = boundary;
    return Qnil;
}
|
._clear_custom_patterns ⇒ Object
DataRedactor._clear_custom_patterns -> nil
642 643 644 645 646 647 648 |
# File 'ext/data_redactor/data_redactor.c', line 642
/*
 * DataRedactor._clear_custom_patterns -> nil
 *
 * Calls free_custom_at() on every slot currently in use, then resets the
 * entry count to zero. The table allocation itself is left in place.
 */
static VALUE rb_clear_custom_patterns(VALUE self) {
    int slot = 0;
    while (slot < custom_count) {
        free_custom_at(slot);
        slot++;
    }
    custom_count = 0;
    return Qnil;
}
|
._custom_patterns ⇒ Object
DataRedactor._custom_patterns -> Array<Hash>
Returns [{ name:, source:, tag_bit:, boundary: }, …] — one Hash per custom pattern.
655 656 657 658 659 660 661 662 663 664 665 666 |
# File 'ext/data_redactor/data_redactor.c', line 655
/*
 * DataRedactor._custom_patterns -> Array<Hash>
 *
 * Builds a fresh Ruby Array holding one Hash per custom pattern, in table
 * order. Each Hash has the Symbol keys :name and :source (new Ruby Strings
 * copied from the stored C strings), :tag_bit (Integer) and :boundary
 * (true/false).
 */
static VALUE rb_custom_patterns(VALUE self) {
    VALUE list = rb_ary_new_capa(custom_count);
    int n;
    for (n = 0; n < custom_count; ++n) {
        const custom_pattern_t *entry = &custom_patterns[n];
        VALUE info = rb_hash_new();
        VALUE name_str = rb_str_new_cstr(entry->name);
        rb_hash_aset(info, ID2SYM(rb_intern("name")), name_str);
        VALUE source_str = rb_str_new_cstr(entry->source);
        rb_hash_aset(info, ID2SYM(rb_intern("source")), source_str);
        rb_hash_aset(info, ID2SYM(rb_intern("tag_bit")), INT2NUM(entry->tag));
        /* The stored int flag is exposed as a Ruby boolean. */
        VALUE bounded = Qfalse;
        if (entry->boundary) bounded = Qtrue;
        rb_hash_aset(info, ID2SYM(rb_intern("boundary")), bounded);
        rb_ary_push(list, info);
    }
    return list;
}
|
._redact(rb_text, rb_mask, rb_ph_mode, rb_ph_str) ⇒ Object
DataRedactor._redact(text, mask, ph_mode, ph_str) -> String
`mask` — integer bitmask of TAG_* values (only / except filtering). `ph_mode` — 0 = plain string, 1 = tagged "[REDACTED:TAG]", 2 = hash "[TAG_xxxx]". `ph_str` — the plain string for mode 0; ignored for modes 1 and 2.
The Ruby wrapper builds all four arguments and is the public API.
796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 |
# File 'ext/data_redactor/data_redactor.c', line 796
/*
 * DataRedactor._redact(text, mask, ph_mode, ph_str) -> String
 *
 * Runs every built-in pattern, then every custom pattern, whose tag
 * intersects `mask` over a private copy of `text`. Each pass feeds the
 * output of the previous one into replace_all_matches(). In plain mode the
 * placeholder text is `ph_str`; in any other mode it is the name returned by
 * tag_name_for_bit() for the pattern's tag.
 *
 * Raises TypeError when text/ph_str are not Strings and NoMemoryError when
 * an allocation fails.
 */
static VALUE rb_data_redactor_redact(VALUE self, VALUE rb_text, VALUE rb_mask,
                                     VALUE rb_ph_mode, VALUE rb_ph_str) {
    Check_Type(rb_text, T_STRING);
    Check_Type(rb_ph_str, T_STRING);

    const int tag_mask = NUM2INT(rb_mask);
    const int mode = NUM2INT(rb_ph_mode);
    const char *plain_text = StringValueCStr(rb_ph_str);
    const char *source_text = StringValueCStr(rb_text);
    const int is_plain = (mode == PLACEHOLDER_MODE_PLAIN);

    char *buffer = strdup(source_text);
    if (buffer == NULL) rb_raise(rb_eNoMemError, "strdup failed");

    placeholder_t ph;
    ph.mode = mode;

    /* Built-in patterns first. */
    int k;
    for (k = 0; k < NUM_PATTERNS; ++k) {
        if (pattern_tags[k] & tag_mask) {
            if (is_plain) {
                ph.str = plain_text;
            } else {
                ph.str = tag_name_for_bit(pattern_tags[k]);
            }
            char *next = replace_all_matches(&compiled_patterns[k], buffer,
                                             boundary_wrapped[k], &ph);
            /* The previous buffer is released before the failure check. */
            free(buffer);
            if (next == NULL) rb_raise(rb_eNoMemError, "replace_all_matches allocation failed");
            buffer = next;
        }
    }

    /* Then the custom patterns, in table order. */
    for (k = 0; k < custom_count; ++k) {
        if (custom_patterns[k].tag & tag_mask) {
            if (is_plain) {
                ph.str = plain_text;
            } else {
                ph.str = tag_name_for_bit(custom_patterns[k].tag);
            }
            char *next = replace_all_matches(&custom_patterns[k].compiled, buffer,
                                             custom_patterns[k].boundary, &ph);
            free(buffer);
            if (next == NULL) rb_raise(rb_eNoMemError, "replace_all_matches allocation failed (custom)");
            buffer = next;
        }
    }

    VALUE redacted = rb_str_new_cstr(buffer);
    free(buffer);
    return redacted;
}
|
._remove_pattern(rb_name) ⇒ Object
DataRedactor._remove_pattern(name) -> true/false
Remove the named custom pattern. Returns true if found and removed.
621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 |
# File 'ext/data_redactor/data_redactor.c', line 621
/*
 * DataRedactor._remove_pattern(name) -> true/false
 *
 * Looks `name` up with find_custom_by_name(). Returns false when there is no
 * such entry; otherwise releases it with free_custom_at(), closes the gap by
 * moving every later entry down one slot, decrements the count and returns
 * true. Raises TypeError when `name` is not a String.
 */
static VALUE rb_remove_pattern(VALUE self, VALUE rb_name) {
    Check_Type(rb_name, T_STRING);
    const char *wanted = StringValueCStr(rb_name);

    const int found = find_custom_by_name(wanted);
    if (found < 0) {
        return Qfalse;
    }

    free_custom_at(found);

    /* Compact the table: slide the tail left over the freed slot. */
    int dst = found;
    while (dst < custom_count - 1) {
        custom_patterns[dst] = custom_patterns[dst + 1];
        dst++;
    }
    custom_count--;
    return Qtrue;
}
|
._scan(rb_text, rb_mask) ⇒ Object
DataRedactor._scan(text, mask) -> Hash
Returns { redacted: String, matches: Array<Hash> } where each match hash is:
{ tag: Symbol, name: String, value: String, start: Integer, length: Integer }
Matches are reported in the order they are consumed by the sequential redaction loop (built-ins first, most-specific to most-generic; then custom patterns). `start` and `length` refer to byte positions in the original input string. Because patterns run sequentially on a shrinking/expanding working buffer, positions are tracked relative to the original by maintaining a running offset.
853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 |
# File 'ext/data_redactor/data_redactor.c', line 853
/*
 * DataRedactor._scan(text, mask) -> Hash
 *
 * Returns { redacted: String, matches: Array<Hash> }. Every built-in pattern,
 * then every custom pattern, whose tag intersects `mask` is run in turn over
 * a working copy of `text`. For each pattern the matches are first collected
 * (tag, name, value, start, length) and then the working buffer is rewritten
 * by replace_all_matches() using the plain "[REDACTED]" placeholder.
 *
 * `start` is reported relative to the ORIGINAL input. To translate a position
 * in the current working buffer back to the original, a log of all applied
 * replacements is kept. Each entry holds where the placeholder starts in the
 * CURRENT working buffer (wpos) and how many original bytes it replaced
 * (orig_len). For a match at working position W:
 *
 *     original_pos = W - sum(ph_len - orig_len) over applied entries
 *                                                with wpos <= W
 *
 * Two invariants keep that formula valid:
 *   1. While a pattern's matches are being collected, only entries from
 *      earlier passes are consulted; this pass's replacements have not been
 *      applied to `working` yet.
 *   2. After replace_all_matches() has rewritten the buffer, every entry is
 *      rebased into the new buffer's coordinates.
 *
 * NOTE(review): this assumes replace_all_matches() replaces exactly the spans
 * found by the collection loop below with the placeholder text; that helper
 * is defined elsewhere, so confirm against it.
 *
 * Raises TypeError when `text` is not a String and NoMemoryError when an
 * allocation fails.
 */
static VALUE rb_data_redactor_scan(VALUE self, VALUE rb_text, VALUE rb_mask) {
    Check_Type(rb_text, T_STRING);
    int mask = NUM2INT(rb_mask);
    const char *input = StringValueCStr(rb_text);

    /* Working buffer — we redact with the default plain placeholder so the
     * scan result also contains the redacted string. */
    static const placeholder_t ph_default = { PLACEHOLDER_MODE_PLAIN, "[REDACTED]" };
    /* Number of bytes each replacement inserts. */
    const long ph_len = (long)strlen(ph_default.str);

    char *working = strdup(input);
    if (!working) rb_raise(rb_eNoMemError, "strdup failed");

    VALUE matches_arr = rb_ary_new();

    typedef struct { long wpos; long orig_len; } repl_t;
    repl_t *repl_log = NULL;
    int repl_count = 0;
    int repl_cap = 0;

/* Append one entry to the replacement log, growing it geometrically. */
#define REPL_LOG_PUSH(_wpos, _olen) do { \
    if (repl_count >= repl_cap) { \
        int _nc = repl_cap == 0 ? 16 : repl_cap * 2; \
        repl_t *_t = (repl_t *)realloc(repl_log, sizeof(repl_t) * _nc); \
        if (!_t) { free(repl_log); free(working); rb_raise(rb_eNoMemError, "repl_log"); } \
        repl_log = _t; repl_cap = _nc; \
    } \
    repl_log[repl_count].wpos = (_wpos); \
    repl_log[repl_count].orig_len = (_olen); \
    repl_count++; \
} while (0)

/* Rebase the log after a pass. Entries [_first_new, repl_count) were pushed
 * during the pass in ascending position order, in pre-replacement
 * coordinates. Older entries move right by the size change of every new
 * replacement in front of them; each new entry moves right by the size change
 * of the new replacements that precede it. */
#define REPL_LOG_REBASE(_first_new) do { \
    for (int _oi = 0; _oi < (_first_new); _oi++) { \
        long _delta = 0; \
        for (int _ni = (_first_new); _ni < repl_count; _ni++) { \
            if (repl_log[_ni].wpos < repl_log[_oi].wpos) \
                _delta += ph_len - repl_log[_ni].orig_len; \
        } \
        repl_log[_oi].wpos += _delta; \
    } \
    long _run = 0; \
    for (int _ni = (_first_new); _ni < repl_count; _ni++) { \
        repl_log[_ni].wpos += _run; \
        _run += ph_len - repl_log[_ni].orig_len; \
    } \
} while (0)

/* Collect matches for one pattern on the current working buffer, translate
 * positions to original coordinates, then do the replacement. */
#define COLLECT_AND_REPLACE(pat, use_bnd, tag_bit, pat_name) do { \
    const char *_cur = working; \
    /* Entries [0, _applied) are already reflected in `working`. */ \
    const int _applied = repl_count; \
    regmatch_t _m[4]; \
    while (regexec((pat), _cur, 4, _m, 0) == 0) { \
        regoff_t _fso = _m[0].rm_so, _feo = _m[0].rm_eo; \
        if (_fso < 0 || _feo < _fso) break; \
        regoff_t _cso = _fso, _ceo = _feo; \
        /* With a boundary wrapper, groups 1 and 3 are the guard */ \
        /* characters; trim them off to get the core match.      */ \
        if (use_bnd) { \
            if (_m[1].rm_so >= 0 && _m[1].rm_eo > _m[1].rm_so) \
                _cso = _m[1].rm_eo; \
            if (_m[3].rm_so >= 0 && _m[3].rm_eo > _m[3].rm_so) \
                _ceo = _m[3].rm_so; \
        } \
        size_t _vlen = (size_t)(_ceo - _cso); \
        long _wpos = (long)(_cur - working) + (long)_cso; \
        /* Map to the original string using applied entries only. */ \
        long _shift = 0; \
        for (int _ri = 0; _ri < _applied; _ri++) { \
            if (repl_log[_ri].wpos <= _wpos) \
                _shift += ph_len - repl_log[_ri].orig_len; \
        } \
        long _orig = _wpos - _shift; \
        VALUE _match = rb_hash_new(); \
        rb_hash_aset(_match, ID2SYM(rb_intern("tag")), \
                     ID2SYM(rb_intern(tag_name_for_bit(tag_bit)))); \
        rb_hash_aset(_match, ID2SYM(rb_intern("name")), \
                     rb_str_new_cstr(pat_name)); \
        rb_hash_aset(_match, ID2SYM(rb_intern("value")), \
                     rb_str_new(_cur + _cso, _vlen)); \
        rb_hash_aset(_match, ID2SYM(rb_intern("start")), \
                     LONG2NUM(_orig)); \
        rb_hash_aset(_match, ID2SYM(rb_intern("length")), \
                     LONG2NUM((long)_vlen)); \
        rb_ary_push(matches_arr, _match); \
        /* Logged in pre-replacement coordinates; rebased after the pass. */ \
        REPL_LOG_PUSH(_wpos, (long)_vlen); \
        /* Re-anchor cursor: skip past the full match in working buf; */ \
        /* an empty match advances one byte to guarantee progress.    */ \
        if (_feo == _fso) { if (*_cur) _cur++; else break; } \
        else _cur += _feo; \
    } \
    char *_next = replace_all_matches((pat), working, (use_bnd), &ph_default); \
    free(working); \
    if (!_next) { free(repl_log); rb_raise(rb_eNoMemError, "replace_all_matches failed in scan"); } \
    working = _next; \
    REPL_LOG_REBASE(_applied); \
} while (0)

    for (int i = 0; i < NUM_PATTERNS; i++) {
        if ((pattern_tags[i] & mask) == 0) continue;
        COLLECT_AND_REPLACE(&compiled_patterns[i], boundary_wrapped[i],
                            pattern_tags[i], pattern_names[i]);
    }
    for (int i = 0; i < custom_count; i++) {
        if ((custom_patterns[i].tag & mask) == 0) continue;
        COLLECT_AND_REPLACE(&custom_patterns[i].compiled,
                            custom_patterns[i].boundary,
                            custom_patterns[i].tag, custom_patterns[i].name);
    }

#undef COLLECT_AND_REPLACE
#undef REPL_LOG_REBASE
#undef REPL_LOG_PUSH

    free(repl_log);

    VALUE result = rb_hash_new();
    VALUE rb_redacted = rb_str_new_cstr(working);
    free(working);
    rb_hash_aset(result, ID2SYM(rb_intern("redacted")), rb_redacted);
    rb_hash_aset(result, ID2SYM(rb_intern("matches")), matches_arr);
    return result;
}
|
.add_pattern(name:, regex:, tag: :custom, boundary: false) ⇒ Object
Add (or replace) a custom redaction pattern.
name: unique identifier string.
regex: String (POSIX ERE) or Regexp; Ruby-only syntax raises InvalidPatternError.
tag: one of the TAGS keys (default :custom).
boundary: wrap with word-boundary guards; incompatible with capture groups.
81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 |
# File 'lib/data_redactor.rb', line 81
# Add (or replace) a custom redaction pattern.
#
# @param name [String] non-empty identifier for the pattern
# @param regex [String, Regexp] POSIX ERE source; for a Regexp only its
#   +source+ is forwarded, so Regexp options (e.g. /i) are not carried over
# @param tag [Symbol] one of the TAGS keys (default :custom)
# @param boundary [Boolean] wrap the pattern with boundary guards; cannot be
#   combined with capture groups
# @raise [ArgumentError] if name is not a non-empty String, or regex is
#   neither a String nor a Regexp
# @raise [InvalidPatternError] if the source uses Ruby-only regex syntax, or
#   has capture groups while boundary is truthy
# @raise [UnknownTagError] if tag is not a key of TAGS
# @return whatever the C-level +_add_pattern+ returns
def add_pattern(name:, regex:, tag: :custom, boundary: false)
  unless name.is_a?(String) && !name.empty?
    raise ArgumentError, "name must be a non-empty String"
  end

  source =
    case regex
    when String then regex
    when Regexp then regex.source
    else raise ArgumentError, "regex must be a String or Regexp, got #{regex.class}"
    end

  # Use the MatchData directly instead of the $& global.
  if (offending = RUBY_ONLY_SYNTAX_RE.match(source))
    raise InvalidPatternError,
          "pattern #{name.inspect} uses Ruby-only syntax (#{offending[0].inspect}); " \
          "use POSIX ERE — no \\d, \\s, \\w, \\b, lookaround, non-greedy, or named groups"
  end

  if boundary && CAPTURE_GROUP_RE.match?(source)
    raise InvalidPatternError,
          "pattern #{name.inspect} has capture groups and cannot use boundary: true"
  end

  tag_bit = TAGS.fetch(tag) do
    raise UnknownTagError, "unknown tag #{tag.inspect}; valid tags: #{TAGS.keys.inspect}"
  end

  # The C layer expects the boundary flag as an Integer (1 or 0).
  _add_pattern(name, source, tag_bit, boundary ? 1 : 0)
end
.bits_for(tag_list) ⇒ Object
123 124 125 126 127 128 129 |
# File 'lib/data_redactor.rb', line 123
# Combine the bit values of the given tags into a single bitmask.
#
# @param tag_list [Symbol, Array<Symbol>, nil] one tag or a list of tags;
#   coerced with Array(), so nil yields an empty mask (0)
# @raise [UnknownTagError] if any tag is not a key of TAGS
# @return [Integer] bitwise OR of the TAGS values for every listed tag
def bits_for(tag_list)
  Array(tag_list).reduce(0) do |mask, tag|
    bit = TAGS.fetch(tag) do
      raise UnknownTagError, "unknown tag #{tag.inspect}; valid tags: #{TAGS.keys.inspect}"
    end
    mask | bit
  end
end
.clear_custom_patterns! ⇒ Object
119 120 121 |
# File 'lib/data_redactor.rb', line 119
# Remove every custom pattern by delegating to the C-level
# +_clear_custom_patterns+, whose return value is passed through.
def clear_custom_patterns!
  _clear_custom_patterns
end
.custom_patterns ⇒ Object
112 113 114 115 116 117 |
# File 'lib/data_redactor.rb', line 112
# List the registered custom patterns.
#
# Each Hash from the C-level +_custom_patterns+ is rebuilt with its :tag_bit
# translated back to the matching TAGS key (:custom when no key has that
# value).
#
# @return [Array<Hash>] hashes with :name, :source, :tag and :boundary
def custom_patterns
  _custom_patterns.map do |entry|
    tag_name = TAGS.key(entry[:tag_bit]) || :custom
    {
      name: entry[:name],
      source: entry[:source],
      tag: tag_name,
      boundary: entry[:boundary]
    }
  end
end
.redact(text, only: nil, except: nil, placeholder: PLACEHOLDER_DEFAULT) ⇒ Object
34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
# File 'lib/data_redactor.rb', line 34
# Redact +text+, optionally restricted to (or excluding) some tags.
#
# @param text the text handed to the C-level +_redact+
# @param only tag or tags to redact exclusively (see bits_for)
# @param except tag or tags to leave alone (see bits_for)
# @param placeholder a String, :tagged or :hash (see resolve_placeholder)
# @raise [ArgumentError] if both only: and except: are given
# @return the result of +_redact+
def redact(text, only: nil, except: nil, placeholder: PLACEHOLDER_DEFAULT)
  raise ArgumentError, "pass only: or except:, not both" if only && except

  # At most one of only/except is set past the guard above.
  mask = TAG_ALL
  mask = bits_for(only) if only
  mask = TAG_ALL & ~bits_for(except) if except

  mode_and_text = resolve_placeholder(placeholder)
  _redact(text, mask, mode_and_text[0], mode_and_text[1])
end
.remove_pattern(name) ⇒ Object
108 109 110 |
# File 'lib/data_redactor.rb', line 108
# Remove a custom pattern by name.
#
# @param name the pattern name; converted with +to_s+ before being passed on
# @return the result of the C-level +_remove_pattern+
def remove_pattern(name)
  pattern_name = name.to_s
  _remove_pattern(pattern_name)
end
.resolve_placeholder(placeholder) ⇒ Object
Returns [ph_mode_int, ph_str] for the C layer.
placeholder: "***" -> plain string
placeholder: :tagged -> "[REDACTED:TAGNAME]"
placeholder: :hash -> "[TAGNAME_xxxx]"
135 136 137 138 139 140 141 142 143 144 |
# File 'lib/data_redactor.rb', line 135
# Translate the public +placeholder+ option into the pair passed to the
# C layer.
#
# @param placeholder [String, Symbol] a literal String, :tagged or :hash
# @raise [ArgumentError] for any other value
# @return [Array] two elements: the PH_MODE_* constant, then the String
#   itself for a String placeholder or "" for :tagged / :hash
def resolve_placeholder(placeholder)
  return [PH_MODE_PLAIN, placeholder] if placeholder.is_a?(String)

  symbolic_modes = { tagged: PH_MODE_TAGGED, hash: PH_MODE_HASH }
  mode = symbolic_modes[placeholder]
  return [mode, ""] if mode

  raise ArgumentError, "placeholder must be a String, :tagged, or :hash — got #{placeholder.inspect}"
end
.scan(text, only: nil, except: nil) ⇒ Object
Scan text without necessarily redacting it.
Returns { redacted: String, matches: [{ tag:, name:, value:, start:, length: }, …] }. The :tag value is a Symbol matching one of DataRedactor.tags. :start and :length are byte offsets into the original string.
55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 |
# File 'lib/data_redactor.rb', line 55
# Scan +text+ and report what matched, together with the redacted string.
#
# @param text the text handed to the C-level +_scan+
# @param only tag or tags to scan for exclusively (see bits_for)
# @param except tag or tags to skip (see bits_for)
# @raise [ArgumentError] if both only: and except: are given
# @return the Hash from +_scan+, with each entry of its :matches list
#   updated in place so that :tag is a lower-case Symbol
def scan(text, only: nil, except: nil)
  raise ArgumentError, "pass only: or except:, not both" if only && except

  # At most one of only/except is set past the guard above.
  mask = TAG_ALL
  mask = bits_for(only) if only
  mask = TAG_ALL & ~bits_for(except) if except

  _scan(text, mask).tap do |outcome|
    # Normalise the tag reported by the C layer to the lower-case Symbol form.
    outcome[:matches].each do |match|
      match[:tag] = match[:tag].to_s.downcase.to_sym
    end
  end
end
.tags ⇒ Object
30 31 32 |
# File 'lib/data_redactor.rb', line 30
# List the tag names accepted by the tag-related options.
#
# @return [Array<Symbol>] the keys of TAGS, in definition order
def tags
  TAGS.keys
end