Module: Dejunk
Constant Summary collapse
- MASH_CHARS =
All characters on the middle (home) row of a QWERTY keyboard, in both upper and lower case, plus the space character
'ASDFGHJKLasdfghjkl;: '
- MASH_BIGRAMS =
All neighboring key pairs on a QWERTY keyboard (except “er” and “re”, which each make up more than 1% of bigrams in our “good” sample), plus each letter doubled or paired with a space; every bigram is included in both orders
(
  # Every lowercase letter followed by a space, and every letter doubled.
  ('a'..'z').flat_map { |letter| ["#{letter} ", letter * 2] } +
  # Pairs of keys that sit next to each other, grouped here by keyboard row
  # and then by column.
  # NOTE(review): "xd" sits where the bottom-row run (zx .. cv) would have
  # "xc" -- confirm whether that is intentional before changing it.
  %w[
    qw we rt ty yu ui op
    as sd df fg gh hj jk kl
    zx xd cv vb bn nm
    qa az ws sx ed dc rf fv tg gb yh hn uj jm ik ol
  ]
).flat_map { |pair| [pair, pair.reverse] }.to_set.freeze
- VERSION =
"0.6.0"
Instance Method Summary collapse
-
#bigram_similarity_to_corpus(string) ⇒ Object
Cosine similarity between vector of frequencies of bigrams within string, and vector of frequencies of all bigrams within corpus.
-
#bigram_similarity_to_mashing(string) ⇒ Object
Cosine similarity between vector of frequencies of bigrams within string, and vector which assumes all bigrams made of neighboring pairs on the keyboard are equally likely, and no others appear.
- #bigrams(string) ⇒ Object
- #is_junk?(string, min_alnum_chars: 3, whitelist_regexes: [], whitelist_strings: []) ⇒ Boolean
- #normalize_for_comparison(string) ⇒ Object
-
#probability_of_keyboard_mashing(string, apriori_probability_of_mashing: 0.1) ⇒ Object
The Bayesian probability of a string being keyboard mashing, given the probability of each bigram if drawn either from the legit corpus or from mashing, and an a priori probability of mashing.
Instance Method Details
#bigram_similarity_to_corpus(string) ⇒ Object
Cosine similarity between vector of frequencies of bigrams within string, and vector of frequencies of all bigrams within corpus
76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
# File 'lib/dejunk.rb', line 76
# Cosine similarity between the vector of bigram frequencies within +string+
# and the vector of bigram frequencies of the corpus.
#
# @param string [String, nil] text to score
# @return [Float] numerator / denominator of the cosine formula, or 0.0 when
#   +string+ yields no bigrams
def bigram_similarity_to_corpus(string)
  string_bigrams = bigrams(string)

  # Without bigrams both sums below would be nil and `nil ** 0.5` would raise
  # NoMethodError, so report "no similarity" explicitly.
  return 0.0 if string_bigrams.empty?

  total = string_bigrams.length
  counts = string_bigrams.each_with_object(Hash.new(0)) { |bigram, tally| tally[bigram] += 1 }
  freqs = counts.each_with_object({}) do |(bigram, count), relative|
    relative[bigram] = count.to_f / total
  end

  # `to_f` turns a nil lookup result into 0.0.
  # (presumably a bigram absent from the corpus table returns nil -- verify)
  numerator = freqs.
    map { |bigram, freq| corpus_bigram_frequencies[bigram].to_f * freq }.
    inject(&:+)

  string_magnitude = freqs.values.map { |v| v**2 }.inject(&:+) ** 0.5
  denominator = corpus_bigram_magnitude * string_magnitude

  numerator / denominator
end
#bigram_similarity_to_mashing(string) ⇒ Object
Cosine similarity between vector of frequencies of bigrams within string, and vector which assumes all bigrams made of neighboring pairs on the keyboard are equally likely, and no others appear
95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
# File 'lib/dejunk.rb', line 95
# Cosine similarity between the vector of bigram frequencies within +string+
# and the keyboard-mashing bigram frequency vector.
#
# @param string [String, nil] text to score
# @return [Float] numerator / denominator of the cosine formula, or 0.0 when
#   +string+ yields no bigrams
def bigram_similarity_to_mashing(string)
  string_bigrams = bigrams(string)

  # Without bigrams both sums below would be nil and `nil ** 0.5` would raise
  # NoMethodError, so report "no similarity" explicitly.
  return 0.0 if string_bigrams.empty?

  total = string_bigrams.length
  counts = string_bigrams.each_with_object(Hash.new(0)) { |bigram, tally| tally[bigram] += 1 }
  freqs = counts.each_with_object({}) do |(bigram, count), relative|
    relative[bigram] = count.to_f / total
  end

  # `to_f` turns a nil lookup result into 0.0.
  # (presumably a non-mashing bigram returns nil from the table -- verify)
  numerator = freqs.
    map { |bigram, freq| freq * mashing_bigram_frequencies[bigram].to_f }.
    inject(&:+)

  string_magnitude = freqs.values.map { |v| v**2 }.inject(&:+) ** 0.5
  denominator = mashing_bigram_magnitude * string_magnitude

  numerator / denominator
end
#bigrams(string) ⇒ Object
110 111 112 113 114 115 116 117 118 119 120 121 122 123 |
# File 'lib/dejunk.rb', line 110
# Splits +string+ into its overlapping two-character sequences (bigrams).
#
# The string is stripped first. Each character is downcased, every ASCII
# digit is replaced by "0" and every whitespace character by a single space.
#
# @param string [String, nil] text to split
# @return [Array<String>] bigrams in order of appearance; empty when +string+
#   is nil or has fewer than two characters after stripping
def bigrams(string)
  return [] if string.nil?

  stripped = string.strip
  return [] if stripped.length < 2

  # each_cons(2) yields exactly the adjacent pairs, so no nil padding has to
  # be filtered out afterwards and the normalisation runs in a single pass.
  stripped.chars.each_cons(2).map do |first, second|
    "#{first.downcase}#{second.downcase}".
      gsub(/[0-9]/, '0'.freeze).
      gsub(/[[:space:]]/, ' '.freeze)
  end
end
#is_junk?(string, min_alnum_chars: 3, whitelist_regexes: [], whitelist_strings: []) ⇒ Boolean
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
# File 'lib/dejunk.rb', line 22
# Decides whether +string+ looks like junk.
#
# @param string [String, nil] the value to inspect
# @param min_alnum_chars [Integer] passed to the too-short check
# @param whitelist_regexes [Array<Regexp>] a string matching any of these is
#   never junk
# @param whitelist_strings [Array<String>] a string included here is never junk
# @return [false, Symbol] false when no check fires, otherwise the name of the
#   first check that did: :no_alpha, :too_short, :one_char_repeat,
#   :starts_with_punct, :too_many_short_words, :three_chars_repeat_twice,
#   :fuck, :missing_vowels, :asdf_row, :mashing_bigrams or :unlikely_bigrams
def is_junk?(string, min_alnum_chars: 3, whitelist_regexes: [], whitelist_strings: [])
  # Whitelisted values short-circuit every other check.
  if string
    return false if whitelist_strings.include?(string)
    return false if whitelist_regexes.any? { |pattern| string =~ pattern }
  end

  return :no_alpha if string.nil? || string !~ /[[:alpha:]]/

  normed = normalize_for_comparison(string)

  return :too_short if too_few_alphanumeric_chars?(normed, min_alnum_chars)
  return :one_char_repeat if excessive_single_character_repeats?(string, normed)
  return :starts_with_punct if starts_with_disallowed_punctuation?(string)
  return :too_many_short_words if too_many_short_words?(string)
  return :three_chars_repeat_twice if three_plus_chars_repeat_twice?(string)
  return :fuck if string =~ /\bfuck/i
  return :missing_vowels if missing_vowels?(string, normed)
  return :asdf_row if asdf_row_and_suspicious?(string)

  length = string.length
  ascii_proportion = string.chars.count { |c| c.ord < 128 }.to_f / length
  mostly_ascii = ascii_proportion > 0.8

  # The bigrams look like the ones you'd get from keyboard mashing
  # (the probability shouldn't be taken too literally, > 0.25 is almost all
  # mashing in practice on our corpus)
  if mostly_ascii && length > 1 && probability_of_keyboard_mashing(string) > 0.25
    return :mashing_bigrams
  end

  # The remaining checks compare against the bigrams of legitimate strings
  # and only apply to longer, mostly-ASCII input.
  return false unless mostly_ascii && length > 6

  corpus_similarity = bigram_similarity_to_corpus(string)

  # The similarity is more accurate for longer strings, and with more ASCII,
  # so increase the value (= lower the threshold) for shorter strings and
  # strings with less ASCII.
  ascii_adjustment = 1.0 / ascii_proportion**2
  length_adjustment = 1.0 / (1 - Math.exp(-0.1 * length))
  score = corpus_similarity * ascii_adjustment * length_adjustment

  return :unlikely_bigrams if score < 0.03

  # The similarity ignores casing, so instead use a higher threshold if
  # the casing looks wrong
  if score < 0.08 && string !~ /\A([[:upper:]][[:lower:]]+ )*[[:upper:]][[:lower:]]+\z/
    return :unlikely_bigrams
  end

  return :mashing_bigrams if score < bigram_similarity_to_mashing(string)

  false
end
#normalize_for_comparison(string) ⇒ Object
149 150 151 152 153 154 155 156 157 158 159 160 |
# File 'lib/dejunk.rb', line 149
# Reduces +string+ to a lowercase, alphanumeric-only form for comparisons.
#
# Steps: reinterpret as UTF-8 if needed, drop invalid bytes, NFKD-normalise,
# remove combining marks, remove everything that is not alphanumeric, and
# downcase. The argument itself is not modified.
#
# @param string [String] text to normalise
# @return [String] the normalised copy
def normalize_for_comparison(string)
  # This mirrors what mb_chars did, assuming that non-UTF-8 encoded strings
  # are actually UTF-8 in disguise. It's unclear whether this is necessary,
  # but we left it in to avoid having to figure this out.
  string = string.dup.force_encoding(Encoding::UTF_8) if string.encoding != Encoding::UTF_8

  # When that assumption is wrong (e.g. binary Latin-1 bytes) the string is
  # not valid UTF-8, and unicode_normalize / gsub would raise ArgumentError.
  # Drop the invalid bytes; they could never count as alphanumeric anyway.
  string = string.scrub(''.freeze) unless string.valid_encoding?

  string.
    unicode_normalize(:nfkd).
    gsub(/\p{Mn}+/, ''.freeze).
    gsub(/[^[:alnum:]]+/, ''.freeze).
    downcase
end
#probability_of_keyboard_mashing(string, apriori_probability_of_mashing: 0.1) ⇒ Object
The Bayesian probability of a string being keyboard mashing, given the probability of each bigram if drawn either from the legit corpus or from mashing, and an a priori probability of mashing.
The probability shouldn’t be taken too literally, but it’s a useful indicator.
131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 |
# File 'lib/dejunk.rb', line 131
# The Bayesian probability of +string+ being keyboard mashing, given the
# probability of each bigram under the mashing model and under the corpus
# model, and an a priori probability of mashing.
#
# The probability shouldn't be taken too literally, but it's a useful
# indicator.
#
# @param string [String, nil] text to score
# @param apriori_probability_of_mashing [Numeric] prior probability that any
#   string is mashing
# @return [Integer, BigDecimal] 0 when +string+ yields no bigrams, otherwise
#   the posterior probability
def probability_of_keyboard_mashing(string, apriori_probability_of_mashing: 0.1)
  string_bigrams = bigrams(string)

  # Plain `empty?` rather than ActiveSupport's `present?`, so this works on a
  # bare Array without any core extensions loaded.
  return 0 if string_bigrams.empty?

  # The per-bigram probabilities are multiplied as BigDecimal values.
  prob_bigrams_given_mashing = string_bigrams.
    map { |bigram| BigDecimal(mashing_probability(bigram).to_s) }.
    inject(&:*)

  prob_bigrams_given_corpus = string_bigrams.
    map { |bigram| BigDecimal(corpus_probability(bigram).to_s) }.
    inject(&:*)

  # Bayes: P(mash | bigrams) = P(bigrams | mash) P(mash) / P(bigrams)
  numerator = prob_bigrams_given_mashing * apriori_probability_of_mashing

  numerator / (numerator + prob_bigrams_given_corpus * (1 - apriori_probability_of_mashing))
end