Module: Dejunk

Extended by:
Dejunk
Included in:
Dejunk
Defined in:
lib/dejunk.rb,
lib/dejunk/version.rb

Constant Summary collapse

MASH_CHARS =

All characters on the middle row of a QWERTY keyboard

'ASDFGHJKLasdfghjkl;: '
MASH_BIGRAMS =

All neighboring key pairs on a QWERTY keyboard (except “er” and “re”, which each make up >1% of bigrams in our “good” sample), plus each letter doubled and each letter next to a space. Every pair is included in both orders.

(
  # Each letter followed by a space, and each letter doubled.
  ('a'..'z').flat_map { |letter| ["#{letter} ", letter * 2] } +
  # Adjacent key pairs, first along each keyboard row, then down each column.
  # NOTE(review): "xd" is listed where the row pair "xc" would be expected,
  # and "io" is absent from the top row -- confirm whether this is intended.
  %w[qw we rt ty yu ui op as sd df fg gh hj jk kl zx xd cv vb bn nm qa az ws sx ed dc rf fv tg gb yh hn uj jm ik ol]
).flat_map { |pair| [pair, pair.reverse] }.to_set.freeze
VERSION =
"0.6.0"

Instance Method Summary collapse

Instance Method Details

#bigram_similarity_to_corpus(string) ⇒ Object

Cosine similarity between the vector of bigram frequencies within the string and the vector of frequencies of all bigrams within the corpus.



76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# File 'lib/dejunk.rb', line 76

# Cosine similarity between the bigram-frequency vector of +string+ and the
# corpus bigram-frequency vector.
#
# @param string [String, nil] the text to score
# @return [Float] the cosine similarity; 0.0 when +string+ yields no bigrams
def bigram_similarity_to_corpus(string)
  string_bigrams = bigrams(string)

  # With no bigrams the frequency vector is empty and the sums below would
  # be nil, so report "no similarity" instead of raising NoMethodError.
  return 0.0 if string_bigrams.empty?

  total = string_bigrams.length.to_f

  # Relative frequency of each distinct bigram within the string.
  freqs = string_bigrams.
    each_with_object(Hash.new(0)) { |bigram, counts| counts[bigram] += 1 }.
    each_with_object({}) { |(bigram, count), result| result[bigram] = count / total }

  # Dot product of the two vectors; `.to_f` lets a nil lookup count as 0.0.
  numerator = freqs.inject(0.0) do |acc, (bigram, freq)|
    acc + corpus_bigram_frequencies[bigram].to_f * freq
  end

  string_magnitude = Math.sqrt(freqs.values.inject(0.0) { |acc, freq| acc + freq**2 })

  numerator / (corpus_bigram_magnitude * string_magnitude)
end

#bigram_similarity_to_mashing(string) ⇒ Object

Cosine similarity between the vector of bigram frequencies within the string and a vector that assumes all bigrams made of neighboring key pairs on the keyboard are equally likely and that no other bigrams appear.



95
96
97
98
99
100
101
102
103
104
105
106
107
108
# File 'lib/dejunk.rb', line 95

# Cosine similarity between the bigram-frequency vector of +string+ and the
# keyboard-mashing bigram-frequency vector.
#
# @param string [String, nil] the text to score
# @return [Float] the cosine similarity; 0.0 when +string+ yields no bigrams
def bigram_similarity_to_mashing(string)
  string_bigrams = bigrams(string)

  # With no bigrams the frequency vector is empty and the sums below would
  # be nil, so report "no similarity" instead of raising NoMethodError.
  return 0.0 if string_bigrams.empty?

  total = string_bigrams.length.to_f

  # Relative frequency of each distinct bigram within the string.
  freqs = string_bigrams.
    each_with_object(Hash.new(0)) { |bigram, counts| counts[bigram] += 1 }.
    each_with_object({}) { |(bigram, count), result| result[bigram] = count / total }

  # Dot product of the two vectors; `.to_f` lets a nil lookup count as 0.0.
  numerator = freqs.inject(0.0) do |acc, (bigram, freq)|
    acc + freq * mashing_bigram_frequencies[bigram].to_f
  end

  string_magnitude = Math.sqrt(freqs.values.inject(0.0) { |acc, freq| acc + freq**2 })

  numerator / (mashing_bigram_magnitude * string_magnitude)
end

#bigrams(string) ⇒ Object



110
111
112
113
114
115
116
117
118
119
120
121
122
123
# File 'lib/dejunk.rb', line 110

# Splits +string+ into its overlapping two-character sequences.
#
# Leading and trailing whitespace is stripped first. Each bigram is
# lowercased, every ASCII digit is replaced by "0" and every whitespace
# character by a plain space.
#
# @param string [String, nil] the text to split
# @return [Array<String>] the bigrams in order of appearance; empty when
#   +string+ is nil or shorter than two characters after stripping
def bigrams(string)
  return [] if string.nil?

  trimmed = string.strip
  return [] if trimmed.length < 2

  trimmed.chars.each_cons(2).map do |first, second|
    pair = "#{first.downcase}#{second.downcase}"
    pair.gsub(/[0-9]/, '0'.freeze).gsub(/[[:space:]]/, ' '.freeze)
  end
end

#is_junk?(string, min_alnum_chars: 3, whitelist_regexes: [], whitelist_strings: []) ⇒ Boolean

Returns:

  • (Boolean)


22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/dejunk.rb', line 22

# Runs +string+ through a series of junk checks and reports the first one
# that fails.
#
# @param string [String, nil] the text to examine
# @param min_alnum_chars [Integer] passed to the too-short check
# @param whitelist_regexes [Array<Regexp>] a string matching any of these is
#   never junk
# @param whitelist_strings [Array<String>] a string contained in this list is
#   never junk
# @return [false, Symbol] false when the string is whitelisted or passes all
#   checks; otherwise one of :no_alpha, :too_short, :one_char_repeat,
#   :starts_with_punct, :too_many_short_words, :three_chars_repeat_twice,
#   :fuck, :missing_vowels, :asdf_row, :mashing_bigrams, :unlikely_bigrams
def is_junk?(string, min_alnum_chars: 3, whitelist_regexes: [], whitelist_strings: [])
  whitelisted = string &&
    (whitelist_strings.include?(string) || whitelist_regexes.any? { |re| string =~ re })
  return false if whitelisted

  return :no_alpha if string.nil? || string !~ /[[:alpha:]]/

  normed = normalize_for_comparison(string)

  # Rule-based checks, evaluated in order; the first hit wins.
  rejection =
    if too_few_alphanumeric_chars?(normed, min_alnum_chars) then :too_short
    elsif excessive_single_character_repeats?(string, normed) then :one_char_repeat
    elsif starts_with_disallowed_punctuation?(string) then :starts_with_punct
    elsif too_many_short_words?(string) then :too_many_short_words
    elsif three_plus_chars_repeat_twice?(string) then :three_chars_repeat_twice
    elsif string =~ /\bfuck/i then :fuck
    elsif missing_vowels?(string, normed) then :missing_vowels
    elsif asdf_row_and_suspicious?(string) then :asdf_row
    end
  return rejection if rejection

  # The statistical checks below only run on strings that are mostly ASCII.
  ascii_count = string.each_char.count { |char| char.ord < 128 }
  ascii_proportion = ascii_count.to_f / string.length
  mostly_ascii = ascii_proportion > 0.8

  # The bigrams look like the ones produced by keyboard mashing. The
  # probability shouldn't be taken too literally; > 0.25 is almost all
  # mashing in practice on our corpus.
  if string.length > 1 && mostly_ascii && probability_of_keyboard_mashing(string) > 0.25
    return :mashing_bigrams
  end

  return false unless string.length > 6 && mostly_ascii

  # The bigrams don't look like the bigrams in legitimate strings.
  corpus_similarity = bigram_similarity_to_corpus(string)

  # The similarity is more accurate for longer strings and for strings with
  # more ASCII, so boost the score (= lower the threshold) for shorter
  # strings and strings with less ASCII.
  ascii_boost = 1.0 / ascii_proportion**2
  length_boost = 1.0 / (1 - Math.exp(-0.1 * string.length))
  score = corpus_similarity * ascii_boost * length_boost

  return :unlikely_bigrams if score < 0.03

  # The similarity ignores casing, so apply a stricter threshold when the
  # string is not written as capitalized words.
  if score < 0.08 && string !~ /\A([[:upper:]][[:lower:]]+ )*[[:upper:]][[:lower:]]+\z/
    return :unlikely_bigrams
  end

  return :mashing_bigrams if score < bigram_similarity_to_mashing(string)

  false
end

#normalize_for_comparison(string) ⇒ Object



149
150
151
152
153
154
155
156
157
158
159
160
# File 'lib/dejunk.rb', line 149

# Reduces +string+ to a lowercase, alphanumeric-only form for comparisons.
#
# The string is NFKD-normalized, nonspacing marks (\p{Mn}) are removed, every
# remaining run of non-alphanumeric characters is dropped, and the result is
# downcased.
#
# @param string [String] the text to normalize
# @return [String] the normalized text
def normalize_for_comparison(string)
  # This mirrors what mb_chars did: a string tagged with a non-UTF-8 encoding
  # is assumed to really hold UTF-8 bytes. It is unclear whether this is
  # necessary; it was kept to avoid having to find out.
  utf8 = string.encoding == Encoding::UTF_8 ? string : string.dup.force_encoding(Encoding::UTF_8)

  decomposed = utf8.unicode_normalize(:nfkd)
  without_marks = decomposed.gsub(/\p{Mn}+/, ''.freeze)

  without_marks.gsub(/[^[:alnum:]]+/, ''.freeze).downcase
end

#probability_of_keyboard_mashing(string, apriori_probability_of_mashing: 0.1) ⇒ Object

The Bayesian probability of a string being keyboard mashing, given the probability of each bigram if drawn either from the legit corpus or from mashing, and an a priori probability of mashing.

The probability shouldn’t be taken too literally, but it’s a useful indicator.



131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# File 'lib/dejunk.rb', line 131

# The Bayesian probability that +string+ is keyboard mashing, given the
# probability of each of its bigrams under the mashing model and under the
# corpus model, and an a priori probability of mashing.
#
# The result shouldn't be taken too literally, but it is a useful indicator.
#
# @param string [String, nil] the text to score
# @param apriori_probability_of_mashing [Numeric] prior probability that any
#   given string is mashing
# @return [Integer, BigDecimal] 0 when +string+ yields no bigrams, otherwise
#   the posterior probability of mashing
def probability_of_keyboard_mashing(string, apriori_probability_of_mashing: 0.1)
  string_bigrams = bigrams(string)

  # Array#empty? rather than ActiveSupport's #present?, so the method does
  # not depend on Rails core extensions being loaded.
  return 0 if string_bigrams.empty?

  # Likelihood of the whole bigram sequence under each model, as a product of
  # per-bigram probabilities (kept in BigDecimal, as in the original code).
  prob_bigrams_given_mashing = string_bigrams.
    map { |bigram| BigDecimal(mashing_probability(bigram).to_s) }.
    inject(:*)

  prob_bigrams_given_corpus = string_bigrams.
    map { |bigram| BigDecimal(corpus_probability(bigram).to_s) }.
    inject(:*)

  # Bayes' rule: P(mashing | bigrams).
  numerator = prob_bigrams_given_mashing * apriori_probability_of_mashing

  numerator / (numerator + prob_bigrams_given_corpus * (1 - apriori_probability_of_mashing))
end