Module: Okmain::KMeans

Defined in:
lib/okmain/kmeans.rb

Constant Summary collapse

MAX_CENTROIDS =
4
LLOYDS_MAX_ITERATIONS =
300
LLOYDS_CONVERGENCE_TOL =
1e-3
SIMILAR_CLUSTER_DISTANCE_SQ =
0.005
KMEANSPP_CANDIDATES =
3

Class Method Summary collapse

Class Method Details

.cluster(pixels, k: MAX_CENTROIDS) ⇒ Object

Returns [centroids, assignments] where centroids is Array of [L, a, b] and assignments is Array of centroid indices per pixel.



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# File 'lib/okmain/kmeans.rb', line 15

def cluster(pixels, k: MAX_CENTROIDS)
  rng = Random.new(42)
  n = pixels.size
  return [pixels.dup, Array.new(n) { |i| i }] if n <= k

  loop do
    centroids = init_plusplus(pixels, k, rng)
    assignments = Array.new(n, 0)

    centroids, assignments = lloyds(pixels, centroids, assignments, k, rng)

    # Adaptive reduction: merge similar centroids
    merged = false
    i = 0
    while i < k - 1 && !merged
      j = i + 1
      while j < k && !merged
        if distance_sq(centroids[i], centroids[j]) < SIMILAR_CLUSTER_DISTANCE_SQ
          k -= 1
          merged = true
        end
        j += 1
      end
      i += 1
    end

    return [centroids, assignments] unless merged
  end
end

.distance_sq(a, b) ⇒ Object



169
170
171
172
173
174
# File 'lib/okmain/kmeans.rb', line 169

def distance_sq(a, b)
  dl = a[0] - b[0]
  da = a[1] - b[1]
  db = a[2] - b[2]
  dl * dl + da * da + db * db
end

.init_plusplus(pixels, k, rng) ⇒ Object

K-means++ initialization with 3 candidates per step



46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# File 'lib/okmain/kmeans.rb', line 46

def init_plusplus(pixels, k, rng)
  n = pixels.size
  centroids = [pixels[rng.rand(n)].dup]

  dist_sq = Array.new(n, Float::INFINITY)

  (1...k).each do
    # Update distances to nearest centroid
    last = centroids.last
    i = 0
    while i < n
      d = distance_sq(pixels[i], last)
      dist_sq[i] = d if d < dist_sq[i]
      i += 1
    end

    total = dist_sq.sum
    best_candidate = nil
    best_potential = Float::INFINITY

    KMEANSPP_CANDIDATES.times do
      # Weighted random selection
      r = rng.rand * total
      cumulative = 0.0
      idx = 0
      while idx < n
        cumulative += dist_sq[idx]
        if cumulative >= r
          break
        end
        idx += 1
      end
      idx = n - 1 if idx >= n

      # Compute potential for this candidate
      candidate = pixels[idx]
      potential = 0.0
      i = 0
      while i < n
        d = distance_sq(pixels[i], candidate)
        potential += d < dist_sq[i] ? d : dist_sq[i]
        i += 1
      end

      if potential < best_potential
        best_potential = potential
        best_candidate = candidate
      end
    end

    centroids << best_candidate.dup
  end

  centroids
end

.lloyds(pixels, centroids, assignments, k, rng) ⇒ Object



102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# File 'lib/okmain/kmeans.rb', line 102

def lloyds(pixels, centroids, assignments, k, rng)
  n = pixels.size

  LLOYDS_MAX_ITERATIONS.times do
    # Assignment step
    i = 0
    while i < n
      px = pixels[i]
      best = 0
      best_d = distance_sq(px, centroids[0])
      c = 1
      while c < k
        d = distance_sq(px, centroids[c])
        if d < best_d
          best_d = d
          best = c
        end
        c += 1
      end
      assignments[i] = best
      i += 1
    end

    # Update step
    new_centroids = Array.new(k) { [0.0, 0.0, 0.0] }
    counts = Array.new(k, 0)

    i = 0
    while i < n
      c = assignments[i]
      px = pixels[i]
      nc = new_centroids[c]
      nc[0] += px[0]
      nc[1] += px[1]
      nc[2] += px[2]
      counts[c] += 1
      i += 1
    end

    shift_sq = 0.0
    c = 0
    while c < k
      if counts[c] == 0
        # Reassign empty cluster to random data point
        ri = rng.rand(n)
        new_centroids[c] = pixels[ri].dup
      else
        inv = 1.0 / counts[c]
        nc = new_centroids[c]
        nc[0] *= inv
        nc[1] *= inv
        nc[2] *= inv
      end
      dl = new_centroids[c][0] - centroids[c][0]
      da = new_centroids[c][1] - centroids[c][1]
      db = new_centroids[c][2] - centroids[c][2]
      shift_sq += dl * dl + da * da + db * db
      c += 1
    end

    centroids = new_centroids
    break if shift_sq < LLOYDS_CONVERGENCE_TOL
  end

  [centroids, assignments]
end