Class: Disco::Recommender

Inherits:
Object
  • Object
show all
Defined in:
lib/disco/recommender.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(factors: 8, epochs: 20, verbose: nil, top_items: false) ⇒ Recommender

Returns a new instance of Recommender.



5
6
7
8
9
10
11
12
# File 'lib/disco/recommender.rb', line 5

# Builds an unfitted recommender; no training happens here.
#
# @param factors [Integer] stored and later handed to Libmf as `factors:` by #fit
# @param epochs [Integer] stored and later handed to Libmf as `iterations:` by #fit
# @param verbose [Boolean, nil] stored; #fit turns it on when left nil and a
#   validation set is supplied
# @param top_items [Boolean] when truthy, #fit also records per-item counts
#   and sums so #top_items can be used
def initialize(factors: 8, epochs: 20, verbose: nil, top_items: false)
  # training options
  @factors, @epochs, @verbose = factors, epochs, verbose
  # id => index lookups, empty until #fit populates them
  @user_map, @item_map = {}, {}
  @top_items = top_items
end

Instance Attribute Details

#global_mean ⇒ Object (readonly)

Returns the value of attribute global_mean.



3
4
5
# File 'lib/disco/recommender.rb', line 3

# Reader for @global_mean, which #fit assigns from the trained model's bias
# and #predict uses as the score for unknown users or items.
#
# @return [Object] the stored value (nil if it has not been assigned yet)
def global_mean
  @global_mean
end

Class Method Details

.load_json(json) ⇒ Object



293
294
295
296
297
298
299
300
301
# File 'lib/disco/recommender.rb', line 293

# Rebuilds a recommender from a JSON document.
#
# The JSON is parsed first, then a fresh instance is created with default
# options and the parsed object is handed to its private `json_load`.
#
# @param json [String] JSON text to parse
# @return [Object] the newly built instance
def self.load_json(json)
  require "json"

  parsed = JSON.parse(json)
  new.tap { |instance| instance.send(:json_load, parsed) }
end

Instance Method Details

#fit(train_set, validation_set: nil) ⇒ Object

Raises:

  • (ArgumentError)


14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# File 'lib/disco/recommender.rb', line 14

# Trains the model on `train_set`, replacing any previously fitted state
# (id maps, rated lookup, factors, cached norms and indexes).
#
# Feedback is treated as implicit when no training entry has a truthy
# `:rating`; otherwise it is explicit and ratings are validated with
# `check_ratings`.
#
# @param train_set entries read as `v[:user_id]`, `v[:item_id]` and
#   optionally `v[:rating]` (after conversion by `to_dataset`)
# @param validation_set optional entries of the same form; when given they
#   are passed to Libmf as `eval_set:`
# @return [nil]
# @raise [ArgumentError] if implicit feedback includes `:value`, or if any
#   training entry has a nil user_id or item_id (other checks live in
#   `check_training_set` / `check_ratings`, defined elsewhere)
def fit(train_set, validation_set: nil)
  train_set = to_dataset(train_set)
  validation_set = to_dataset(validation_set) if validation_set

  check_training_set(train_set)

  # TODO option to set in initializer to avoid pass
  # could also just check first few values
  # but may be confusing if they are all missing and later ones aren't
  @implicit = !train_set.any? { |v| v[:rating] }

  if @implicit && train_set.any? { |v| v[:value] }
    raise ArgumentError, "Passing `:value` with implicit feedback has no effect on recommendations and should be removed. Earlier versions of the library incorrectly stated this was used."
  end

  # TODO improve performance
  # (catch exception instead of checking ahead of time)
  unless @implicit
    check_ratings(train_set)

    if validation_set
      check_ratings(validation_set)
    end
  end

  # start from clean maps so a refit does not keep ids from an earlier fit
  @user_map = {}
  @item_map = {}
  # user index => {item index => true}; the default block auto-creates the
  # inner hash while building and is removed right after the loop
  @rated = Hash.new { |hash, key| hash[key] = {} }
  input = []
  train_set.each do |v|
    # update maps and build matrix in single pass
    # (a new id is assigned the next free index, i.e. the current map size)
    u = (@user_map[v[:user_id]] ||= @user_map.size)
    i = (@item_map[v[:item_id]] ||= @item_map.size)
    @rated[u][i] = true

    # explicit will always have a value due to check_ratings
    input << [u, i, @implicit ? 1 : v[:rating]]
  end
  # from here on, looking up an unknown user index yields nil instead of
  # inserting an empty hash
  @rated.default = nil

  # much more efficient than checking every value in another pass
  raise ArgumentError, "Missing user_id" if @user_map.key?(nil)
  raise ArgumentError, "Missing item_id" if @item_map.key?(nil)

  # TODO improve performance
  # rating bounds are later used to clip predictions; nil for implicit feedback
  unless @implicit
    @min_rating, @max_rating = train_set.minmax_by { |o| o[:rating] }.map { |o| o[:rating] }
  else
    @min_rating = nil
    @max_rating = nil
  end

  # per-item totals, indexed by item index, consumed by #top_items
  if @top_items
    @item_count = Array.new(@item_map.size, 0)
    @item_sum = Array.new(@item_map.size, 0.0)
    train_set.each do |v|
      i = @item_map[v[:item_id]]
      @item_count[i] += 1
      @item_sum[i] += (@implicit ? 1 : v[:rating])
    end
  end

  eval_set = nil
  if validation_set
    eval_set = []
    validation_set.each do |v|
      u = @user_map[v[:user_id]]
      i = @item_map[v[:item_id]]

      # set to non-existent item
      u ||= -1
      i ||= -1

      eval_set << [u, i, @implicit ? 1 : v[:rating]]
    end
  end

  # NOTE(review): 12 / 0 are Libmf loss codes — presumably a one-class loss
  # for implicit feedback and squared error for explicit; confirm in Libmf docs
  loss = @implicit ? 12 : 0
  verbose = @verbose
  # default to showing training output when there is a validation set
  verbose = true if verbose.nil? && eval_set
  model = Libmf::Model.new(loss: loss, factors: @factors, iterations: @epochs, quiet: !verbose)
  model.fit(input, eval_set: eval_set)

  @global_mean = model.bias

  @user_factors = model.p_factors(format: :numo)
  @item_factors = model.q_factors(format: :numo)

  # reset state derived from the previous factors (assigned elsewhere in the class)
  @user_norms = nil
  @item_norms = nil

  @user_recs_index = nil
  @similar_users_index = nil
  @similar_items_index = nil
end

#inspect ⇒ Object



259
260
261
# File 'lib/disco/recommender.rb', line 259

# Returns the same string as #to_s rather than the default inspect output.
#
# @return [String]
def inspect
  to_s # for now
end

#item_factors(item_id = nil) ⇒ Object



234
235
236
237
238
239
240
241
# File 'lib/disco/recommender.rb', line 234

# Item factors learned by #fit.
#
# @param item_id [Object, nil] when given, look up only this item
# @return the whole item factor matrix when no id is given; otherwise the
#   matrix row selected for that item, or nil if the id is unknown
def item_factors(item_id = nil)
  return @item_factors unless item_id

  row = @item_map[item_id]
  row ? @item_factors[row, true] : nil
end

#item_ids ⇒ Object



221
222
223
# File 'lib/disco/recommender.rb', line 221

# Item ids known to the model, i.e. the keys of the item id => index map
# built by #fit (empty before fitting).
#
# @return [Array]
def item_ids
  @item_map.keys
end

#optimize_similar_items(library: nil) ⇒ Object Also known as: optimize_item_recs



248
249
250
251
# File 'lib/disco/recommender.rb', line 248

# Builds and stores the index that #similar_items passes on to `similar`.
#
# The index is created over the item factors divided by `item_norms`
# (expanded along axis 1) — presumably unit-normalising each row; confirm
# against `item_norms`.
#
# @param library [String, nil] forwarded to `create_index`
def optimize_similar_items(library: nil)
  check_fit
  scaled_factors = @item_factors / item_norms.expand_dims(1)
  @similar_items_index = create_index(scaled_factors, library: library)
end

#optimize_similar_users(library: nil) ⇒ Object



254
255
256
257
# File 'lib/disco/recommender.rb', line 254

# Builds and stores the index that #similar_users passes on to `similar`.
#
# The index is created over the user factors divided by `user_norms`
# (expanded along axis 1) — presumably unit-normalising each row; confirm
# against `user_norms`.
#
# @param library [String, nil] forwarded to `create_index`
def optimize_similar_users(library: nil)
  check_fit
  scaled_factors = @user_factors / user_norms.expand_dims(1)
  @similar_users_index = create_index(scaled_factors, library: library)
end

#optimize_user_recs ⇒ Object



243
244
245
246
# File 'lib/disco/recommender.rb', line 243

# Builds and stores the index that #user_recs searches when a count is given.
# The index is created over the raw item factors, always with the "faiss"
# library.
def optimize_user_recs
  check_fit
  all_item_factors = item_factors
  @user_recs_index = create_index(all_item_factors, library: "faiss")
end

#predict(data) ⇒ Object

Generates a prediction for each entry, even if the user has already rated the item.



111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# File 'lib/disco/recommender.rb', line 111

# Predicts a score for every (user, item) entry in `data`, including pairs
# the user has already rated.
#
# Entries whose user or item is unknown to the model receive the global
# mean. Known entries are scored with `inner` on the selected user and item
# factor rows and, when explicit ratings were used, clipped to the min/max
# rating recorded by #fit.
#
# @param data entries read as `v[:user_id]` / `v[:item_id]` (after
#   conversion by `to_dataset`)
# @return [Array] one prediction per entry, in input order
def predict(data)
  # Same precondition as the other query methods: without it an unfitted
  # model fails later with an opaque NoMethodError on a nil factor matrix.
  check_fit

  data = to_dataset(data)

  user_rows = data.map { |v| @user_map[v[:user_id]] }
  item_rows = data.map { |v| @item_map[v[:item_id]] }

  # positions where either side is unknown to the model
  unknown = data.each_index.select { |pos| user_rows[pos].nil? || item_rows[pos].nil? }
  # point them at row 0 so the batched lookup has a valid index; their
  # scores are overwritten with the global mean below
  unknown.each do |pos|
    user_rows[pos] = 0
    item_rows[pos] = 0
  end

  predictions = @user_factors[user_rows, true].inner(@item_factors[item_rows, true])
  predictions.inplace.clip(@min_rating, @max_rating) if @min_rating
  predictions[unknown] = @global_mean
  predictions.to_a
end

#similar_items(item_id, count: 5) ⇒ Object Also known as: item_recs



173
174
175
176
# File 'lib/disco/recommender.rb', line 173

# Finds items similar to `item_id` by delegating to `similar` with the item
# map, item factors, item norms and the optional index built by
# #optimize_similar_items.
#
# @param item_id [Object] id of the reference item
# @param count [Integer] forwarded to `similar` as the number of results wanted
# @return whatever `similar` returns (defined elsewhere)
def similar_items(item_id, count: 5)
  check_fit
  similar(
    item_id, :item_id,
    @item_map, @item_factors, item_norms,
    count, @similar_items_index
  )
end

#similar_users(user_id, count: 5) ⇒ Object



179
180
181
182
# File 'lib/disco/recommender.rb', line 179

# Finds users similar to `user_id` by delegating to `similar` with the user
# map, user factors, user norms and the optional index built by
# #optimize_similar_users.
#
# @param user_id [Object] id of the reference user
# @param count [Integer] forwarded to `similar` as the number of results wanted
# @return whatever `similar` returns (defined elsewhere)
def similar_users(user_id, count: 5)
  check_fit
  similar(
    user_id, :user_id,
    @user_map, @user_factors, user_norms,
    count, @similar_users_index
  )
end

#to_json ⇒ Object



263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
# File 'lib/disco/recommender.rb', line 263

# Serializes the fitted state to a JSON string.
#
# Always included: the implicit flag, user and item ids (in index order),
# each user's rated item indexes, the global mean, both factor matrices as
# base64 of their `to_binary` bytes, and the factors/epochs/verbose options.
# Rating bounds are added for explicit feedback; per-item counts and sums
# are added when top_items is enabled.
#
# @return [String] JSON document
def to_json
  require "json"

  obj = {
    implicit: @implicit,
    user_ids: @user_map.keys,
    item_ids: @item_map.keys,
    # one list per user, in user index order; empty when nothing is recorded
    rated: @user_map.map { |_, u| (@rated[u] || {}).keys },
    global_mean: @global_mean,
    # pack("m0") produces the same strict (no line feed) base64 as
    # Base64.strict_encode64 without needing the base64 gem, which is no
    # longer a default gem as of Ruby 3.4
    user_factors: [@user_factors.to_binary].pack("m0"),
    item_factors: [@item_factors.to_binary].pack("m0"),
    factors: @factors,
    epochs: @epochs,
    verbose: @verbose
  }

  unless @implicit
    obj[:min_rating] = @min_rating
    obj[:max_rating] = @max_rating
  end

  if @top_items
    obj[:item_count] = @item_count
    obj[:item_sum] = @item_sum
  end

  JSON.generate(obj)
end

#top_items(count: 5) ⇒ Object



184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
# File 'lib/disco/recommender.rb', line 184

# Returns the highest-scoring items overall, best first.
#
# For implicit feedback the score is the item's count from training. For
# explicit feedback it is the lower bound of a Wilson score interval (with
# continuity correction) on the item's mean rating, rescaled back to the
# rating range.
#
# @param count [Integer, nil] maximum number of items; nil returns all
# @return [Array<Hash>] hashes with `:item_id` and `:score`
# @raise [RuntimeError] "top_items not computed" unless the recommender was
#   created with top_items enabled
def top_items(count: 5)
  check_fit
  raise "top_items not computed" unless @top_items

  if @implicit
    scores = Numo::UInt64.cast(@item_count)
  else
    min_rating = @min_rating

    # TODO remove temp fix
    # when every rating is identical, lower the floor by one so the rating
    # range below is 1 instead of 0
    min_rating -= 1 if @min_rating == @max_rating

    # wilson score with continuity correction
    # https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval_with_continuity_correction
    z = 1.96 # 95% confidence
    # use the adjusted local floor: computing this from @min_rating made the
    # range 0 in the all-equal case and divided by zero (Inf/NaN scores)
    range = @max_rating - min_rating
    n = Numo::DFloat.cast(@item_count)
    # mean rating per item mapped onto 0..1
    phat = (Numo::DFloat.cast(@item_sum) - (min_rating * n)) / range / n
    phat = (phat - (1 / (2 * n))).clip(0, nil) # continuity correction
    scores = (phat + z**2 / (2 * n) - z * Numo::DFloat::Math.sqrt((phat * (1 - phat) + z**2 / (4 * n)) / n)) / (1 + z**2 / n)
    # map back from 0..1 to the rating scale
    scores = scores * range + min_rating
  end

  indexes = scores.sort_index.reverse
  indexes = indexes[0...[count, indexes.size].min] if count
  scores = scores[indexes]

  keys = @item_map.keys
  indexes.size.times.map do |i|
    {item_id: keys[indexes[i]], score: scores[i]}
  end
end

#user_factors(user_id = nil) ⇒ Object



225
226
227
228
229
230
231
232
# File 'lib/disco/recommender.rb', line 225

# User factors learned by #fit.
#
# @param user_id [Object, nil] when given, look up only this user
# @return the whole user factor matrix when no id is given; otherwise the
#   matrix row selected for that user, or nil if the id is unknown
def user_factors(user_id = nil)
  return @user_factors unless user_id

  row = @user_map[user_id]
  row ? @user_factors[row, true] : nil
end

#user_ids ⇒ Object



217
218
219
# File 'lib/disco/recommender.rb', line 217

# User ids known to the model, i.e. the keys of the user id => index map
# built by #fit (empty before fitting).
#
# @return [Array]
def user_ids
  @user_map.keys
end

#user_recs(user_id, count: 5, item_ids: nil) ⇒ Object



129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# File 'lib/disco/recommender.rb', line 129

# Recommends items for a user, highest predicted score first.
#
# For a known user, items the user rated in training are skipped unless the
# caller restricts the candidates with `item_ids`. For an unknown user the
# result is #top_items when that was enabled, otherwise an empty array.
#
# @param user_id [Object] id of the user
# @param count [Integer, nil] maximum number of results; nil returns all
# @param item_ids [Array, nil] restrict scoring to these item ids (ids the
#   model does not know are dropped)
# @return [Array<Hash>] hashes with `:item_id` and `:score`
def user_recs(user_id, count: 5, item_ids: nil)
  check_fit
  user_index = @user_map[user_id]

  # unknown user: nothing personal to score
  unless user_index
    return @top_items ? top_items(count: count) : []
  end

  # items to leave out of the result; none when candidates are given explicitly
  seen = item_ids ? {} : @rated[user_index]

  # descending order of scores, trimmed to what could be needed once the
  # already-seen items are skipped
  ranking = lambda do |values|
    order = values.sort_index.reverse # reverse just creates view
    count ? order[0...[count + seen.size, order.size].min] : order
  end

  if item_ids
    candidates = Numo::NArray.cast(item_ids.map { |id| @item_map[id] }.compact)
    return [] if candidates.size == 0

    scores = @item_factors[candidates, true].inner(@user_factors[user_index, true])
    order = ranking.call(scores)
    scores = scores[order]
    candidates = candidates[order]
  elsif @user_recs_index && count
    # prebuilt index from #optimize_user_recs
    query = @user_factors[user_index, true].expand_dims(0)
    scores, candidates = @user_recs_index.search(query, count + seen.size).map { |v| v[0, true] }
  else
    scores = @item_factors.inner(@user_factors[user_index, true])
    candidates = ranking.call(scores)
    scores = scores[candidates]
  end

  scores.inplace.clip(@min_rating, @max_rating) if @min_rating

  item_keys = @item_map.keys
  recs = []
  candidates.each_with_index do |item_index, pos|
    next if seen[item_index]

    recs << {item_id: item_keys[item_index], score: scores[pos]}
    break if recs.size == count
  end
  recs
end