3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
|
# File 'lib/cmfrec/data.rb', line 3
# Loads the MovieLens 100k ratings together with per-user and per-item
# side information.
#
# The three source files (u.data, u.user, u.item) are obtained through
# +download_file+; its return values are used here as local file paths.
# The +file_hash+ values are passed straight through to that helper
# (presumably as integrity checksums - confirm against +download_file+).
#
# @return [Array(Array<Hash>, Array<Hash>, Array<Hash>)] a triple of
#   * +data+      - one hash per ratings row: +:user_id+, +:item_id+, +:rating+
#   * +user_info+ - one hash per user row: +:user_id+ plus +:region0+..+:region9+
#   * +item_info+ - one hash per distinct item name: +:item_id+, +:year+ and a
#     +:genre_*+ entry for each of the 19 genre columns
def load_movielens
  require "csv"

  ratings_path = download_file(
    "ml-100k/u.data",
    "https://files.grouplens.org/datasets/movielens/ml-100k/u.data",
    file_hash: "06416e597f82b7342361e41163890c81036900f418ad91315590814211dca490"
  )
  users_path = download_file(
    "ml-100k/u.user",
    "https://files.grouplens.org/datasets/movielens/ml-100k/u.user",
    file_hash: "f120e114da2e8cf314fd28f99417c94ae9ddf1cb6db8ce0e4b5995d40e90e62c"
  )
  items_path = download_file(
    "ml-100k/u.item",
    "https://files.grouplens.org/datasets/movielens/ml-100k/u.item",
    file_hash: "553841ebc7de3a0fd0d6b62a204ea30c1e651aacfb2814c7a6584ac52f2c5701"
  )

  # Treat the item file as raw bytes and transcode to UTF-8; any byte that
  # cannot be converted is replaced with "" (i.e. dropped) rather than raising.
  items_text = File.read(items_path).encode("UTF-8", "binary", invalid: :replace, undef: :replace, replace: "")

  # Users: one-hot encode the first character of the fifth column against the
  # digits 0-9 (presumably the leading zip-code digit - confirm against the
  # dataset README). A non-digit first character leaves every flag at 0.
  user_info = CSV.foreach(users_path, col_sep: "|").map do |fields|
    leading_char = fields[4][0]
    (0..9).each_with_object({user_id: fields[0].to_i}) do |digit, user|
      user[:"region#{digit}"] = leading_char == digit.to_s ? 1 : 0
    end
  end

  genres = %w(unknown action adventure animation childrens comedy crime documentary drama fantasy filmnoir horror musical mystery romance scifi thriller war western)

  # Items are identified by their name (second column), not their numeric id:
  # every id is recorded in title_by_id, but only the first row seen for a
  # given name contributes an item_info entry.
  title_by_id = {}
  seen_titles = {}
  item_info = []
  CSV.parse(items_text, col_sep: "|", converters: [:numeric]) do |fields|
    movie_id, title, release_date = fields
    title_by_id[movie_id] = title
    next if seen_titles[title]

    seen_titles[title] = true

    # NOTE(review): Date is not required explicitly in this method; presumably
    # it is loaded as a side effect of requiring csv - confirm.
    year =
      if release_date
        Date.parse(release_date).year
      else
        1970 # fallback for rows with an empty release-date column
      end

    item = {item_id: title, year: year}
    # The genre flag columns start at index 5, in the order listed in genres.
    genres.each.with_index(5) do |genre, column|
      item[:"genre_#{genre}"] = fields[column]
    end
    item_info << item
  end

  # Ratings: swap the numeric item id for the item name so that item_id lines
  # up with item_info (nil when the id does not appear in the item file).
  data = CSV.foreach(ratings_path, col_sep: "\t", converters: [:numeric]).map do |user_id, movie_id, rating|
    {user_id: user_id, item_id: title_by_id[movie_id], rating: rating}
  end

  [data, user_info, item_info]
end
|