In [1]:
import numpy as np
import pandas as pd

# Read the raw interaction log; header=None because column names are supplied here
data = pd.read_csv(
    'steam-200k.csv',
    names=['UserID', 'Game', 'Action', 'Hours', 'Other'],
    header=None,
    index_col=None,
)
print(data.head())

# Keep only the 'play' rows, discarding rows that merely record a purchase
data = data.loc[data['Action'] == 'play']

# Keep only users that still have at least 5 play records
data = data.groupby('UserID').filter(lambda user_rows: len(user_rows) >= 5)

print(data.head())
print(data.shape)

      UserID                        Game    Action  Hours  Other
0  151603712  The Elder Scrolls V Skyrim  purchase    1.0      0
1  151603712  The Elder Scrolls V Skyrim      play  273.0      0
2  151603712                   Fallout 4  purchase    1.0      0
3  151603712                   Fallout 4      play   87.0      0
4  151603712                       Spore  purchase    1.0      0
      UserID                        Game Action  Hours  Other
1  151603712  The Elder Scrolls V Skyrim   play  273.0      0
3  151603712                   Fallout 4   play   87.0      0
5  151603712                       Spore   play   14.9      0
7  151603712           Fallout New Vegas   play   12.1      0
9  151603712               Left 4 Dead 2   play    8.9      0
(57789, 5)


In [2]:
# Pivot into a UserID x Game table whose cells hold hours played
# (user/game pairs with no play record are filled with 0)
rating_data = data.pivot_table(
    values='Hours',
    index='UserID',
    columns='Game',
    fill_value=0,
)
print(rating_data.shape)

(2436, 3544)


In [3]:
from numpy.linalg import svd

# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values yields the same 2-D ndarray and works on both old and new pandas.
rating_matrix = rating_data.values

# Full SVD of the user x game matrix. Note that numpy returns the third factor
# already transposed (its rows are the right singular vectors), so `V` here is
# what is usually written V^T; downstream code transposes it back via `V.T`.
U, S, V = svd(rating_matrix)
print(U.shape)
print(S.shape)
print(V.shape)

(2436, 2436)
(2436,)
(3544, 3544)


In [4]:
from scipy.spatial.distance import cdist

def get_topk_similar_games(game_name, n_components, topk):
    """Return the `topk` game titles nearest to `game_name` by cosine distance.

    Every game is represented by its row of `V.T`, truncated to the first
    `n_components` columns. `V` and `rating_data` are read from the
    notebook's global namespace.

    Parameters
    ----------
    game_name : label looked up in `rating_data.columns`.
    n_components : number of leading columns of `V.T` to keep.
    topk : number of titles to return.

    Returns
    -------
    A slice of `rating_data.columns` ordered by increasing cosine distance.
    The queried game is not excluded from the result.
    """
    # One row per game, one column per retained component
    game_embeddings = V.T[:, :n_components]

    # Positional index of the requested title among the matrix columns
    query_pos = rating_data.columns.get_loc(game_name)

    # Keep the query 2-D (1 x n_components) because cdist expects matrices
    query_vector = game_embeddings[query_pos, None, :]

    cosine_dist = cdist(query_vector, game_embeddings, metric='cosine')
    nearest = np.argsort(cosine_dist[0])[:topk]
    return rating_data.columns[nearest]

# Example query: the 20 nearest titles to this game using 50 components
game_name = 'Fallout 3'
print(get_topk_similar_games(game_name=game_name, n_components=50, topk=20))

Index(['Fallout 3', 'Influent', 'Lili Child of Geos', 'Hate Plus',
       'ArtRage Studio Pro', 'LEGO MARVEL Super Heroes', 'Fallout New Vegas',
       'BioShock', 'Assassin's Creed',
       'Dysfunctional Systems Learning to Manage Chaos', 'FINAL FANTASY VII',
       'Fallout 3 - Game of the Year Edition', 'Spoiler Alert',
       'Commander Keen Complete Pack', 'Tiny and Big Grandpa's Leftovers',
       'MURI', 'Monopoly', 'The Secret of Monkey Island Special Edition',
       'Shadowrun Dragonfall - Director's Cut', 'Jagged Alliance Crossfire'],
      dtype='object', name='Game')
