This assignment was part of the Adaptive Systems course at UPM. I created a database of ratings for 20 movies by 50 users and applied Collaborative Filtering (with 25% and 70% of the rating cells left empty) using Pearson correlation, then analysed the data to understand the nature of recommender systems and the sparsity problem of ratings.
Dataset Excel Sheet
# IMPORTING PACKAGES
import pandas as pd
import math
import numpy as np
# READ EXCEL
# Load the sparse ratings sheet; presumably rows = movies and columns =
# users — TODO confirm against the workbook.
# NOTE(review): the sheet name 'Sparcity rating (75%)' is spelled exactly
# this way in the workbook; "fixing" the typo here would break the lookup.
df_ratings = pd.read_excel('Data set.xlsx',sheet_name='Sparcity rating (75%)')
#CONVERT TO FLOAT (, replaced with .)
def convert_to_float(x):
    """Parse a rating cell to float, accepting a decimal comma.

    The spreadsheet uses ',' as the decimal separator, so it is replaced
    with '.' before parsing. Cells that still cannot be parsed yield NaN
    instead of raising, so a single bad cell cannot abort the conversion
    of the whole sheet. Empty cells arrive as NaN and round-trip
    unchanged (str(nan) == 'nan', and float('nan') is NaN again).
    """
    try:
        return float(str(x).replace(',', '.'))
    except ValueError:
        return float('nan')
# Normalise every cell to float (decimal commas become dots), then let
# pandas compute the user-user Pearson correlation matrix and export it.
for column in df_ratings:
    df_ratings[column] = df_ratings[column].map(convert_to_float)
#CORRELATION PEARSON INBUILT
corr_matrix = df_ratings.corr(method='pearson')
corr_matrix.to_excel('correlation_python_inbuilt.xlsx')
# PEARSON CORRELATION USING FORMULA
def average(x):
    """Return the arithmetic mean of a non-empty sequence of numbers.

    Raises ValueError on empty input. This is an explicit check rather
    than an ``assert`` because asserts are stripped under ``python -O``.
    The length test deliberately avoids truthiness (``if not x``), which
    is ambiguous for the numpy arrays that callers pass in.
    """
    if len(x) == 0:
        raise ValueError("cannot take the average of an empty sequence")
    return float(sum(x)) / len(x)
def pearson_def(x, y):
assert len(x) == len(y)
n = len(x)
if (n > 0):
avg_x = average(x)
avg_y = average(y)
diffprod = 0
xdiff2 = 0
ydiff2 = 0
for idx in range(n):
xdiff = x[idx] - avg_x
ydiff = y[idx] - avg_y
diffprod += xdiff * ydiff
xdiff2 += xdiff * xdiff
ydiff2 += ydiff * ydiff
return diffprod / math.sqrt(xdiff2 * ydiff2)
else:
return None
# In[105]:
# Pairwise Pearson correlation using the hand-written formula.
# For every pair of users, keep only the movies that BOTH users rated
# (drop positions where either rating is NaN) before correlating.
correlation_matrix = []
for user1 in df_ratings.columns:
    ratings1 = df_ratings[user1].values
    for user2 in df_ratings.columns:
        ratings2 = df_ratings[user2].values
        both_rated = ~(np.isnan(ratings1) | np.isnan(ratings2))
        corr_user = pearson_def(ratings1[both_rated], ratings2[both_rated])
        correlation_matrix.append((user1, user2, corr_user))
# In[108]:
# Reshape the flat (user1, user2, corr) triples into a square user-by-user
# matrix and export it. The DataFrame has default integer column labels,
# so the pivot addresses them positionally as 0, 1 and 2.
triples = pd.DataFrame(correlation_matrix)
df_corr = triples.pivot(index=0, columns=1, values=2)
df_corr.to_excel('correlation_data_sparse.xlsx')