reviews2

This assignment was part of the Adaptive Systems course at UPM. I had to create a database of ratings for 20 movies by 50 users, apply Collaborative Filtering (with 25% and 70% of the rating cells left empty) using Pearson correlation, and analyse the results to understand how a recommender system behaves and how the sparsity of ratings affects it.

Dataset Excel Sheet
                        
# IMPORTING PACKAGES

import pandas as pd
import math
import numpy as np


# READ EXCEL
# Load the ratings sheet into a DataFrame; empty cells come back as NaN.
df_ratings = pd.read_excel(
    'Data set.xlsx',
    sheet_name='Sparcity rating (75%)',
)

#CONVERT TO FLOAT (, replaced with .)
def convert_to_float(x):
    """Parse *x* as a float, accepting a comma as the decimal separator.

    The value is rendered with ``str``, every ``,`` is replaced by ``.``,
    and the resulting text is handed to ``float`` (so anything ``float``
    cannot parse raises ``ValueError``).
    """
    return float(str(x).replace(',', '.'))

# Normalise every cell of every column to a float (decimal commas included).
for column_name in df_ratings.columns:
    df_ratings[column_name] = df_ratings[column_name].map(convert_to_float)


#CORRELATION PEARSON INBUILT
# pandas computes the column-by-column Pearson correlation, skipping
# missing values pairwise; the result is saved as a reference to compare
# against the hand-written formula below.
corr_matrix = df_ratings.corr(method='pearson')
corr_matrix.to_excel('correlation_python_inbuilt.xlsx')



# PEARSON CORRELATION USING FORMULA

def average(x):
    """Return the arithmetic mean of the values in *x* as a float.

    *x* must be non-empty; this precondition is checked with an ``assert``.
    """
    count = len(x)
    assert count > 0
    total = float(sum(x))
    return total / count

def pearson_def(x, y):
    """Return the Pearson correlation coefficient of two paired sequences.

    The coefficient is computed straight from its definition: the sum of
    the products of the deviations from each mean, divided by the square
    root of the product of the two sums of squared deviations.

    Args:
        x: Sequence of numbers.
        y: Sequence of numbers with the same length as *x*; ``x[i]`` and
            ``y[i]`` are treated as one paired observation.

    Returns:
        ``None`` when the sequences are empty; NaN when the coefficient is
        undefined because at least one sequence has zero variance (all
        values equal, which includes the single-observation case);
        otherwise the correlation coefficient.
    """
    assert len(x) == len(y)
    n = len(x)
    if n == 0:
        return None

    avg_x = float(sum(x)) / n
    avg_y = float(sum(y)) / n

    diffprod = 0
    xdiff2 = 0
    ydiff2 = 0
    for x_val, y_val in zip(x, y):
        xdiff = x_val - avg_x
        ydiff = y_val - avg_y
        diffprod += xdiff * ydiff
        xdiff2 += xdiff * xdiff
        ydiff2 += ydiff * ydiff

    denominator = math.sqrt(xdiff2 * ydiff2)
    if denominator == 0:
        # Zero variance on either side makes the coefficient 0/0. Return
        # NaN explicitly instead of raising ZeroDivisionError (plain
        # numbers) or emitting a RuntimeWarning (numpy scalars).
        return float('nan')
    return diffprod / denominator

# In[105]:


# Collect one (user1, user2, correlation) record for every ordered pair of
# columns, using only the rows where both columns hold a rating.
correlation_matrix = []
for user1 in df_ratings.columns:
    ratings1 = df_ratings[user1].values
    for user2 in df_ratings.columns:
        ratings2 = df_ratings[user2].values
        # Keep a row only when neither of the two ratings is missing.
        both_rated = ~np.isnan(ratings1) & ~np.isnan(ratings2)
        pair_corr = pearson_def(ratings1[both_rated], ratings2[both_rated])
        correlation_matrix.append((user1, user2, pair_corr))


# In[108]:


# Turn the long list of (user1, user2, correlation) records into a square
# table: first field becomes the row label, second the column label, and
# third the cell value. Then export it.
df_corr = pd.DataFrame(correlation_matrix).pivot(index=0, columns=1, values=2)
df_corr.to_excel('correlation_data_sparse.xlsx')