# A dictionary of movie critics and their ratings of a small
# set of movies.
# Layout: critic name -> {movie title -> rating}. A movie a critic has not
# rated is simply absent from that critic's inner dictionary.
critics = {
    'Lisa Rose': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'Superman Returns': 3.5,
        'You, Me and Dupree': 2.5,
        'The Night Listener': 3.0,
    },
    'Gene Seymour': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 1.5,
        'Superman Returns': 5.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 3.5,
    },
    'Michael Phillips': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.0,
        'Superman Returns': 3.5,
        'The Night Listener': 4.0,
    },
    'Claudia Puig': {
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'The Night Listener': 4.5,
        'Superman Returns': 4.0,
        'You, Me and Dupree': 2.5,
    },
    'Mick LaSalle': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'Just My Luck': 2.0,
        'Superman Returns': 3.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 2.0,
    },
    'Jack Matthews': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'The Night Listener': 3.0,
        'Superman Returns': 5.0,
        'You, Me and Dupree': 3.5,
    },
    'Toby': {
        'Snakes on a Plane': 4.5,
        'You, Me and Dupree': 1.0,
        'Superman Returns': 4.0,
    },
}
from math import sqrt
from pandas import DataFrame
import numpy as np
import operator
import scipy.stats.stats as scss
import pandas as pd
#
#import pydelicious
def sim_distance(prefs, person1, person2):
    """Return a Euclidean-distance based similarity between two columns.

    Args:
        prefs: pandas DataFrame; ``person1`` and ``person2`` are column
            labels and NaN marks a missing rating.
        person1: label of the first column to compare.
        person2: label of the second column to compare.

    Returns:
        ``1 / (1 + d)`` where ``d`` is the Euclidean distance over the rows
        rated in both columns, so identical ratings give 1.0 and larger
        distances tend towards 0. Returns 0 when the two columns share no
        rated row.
    """
    # The subtraction yields NaN wherever either side is missing, so dropping
    # NaN leaves exactly the rows both columns have rated.
    diff = (prefs[person1] - prefs[person2]).dropna()
    if diff.empty:
        return 0
    return 1 / (1 + sqrt((diff ** 2).sum()))
def sim_pearson(prefs, person1, person2):
    """Return the Pearson correlation between two columns of ``prefs``.

    Only rows where both columns hold a value (not NaN) take part in the
    correlation.

    Args:
        prefs: pandas DataFrame; ``person1`` and ``person2`` are column
            labels and NaN marks a missing rating.
        person1: label of the first column to compare.
        person2: label of the second column to compare.

    Returns:
        The Pearson correlation coefficient in [-1, 1], or 0 when it is
        undefined: fewer than two rows in common, or one of the two columns
        is constant over the common rows.
    """
    # Imported from the public namespace; the ``scipy.stats.stats`` alias is
    # deprecated in SciPy.
    from scipy.stats import pearsonr

    common = prefs[[person1, person2]].dropna(axis=0, how='any')
    # pearsonr raises ValueError for fewer than two samples.
    if len(common) < 2:
        return 0
    first = common[person1]
    second = common[person2]
    # A constant series has zero variance, so the correlation would be NaN;
    # report "no similarity" rather than letting NaN leak into rankings.
    if first.nunique() < 2 or second.nunique() < 2:
        return 0
    return pearsonr(first, second)[0]
def topMatches(prefs, person, n=3, similarity=sim_distance):
    """Return the ``n`` columns of ``prefs`` most similar to ``person``.

    Args:
        prefs: pandas DataFrame whose columns are the entities being
            compared.
        person: column label to find matches for; it is never matched
            against itself.
        n: maximum number of matches to return.
        similarity: callable ``(prefs, a, b) -> score``; higher means more
            similar.

    Returns:
        A list of at most ``n`` ``(label, score)`` tuples, best score first.
    """
    scores = {
        other: similarity(prefs, person, other)
        for other in prefs.columns
        if other != person
    }
    ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[:n]
def getRecommendations(prefs, person, similarity=sim_pearson):
    """Rank the items ``person`` has not rated by a weighted average of
    everybody else's ratings.

    Each other person's rating is weighted by that person's similarity to
    ``person``; people whose similarity is not positive are ignored.

    Args:
        prefs: data accepted by ``DataFrame()``. It is transposed before use,
            so its rows must be the people and its columns the items.
        person: row label (in ``prefs``) of the person to recommend for.
        similarity: callable ``(frame, a, b) -> score`` that is handed the
            transposed frame, i.e. one column per person.

    Returns:
        A list of ``(predicted_rating, item)`` tuples, highest prediction
        first. Items rated by ``person`` (non-NaN and non-zero) are excluded.
    """
    # After the transpose every column holds one person's ratings, which is
    # the layout the similarity functions index into.
    ratings = DataFrame(prefs).T
    own = ratings[person]
    # Only items the target person has not rated (missing or 0) are candidates.
    candidates = own.isna() | (own == 0)

    totalSums = {}
    simSums = {}
    for other in ratings.columns:
        if other == person:
            continue
        sim = similarity(ratings, other, person)
        # `not sim > 0` rejects zero, negative and NaN similarities alike.
        if not sim > 0:
            continue
        # Ratings this person gave to candidate items; missing ones dropped.
        for item, rating in ratings.loc[candidates, other].dropna().items():
            totalSums[item] = totalSums.get(item, 0) + rating * sim
            simSums[item] = simSums.get(item, 0) + sim

    # Normalise by the total similarity so an item rated by many people does
    # not win merely because more terms were summed.
    return sorted(
        ((total / simSums[item], item) for item, total in totalSums.items()),
        reverse=True,
    )
def caculateSimilarItems(prefs, n=10):
    """Precompute, for every column of ``prefs``, its most similar columns.

    Args:
        prefs: pandas DataFrame whose columns are the items to compare.
        n: number of neighbours to keep per item.

    Returns:
        A dict mapping each column label to the list of ``(label, score)``
        tuples returned by ``topMatches`` using ``sim_distance``.
    """
    return {
        item: topMatches(prefs, item, n=n, similarity=sim_distance)
        for item in prefs.columns
    }
def getRecommandationItems(prefs, simularItems, person):
    """Recommend items for ``person`` from a precomputed item-similarity table.

    For every item the person has rated, each similar item they have not
    rated receives that rating weighted by the similarity. The weighted sum
    is then normalised by the total similarity.

    Args:
        prefs: pandas DataFrame with one row per person and one column per
            item; NaN marks an item the person has not rated.
        simularItems: mapping ``item -> [(other_item, similarity), ...]``,
            e.g. the result of ``caculateSimilarItems``.
        person: row label of the person to recommend for.

    Returns:
        A list of ``(predicted_rating, item)`` tuples, highest prediction
        first. Items whose similarities sum to zero are left out because no
        prediction can be computed for them.
    """
    # Every item the current user has already rated.
    watched = prefs.loc[person, :].dropna()

    totals = {}
    sumSims = {}
    for seen_item, rating in watched.items():
        # No neighbour list was computed for this item.
        if seen_item not in simularItems:
            continue
        for item, sim in simularItems[seen_item]:
            if item in watched.index:
                continue
            totals[item] = totals.get(item, 0) + rating * sim
            sumSims[item] = sumSims.get(item, 0) + sim

    # A zero similarity total would make the score NaN (or raise
    # ZeroDivisionError), and NaN breaks the ordering of the sort below.
    ranking = [
        (score / sumSims[item], item)
        for item, score in totals.items()
        if sumSims[item] != 0
    ]
    ranking.sort(reverse=True)
    return ranking
if __name__ == "__main__":
    # Demo: item-based recommendations on the MovieLens "ml-latest-small" CSVs.
    # Raw string: the Windows path contains backslash sequences (\D, \C, \9,
    # ...) that are invalid escapes in a normal string literal.
    homePath = r'P:\Department\Celer\個人資料夾\F3234506_麗娟\Extra\Python\资源\9780596529321-master\PCI_Code Folder\chapter2\DataSet\ml-latest-small'

    with open(homePath + '\\ratings.csv', 'rb') as ratings_f:
        ratings = pd.read_csv(ratings_f)
    with open(homePath + '\\movies.csv', 'rb') as movies_f:
        movies = pd.read_csv(movies_f)
    # tags is loaded but not used below (a merge on movieId/userId was left
    # disabled in the original script).
    with open(homePath + '\\tags.csv', 'rb') as tags_f:
        tags = pd.read_csv(tags_f)

    # Titles become column labels in the pivot, so keep one row per title.
    movies = movies.drop_duplicates(['title'])
    # NOTE(review): an outer merge keeps movies nobody rated and ratings whose
    # movie row was dropped above, leaving NaN userId/title values for the
    # pivot -- confirm this is intended rather than how='inner'.
    data = pd.merge(ratings, movies, on='movieId', how='outer')
    # One row per user, one column per movie title, cells hold the rating.
    data = data.pivot(index='userId', columns='title', values='rating')

    # Only the first 500 title columns are compared.
    similarity = caculateSimilarItems(data.iloc[:, 0:500])
    recommItems = getRecommandationItems(data, similarity, 6)
    print(recommItems)
# Source: https://www.cnblogs.com/Colleen-Blog/p/10865201.html