###################################################### # # R implementation of a CF+ calculation using pearson # correlation as weightin function; corresponding to # exercise 3.2 of # http://www.dbs.ifi.lmu.de/Lehre/MaschLernen/SS2008/index.html # # programming language R available at # http://cran.r-project.org/ # # available in the CIP pool by # $ R # # ###################################################### ###################################################### # # x - feature vector 1 # y - feature vector 2 # # Returns the pearson correlation on the set of all # features defined in both x _and_ y. If x or y # consist of a vector containing only a constant in # all places, returns NA (as the pearson correlation # is not defined in this case - replacing the result # by 0 is not sufficient!) # ###################################################### myCor<-function(x, y) { filt<-which(!is.na(x & y)) if (length(filt) == 0) return (0) else return (cor(x[filt],y[filt])) } ###################################################### # # m - data matrix: users (in rows) described by their # ratings of movies (in columns) # x - test instance; is ignored if s != 0 # i - index of feature to be rated in m # s - index of test instance in m # # Returns the CF+ Score for movie i in a test pattern # x. If s != 0, user s is taken as test pattern. # ###################################################### cfp<-function (m, x, i=0, s=0) { if (s == 0) stopifnot(dim(m)[2] - 1 == length(x)) if (i == 0) i<-dim(m)[2] # exclude feature i from matrix m if (i == 1) mtmp<-m[,2:dim(m)[2]] if (i == dim(m)[2]) mtmp<-m[,1:(i-1)] if (i != 1 & i != dim(m)[2]) mtmp<-m[,c(1:(i-1),(i+1):dim(m)[2])] tres<-0 w<-0 for (j in 1:(dim(m)[1])) { if (!is.na(m[j,i])) { if (s == 0) # external test vector tempDist <-myCor(x,mtmp[j,]) else { # use row as test vector if (s == j) # avoid self-training next tempDist <-myCor(mtmp[s,],mtmp[j,]) } if (is.na(tempDist)) next # skip instance with useless correlation # mean(instance j) is taken over complete dataset tres <- tres + tempDist * (m[j,i] - mean(m[j,])) w <- w + abs(tempDist) } } if (w == 0) return (NA) # mean(z) must be built without feature i return (mean(ifelse(s==0,x,mtmp[s,])) + 1/w * tres) } ###################################################### # # m - data matrix # o - ordering selector for the columns of m to be # plotted # ###################################################### plotData<-function(m, order=NULL) { N<-dim(m)[1] M<-dim(m)[2] if (is.null(order)) o<-1:M plot(1,1,"n",xlim=range(.5,M+.5),ylim=range(.5,5),main="CF example dataset",ylab="rating",xlab="movies",xaxt="n") axis(1, at=1:M, labels=videos[o]) cols=rainbow(N) for (i in 1:N) { lines(1:M, m[i,o], col=cols[i], lwd=3) points(1:M, m[i,o], col=cols[i], lwd=3, cex=2) } legend("bottomright",legend=paste("user",1:N),col=cols,lwd=3,cex=.75) } ###################################################### # # movie - index id of movie to be tested in m # (relative to ordering vector o) # user - id of user to be tested # v - verbose? # # One of movie and user must be != 0 # # Returns the Mean Square Error of the predictions of # each user's rating for a special movie or the # ratings of a special user for each movie. # ###################################################### getMSqError<-function(movie=0,user=0,v=TRUE) { stopifnot(!(movie==0 && user==0)) err<-0 n<-ifelse(movie==0, dim(m)[2], dim(m)[1]) for (i in 1:n) { ti <- ifelse(movie==0, user, i) tj <- ifelse(movie==0, i, movie) if (v) print (cfp(m,0,o[tj],ti)) err <- err + (cfp(m,0,o[tj],ti) - m[ti,o[tj]])^2 } sprintf("MSqErr: %1.3f",err/n) } getMError<-function(movie=0,user=0,v=TRUE) { stopifnot(!(movie==0 && user==0)) err<-0 n<-ifelse(movie==0, dim(m)[2], dim(m)[1]) for (i in 1:n) { ti <- ifelse(movie==0, user, i) tj <- ifelse(movie==0, i, movie) if (v) print (cfp(m,0,o[tj],ti)) err <- err + abs(cfp(m,0,o[tj],ti) - m[ti,o[tj]]) } sprintf("MeanErr: %1.3f",(err/n)) } # declare data videos<-c("300", "Juno", "Crank2", "Milk", "Indy4", "Wall-E") m<-matrix(c(5,1,5,1,4,5, 5,2,5,1,5,4, 3,4,2,3,3,2, 2,5,1,4,3,3, 1,3,1,2,1,1),byrow=T,ncol=6) o<-c(3,1,6,5,2,4) # ordering matrix for clustering the dataset plotData(m) plotData(m,o) # a) Crank2 = Film 3 getMError(3) # zum Vergleich die tatsächliche Wertungen: m[,3] # b) # find most predictable movie for (i in 1:6) { print(sprintf("%6s : %s, %s",videos[o[i]],getMError(i, v=FALSE), getMSqError(i, v=FALSE))) } # find most predictable user for (i in 1:5) { print(sprintf("user %1.0f : %s, %s",i,getMError(user=i, v=FALSE), getMSqError(user=i, v=FALSE))) } # # user analysis: # plotData(m,o) pred5 <- c(); pred3<-c() for(i in 1:6) { pred5 <- c(pred5, cfp(m,0,o[i],5)) pred3 <- c(pred3, cfp(m,0,o[i],3))} lines(1:6,pred3,col=rainbow(5)[3],lwd=3,lty=3) lines(1:6,pred5,col=rainbow(5)[5],lwd=3,lty=3) legend("right",legend=c("pred 3","pred 5"), lty=3, lwd=3, col=rainbow(5)[c(3,5)], cex=.75) plotData(m,o) for(j in 1:5) { predX <- c(); for(i in 1:6) { predX <- c(predX, cfp(m,0,o[i],j))} lines(1:6,predX,col=rainbow(5)[j],lwd=3,lty=3) } legend("right",legend=paste("pred", 1:5), lty=3, lwd=3, col=rainbow(5), cex=.75) # c) # random vectors for "Crank2" prediction ne<-c();s<-10000; for (i in 1:s) { ne <- c(ne, cfp(m, sample(1:5,5,replace=TRUE),5)) } mean(ne,na.rm=TRUE) sd(ne,na.rm=TRUE) # d) # random vectors for all movies' predictions ne<-c();s<-10000; for (i in 1:s) { ne <- c(ne, cfp(m, sample(1:5,5,replace=TRUE),sample(1:6,1))) } mean(ne,na.rm=TRUE) sd(ne,na.rm=TRUE) # simulation of biased Dataset dims<-100 m<-matrix(sample(1:5,dims*5,replace=TRUE),ncol=5) m<-cbind(5,m) # random distribution except for 300 ne<-c();s<-1000; for (i in 1:s) { ne <- c(ne, cfp(m, sample(1:5,5,replace=TRUE),sample(1:6,1))) } mean(ne,na.rm=TRUE) # insignificant dependence on dataset sd(ne,na.rm=TRUE)