###################################################### # # R interpretation of the humand body height datasets # body_sizes.txt corresponding to exercise 7-1 b) # http://www.dbs.ifi.lmu.de/cms/Maschinelles_Lernen_und_Data_Mining_12 # # programming language R available at # http://cran.r-project.org/ # # available in the CIP pool by # $ R # # This file was created by Marisa Petri (contact: # petri[aT}dbs.ifi.lmu.de) # ###################################################### m <- read.table("body_sizes.txt", header=TRUE, row.names=1) # number of individuals: N <- dim(m)[1] # mu = apply(m, 2, sum)/N sapply(m, mean) # ML for sigma is slightly different from function sd() [unbiased estimator for i.i.d. observations]: sigmaFunction <- function(x) {sqrt(sum((x-mean(x))^2)/(length(x)))} sapply(m, sigmaFunction) sapply(m, sd) # mean differs from median summary(m) # visualize first distribution hist(m[,1]) # view all combined b <- hist(unlist(m), breaks=30)$breaks c1 <- hist(m[,1], breaks=b, plot=FALSE)$counts c2 <- hist(m[,2], breaks=b, plot=FALSE)$counts c3 <- hist(m[,3], breaks=b, plot=FALSE)$counts c4 <- hist(m[,4], breaks=b, plot=FALSE)$counts lines(b[1:(length(b)-1)] + (b[2]-b[1])/5, c1, col=2, "h", lwd=3) lines(b[1:(length(b)-1)] + (b[2]-b[1])/5*2, c2, col=3, "h", lwd=3) lines(b[1:(length(b)-1)] + (b[2]-b[1])/5*3, c3, col=4, "h", lwd=3) lines(b[1:(length(b)-1)] + (b[2]-b[1])/5*4, c4, col=5, "h", lwd=3) legend("topright", c("all", colnames(m)), lwd=c(1, rep(3,4)), col=1:5) # view combined and separate layout(matrix(c(1,2,1,3,1,4,1,5), ncol=4)) # for structuring a compound of plots h <- hist(unlist(m),breaks=30, plot=FALSE) b <- h$breaks c1 <- hist(m[,1], breaks=b, plot=FALSE)$counts c2 <- hist(m[,2], breaks=b, plot=FALSE)$counts c3 <- hist(m[,3],breaks=b, plot=FALSE)$counts c4 <- hist(m[,4],breaks=b, plot=FALSE)$counts # upper plot plot(1,1,"n", xlim=c(min(b),max(b)), ylim=c(0,max(c1,c2,c3)), main="Height histograms", xlab="height [cm]", ylab="Frequency") lines(b[1:(length(b)-1)] + (b[2]-b[1])/5, ifelse(c1==0,NA,c1), col=2, "h", lwd=3) lines(b[1:(length(b)-1)] + (b[2]-b[1])*2/5, ifelse(c2==0,NA,c2), col=3, "h", lwd=3) lines(b[1:(length(b)-1)] + (b[2]-b[1])*3/5, ifelse(c3==0,NA,c3), col=4, "h", lwd=3) lines(b[1:(length(b)-1)] + (b[2]-b[1])*4/5, ifelse(c4==0,NA,c4), col=5, "h", lwd=3) legend("topright", colnames(m), lwd=c(3,3,3,3), col=2:5) # function for histogram combined with Gaussian density estimate # # i: number of the distribution to be visualized # lwd: line width of the Gaussian visualizations hist_with_normal_vis <- function(i, lwd=3) { h <- hist(m[,i], breaks=b, main=paste("Histogram of",colnames(m)[i]), xlab="height [cm]") mu <- mean(m[,i]) sigma <- sd(m[,i]) abline(v=mu, col=5, lwd=lwd) arrows(mu-sigma, max(h$counts)/2, mu+sigma, max(h$counts)/2, angle=90, code=3, lwd=lwd, col=6) x <- seq(b[1],b[length(b)],len=100) lines(x, dnorm(x, mean=mu, sd=sigma) * N, col=8, lwd=lwd) legend("topleft", expression(mu, sigma, P(mu,sigma)), lwd=3, col=c(5,6,8)) } # lower plots hist_with_normal_vis(1) hist_with_normal_vis(2) hist_with_normal_vis(3) hist_with_normal_vis(4) # finished with "view combined and separate" # count the number of elements deviating by at least # s * sigma from the mean of the sample distribution # # i: number of the distribution to be visualized # s: number of standard deviations to be measured countWithinSD <- function(i,s) { length(which(m[,i]>=mean(m[,i])-s*sd(m[,i]) & m[,i]<=mean(m[,i])+s*sd(m[,i]))) } countWithinSD(1,1) / N countWithinSD(2,1) / N countWithinSD(3,1) / N countWithinSD(4,1) / N countWithinSD(1,2) / N countWithinSD(2,2) / N countWithinSD(3,2) / N countWithinSD(4,2) / N countWithinSD(1,3) / N countWithinSD(2,3) / N countWithinSD(3,3) / N countWithinSD(4,3) / N # This is how a normally-distributed dataset should be characterized: # P(X <= mu + sigma && X >= mu - sigma) pnorm(1)-pnorm(-1) # 2 sigma pnorm(2)-pnorm(-2) # 3 sigma pnorm(3)-pnorm(-3)