######################################################
#
# R interpretation of the humand body height datasets
# body_sizes.txt corresponding to exercise 7-1 b)
# http://www.dbs.ifi.lmu.de/cms/Maschinelles_Lernen_und_Data_Mining_12
#
# programming language R available at
#    http://cran.r-project.org/
#
# available in the CIP pool by
# $ R
#
# This file was created by Marisa Petri (contact: 
# petri[aT}dbs.ifi.lmu.de)
#
######################################################

m <- read.table("body_sizes.txt", header=TRUE, row.names=1)

# number of individuals:
N <- dim(m)[1]

# mu = apply(m, 2, sum)/N
sapply(m, mean)

# ML for sigma is slightly different from function sd() [unbiased estimator for i.i.d. observations]:
sigmaFunction <- function(x) {sqrt(sum((x-mean(x))^2)/(length(x)))}
sapply(m, sigmaFunction)
sapply(m, sd)

# mean differs from median
summary(m)


# visualize first distribution
hist(m[,1])

# view all combined
b  <- hist(unlist(m), breaks=30)$breaks
c1 <- hist(m[,1], breaks=b, plot=FALSE)$counts
c2 <- hist(m[,2], breaks=b, plot=FALSE)$counts
c3 <- hist(m[,3], breaks=b, plot=FALSE)$counts
c4 <- hist(m[,4], breaks=b, plot=FALSE)$counts
lines(b[1:(length(b)-1)] + (b[2]-b[1])/5,   c1, col=2, "h", lwd=3)
lines(b[1:(length(b)-1)] + (b[2]-b[1])/5*2, c2, col=3, "h", lwd=3)
lines(b[1:(length(b)-1)] + (b[2]-b[1])/5*3, c3, col=4, "h", lwd=3)
lines(b[1:(length(b)-1)] + (b[2]-b[1])/5*4, c4, col=5, "h", lwd=3)
legend("topright", c("all", colnames(m)), lwd=c(1, rep(3,4)), col=1:5)


# view combined and separate
layout(matrix(c(1,2,1,3,1,4,1,5), ncol=4)) # for structuring a compound of plots

h  <- hist(unlist(m),breaks=30, plot=FALSE)
b <- h$breaks
c1 <- hist(m[,1], breaks=b, plot=FALSE)$counts
c2 <- hist(m[,2], breaks=b, plot=FALSE)$counts
c3 <- hist(m[,3],breaks=b, plot=FALSE)$counts
c4 <- hist(m[,4],breaks=b, plot=FALSE)$counts

# upper plot
plot(1,1,"n", xlim=c(min(b),max(b)), ylim=c(0,max(c1,c2,c3)), main="Height histograms", xlab="height [cm]", ylab="Frequency")
lines(b[1:(length(b)-1)] + (b[2]-b[1])/5, ifelse(c1==0,NA,c1), col=2, "h", lwd=3)
lines(b[1:(length(b)-1)] + (b[2]-b[1])*2/5, ifelse(c2==0,NA,c2), col=3, "h", lwd=3)
lines(b[1:(length(b)-1)] + (b[2]-b[1])*3/5, ifelse(c3==0,NA,c3), col=4, "h", lwd=3)
lines(b[1:(length(b)-1)] + (b[2]-b[1])*4/5, ifelse(c4==0,NA,c4), col=5, "h", lwd=3)
legend("topright", colnames(m), lwd=c(3,3,3,3), col=2:5)

# function for histogram combined with Gaussian density estimate
#
# i:   number of the distribution to be visualized
# lwd: line width of the Gaussian visualizations
hist_with_normal_vis <- function(i, lwd=3) {
  h <- hist(m[,i], breaks=b, main=paste("Histogram of",colnames(m)[i]), xlab="height [cm]")
  mu <- mean(m[,i])
  sigma <- sd(m[,i])
  abline(v=mu, col=5, lwd=lwd)
  arrows(mu-sigma, max(h$counts)/2, mu+sigma, max(h$counts)/2, angle=90, code=3, lwd=lwd, col=6)
  x <- seq(b[1],b[length(b)],len=100)
  lines(x, dnorm(x, mean=mu, sd=sigma) * N, col=8, lwd=lwd)
  legend("topleft", expression(mu, sigma, P(mu,sigma)), lwd=3, col=c(5,6,8))
}
# lower plots
hist_with_normal_vis(1)
hist_with_normal_vis(2)
hist_with_normal_vis(3)
hist_with_normal_vis(4)

# finished with "view combined and separate"


# count the number of elements deviating by at least 
# s * sigma from the mean of the sample distribution
#
# i:   number of the distribution to be visualized
# s:   number of standard deviations to be measured
countWithinSD <- function(i,s) {
  length(which(m[,i]>=mean(m[,i])-s*sd(m[,i]) & m[,i]<=mean(m[,i])+s*sd(m[,i])))
}

countWithinSD(1,1) / N
countWithinSD(2,1) / N
countWithinSD(3,1) / N
countWithinSD(4,1) / N

countWithinSD(1,2) / N
countWithinSD(2,2) / N
countWithinSD(3,2) / N
countWithinSD(4,2) / N

countWithinSD(1,3) / N
countWithinSD(2,3) / N
countWithinSD(3,3) / N
countWithinSD(4,3) / N


# This is how a normally-distributed dataset should be characterized:
# P(X <= mu + sigma && X >= mu - sigma)
pnorm(1)-pnorm(-1)
# 2 sigma
pnorm(2)-pnorm(-2)
# 3 sigma
pnorm(3)-pnorm(-3)