#########################################################
#
#
#
#
# Exercise - day 3 
#
#
#
##########################################################


# First a selection of important commands and examples.
# Please take your time and go through the examples attentively.
# Repeat them yourself with different values.

### vectors
v <- c(13:19, 12, 13) # creates a vector: 13, 14, ..., 19 followed by 12 and 13
v
length(v) # returns the length of the vector v
rev(v) # returns the 'rev'ersed vector
sort(v) # returns the sorted vector
order(v) # the index vector for sorting: v[order(v)] is the same as sort(v)
duplicated(v) # TRUE for each element that already occurred earlier in v
unique(v) # returns the vector without repeated elements
some <- (v > 13) # logical vector with TRUE for the values fulfilling the condition
some
which.max(v) # returns the index of the maximum (first such index)
which.min(v) # returns the index of the minimum (first such index)


###
## Create a vector with 1000 random numbers between -1 and 1
# and manipulate it in the same way as in the example above.
# help(runif)

# Matrices are usually created with ’matrix’, by converting a vector into a matrix or by binding
#  vectors together
# Build a matrix with 4 rows and 2 columns from the numbers 1 to 8
# (the values are filled in column by column).
m <- matrix(1:8, nrow = 4, ncol = 2)
m
matrix(data = 1:8, nrow = 4, ncol = 2) # same matrix; positional form: matrix(1:8, 4, 2)
### Remember: indexing is always [row, column]
m[3, 2] # single entry: third row, second column
# An empty field after the comma means "no column selected",
# so the complete second row with all its columns is returned.
m[2, ]

m[, 2] # complete second column (empty field before the comma = all rows)
m[c(2, 3), c(1, 2)] # submatrix: rows 2 to 3 of columns 1 to 2


### Create some matrices yourself with different dimensions and values (e.g. random or gaussian)
# and extract different aspects (rows, columns, submatrices or single values).
# Try to find out how to change different values in the matrix (e.g. change value matrix[2,3] to 60 )


############
# Loops  ###
# A for-loop in R is used to iterate over the elements of a vector;
# in each iteration the statements in the loop body are executed once.
# You only need to specify the values to iterate over, i.e. how many times
# those operations need to be executed.
# Variables that are updated inside the loop (e.g. a running sum) need an
# initial value before the loop is performed.
# See what the following loop is doing with the data

# Add up the values 2, 4, 6 and 8 one by one and report every value on the way.
# R is case sensitive: the accumulator has to be spelled 'Summe' everywhere,
# otherwise the loop stops with "object 'summe' not found".
Summe <- 0
for (a in c(2, 4, 6, 8)) {
  Summe <- Summe + a
  cat("a has now the value ", a, "\n") # "\n" inserts a line break
}
Summe

# Now please write a loop yourself to calculate the sum of squares of the first 20 numbers
# Accumulate n^2 for n = 1, ..., 20 in a running total.
Summe_Square <- 0
for (n in seq_len(20)) {
  Summe_Square <- Summe_Square + n^2
}
Summe_Square


# Both loop results can be obtained without a loop via the "sum()" command.
sum(c(2, 4, 6, 8)) # sum of 2, 4, 6 and 8
sum((1:20)^2) # sum of squares of the first 20 numbers


##########  The commands apply(), lapply(), sapply() and tapply()
# The command apply() applies a function to the rows or columns of a matrix.
# Also existent: lapply() for vectors, lists and data frames
# sapply() same as lapply() but sapply() tries to 's'implify its output
# in the following are some examples for lapply(), sapply() and apply():
v <- 1:4
v
lapply(v, factorial) # returns a list with one element per entry of v
sapply(v, factorial) # returns a plain vector (the 's'implified result)
L <- list(0:3, 5:8, -1:2)
L
sapply(L, mean) # mean of each vector in the list
m <- cbind(0:3, 5:8, -1:2) # matrix with the three vectors as columns
m
apply(m, 1, sum) # MARGIN = 1: apply the function to each row
apply(m, 2, mean) # MARGIN = 2: apply the function to each column

### let's do some example with the command tapply
# The command tapply() is typically applied to data frames. 
# This command is frequently used and therefore important.
# In the following data-frame each individual is either smoker or non-smoker 
# and belongs to one of the three weight classes 1, 2 or 3.
#
# Toy data: 12 individuals, weight classes 3, 2, 1 (four individuals each),
# every third individual is a smoker, lifespans 50, 52, ..., 72.
riskfactors <- data.frame(
  individual = 1:12,
  weightcls = rep(3:1, each = 4),
  smoker = rep(c(0, 0, 1), times = 4),
  lifespan = seq(from = 50, to = 72, by = 2)
)
riskfactors
# attach() puts the columns on the search path, so "lifespan" can be typed
# instead of "riskfactors$lifespan" (the same holds for the other columns).
attach(riskfactors)

## Question: what is the average lifespan of smokers and of non-smokers in
# this self-generated data? One way is to subset by hand:
mean(lifespan[smoker == 0])
mean(lifespan[smoker == 1])

## Subsetting by hand becomes inconvenient if the factor has many values.
# tapply() is more elegant: it splits the first vector into the groups given
# by the second vector and applies the function (here 'mean') to each group.

tapply(lifespan, smoker, mean) # mean lifespan of non-smokers (0) and smokers (1)
tapply(lifespan, weightcls, mean) # mean lifespan within each weight class

#####################
# Use again the data-set "birthwt" from the first lecture

library(MASS) # package that contains the data set "birthwt"
data(birthwt) # load a fresh copy of the data set into the workspace
help("birthwt") # description of the data set and its columns
# Note: birthwt is deliberately NOT attach()ed. An attached copy is a snapshot
# of the data frame at that moment, so later changes to birthwt (e.g. new
# column names) would not be visible through it. Use birthwt$column instead.
str(birthwt) # structure: column names, types and first values
dim(birthwt) # number of rows and columns
table(birthwt$age) # how often does each age occur
summary(birthwt$age) # minimum, quartiles, mean and maximum of age

boxplot(birthwt$age)
boxplot(birthwt$age, main = "Boxplot of Mothers Age") # same plot with a title
type.freq <- table(birthwt$age) # frequency table of the ages
type.freq
barplot(type.freq, main = "Barplot of age frequencys ")
sd(birthwt$age) # standard deviation of age

colnames(birthwt) # the original, abbreviated column names
# Replace them by self-explanatory names (one name per column, same order).
colnames(birthwt) <- c(
  "birthwt.below.2500", "mother.age",
  "mother.weight", "race",
  "mother.smokes", "previous.prem.labor",
  "hypertension", "uterine.irr",
  "physician.visits", "birthwt.grams"
)
head(birthwt)
# The columns are addressed through the data frame (birthwt$...), so the new
# names are found no matter what is currently on the search path.
hist(birthwt$birthwt.grams)
# Mean birth weight per group, computed with tapply():
tapply(birthwt$birthwt.grams, birthwt$mother.smokes, mean) # group birth weight by smoking status
tapply(birthwt$birthwt.grams, birthwt$race, mean) # group birth weight by race

##
# use the t-test to examine, whether smoking during pregnancy and low birth weight are related.
# (mu = 0: the hypothesised difference between the two group means is zero)
t.test(birthwt.grams ~ mother.smokes, mu = 0, data = birthwt)
# what is the p-value of the test telling you ? 
## 


##############
#
#
# Now continue with the exercise from yesterday - health insurance data
#
#

# Read the tab-separated patient table directly from the course website.
Patients <- read.delim("https://cbdm.uni-mainz.de/files/2016/02/Patients.txt")

# View() opens a spreadsheet-like viewer and therefore needs an interactive
# session; it is skipped when the script is run non-interactively.
if (interactive()) {
  View(Patients)
}
head(Patients)
# Have a look at the data in "Patients"
# plot them from different perspectives and create a summary over the two variables "Age" and "Contribution"
plot(Patients$Contribution) # values in the order of the table
plot(sort(Patients$Contribution)) # values in increasing order
plot(Patients$Age)
plot(sort(Patients$Age))
plot(Patients$Age, Patients$Contribution) # scatter plot: contribution against age
# The title is passed to hist() directly; a separate title() call would be
# printed on top of the default title that hist() already draws.
hist(Patients$Age, main = "Histogram des Alters von 1000 Patienten")
summary(Patients)
boxplot(Patients$Age)
# Number of patients, taken from the data instead of being typed in by hand
# (the histogram title above suggests 1000 - verify against the file).
Nr_Patients <- nrow(Patients)
##
# Now you would like to plot a histogram of age with a self-chosen partitioning.
# Please choose the following breakpoints
# 0,5,10,15,...,100
# hint: help(seq)
#       help(hist) -> check here the information about "breaks"
# and compare the outcome to the German population census data
## from http://www.indexmundi.com/germany/age_structure.html
# Is the sample representative (at first glance)?

# Class limits 0, 5, 10, ..., 100 for the age histogram.
breakpoints <- seq(0, 100, by = 5)
hist(Patients$Age, breaks = breakpoints) # one bar per 5-year age class

## Load the data "Claims" and "Fever" from the website: 
###        https://cbdm.uni-mainz.de/mb16/
# Both files are read without a header line, so R names the columns V1, V2, ...
Claims <- read.delim(
  "https://cbdm.uni-mainz.de/files/2016/02/Claims.txt",
  header = FALSE
)
# View(Claims)


Fever <- read.delim(
  "https://cbdm.uni-mainz.de/files/2016/02/Fever.txt",
  header = FALSE
)
# View(Fever)
# Have a look at the data in "Claims":
# claims per patient (rows) and month (columns) for one year
summary(Claims)
# Rename the columns to show that V1-V12 are the 12 months
colnames(Claims) <- c(
  "Jan", "Feb", "March", "April", "May", "June",
  "July", "Aug", "Sept", "Oct", "Nov", "Dec"
)
summary(Claims)
## Plot the claims from January (first column, rows 1 to 1000)
plot(Claims[1:1000, 1])
## Plot the same claims in increasing order  # hint: help("sort")
plot(sort(Claims[1:1000, 1]))


##### Plot the claims from patient Nr 1 over the whole year ######
# extract the first row from the table
# Row 1 is patient Nr. 1; columns 1:12 hold the claims for the 12 months.
Claim_P1 <- Claims[1, 1:12]
# Convert the one-row data frame into a numeric matrix
# (rownames.force is left at its default, NA).
d <- data.matrix(Claim_P1)
d

#### Make the plot nicer (lines between the points and labels on the axes) ##
# hint: "help(plot)" and "help(title)"
# type = "b" draws 'b'oth points and the lines connecting them.
plot(
  x = 1:12, y = d[1:12], type = "b",
  main = "Claims of Patient # 1",
  xlab = "month", ylab = "Claims in Euro",
  col = "blue"
)

## Plot a histogram of the Claims of in January
hist(Claims[1:1000, 1]) # histogram of the claims in column 1 (January)
# add an x-label
hist(Claims[1:1000, 1], xlab = "Claims in Euro")
## Plot a histogram of the claims in August (column 8)
hist(Claims[1:1000, 8], xlab = "Claims in Euro")
# Total claims per patient (row sums) and per month (column sums)
# hint: "help(apply)" - the second argument selects rows (1) or columns (2)
row.sums_Claims <- apply(Claims, 1, sum)
col.sums_Claims <- apply(Claims, 2, sum)
## Create a bar-plot for the monthly claims
# The title names all patients: the bars are the column sums over the whole table.
barplot(col.sums_Claims,
  main = "Monthly Claims of all Patients",
  xlab = "month", ylab = "Claims in Euro", col = "blue"
)

#####  Now have a look at the data in "Fever"   ########
#
# Fever is only recordet in discrete values (yes = 1), (no=0)
#
## Plot the events of patients having fever from January 


#### looks strange,... what is wrong ????

## Plot the sorted fever values  # hint: help("sort")

## obviously there are not only 0 and 1 but also strongly negative values in the table
## -99 was here the value to flag missing values
# Data sets are often not complete. 
# There might be values which are simply not known. These missing values are recorded as
#  NA (= not available). R deals quite well with missing data. Many commands have arguments
# to tell the command how to deal with NAs. The command for detecting missing values is is.na().
# However, R does not know, that the missing values in THIS datset are called -99.
# R knows by default only "NA"
# recode -99 to missing (NA) in table Fever
# Mark every cell that holds the missing-value code -99 as NA.
is.na(Fever) <- Fever == -99
Fever


# Fever per Patient and Month for one year
# Change the Column-Names to show that V1-V12 are the 12 Month
# make afterwards a summary over the data

# In the following, you will have to build the sums over the columns and rows
## hint:
# help(rowSums)
# help(colSums)
## How many patients had fever in January?

# How often did the patient with the ID 765 have fever over the whole year?


## Create a bar-plot for the monthly report of fever
# Column and row totals of the fever table. They are computed right here so the
# plots do not depend on objects that are not created anywhere else in the script.
# na.rm = TRUE skips cells that are NA (missing) instead of turning the sum into NA.
col.sums_Fever <- colSums(Fever, na.rm = TRUE)
row.sums_Fever <- rowSums(Fever, na.rm = TRUE)
barplot(col.sums_Fever) # number of fever records per month over all patients
barplot(sort(row.sums_Fever)) # number of fever records per patient over the year, sorted

##### Plot the Fever-Curve from patient Nr 5 over the whole year ######
# extract the row Nr. 5 from the table


##### Plot the Claims-Curve from patient Nr 5 over the whole year ######

####  Plot both curves together in one figure 
# Hint: use par(mfrow=c(2,1))


## Compute how much the insurance company had to pay for her clients and how much she earned via the patients contributions


## Compute a probability for a random person to get sick in a particular month (illness is interpreted as an insurance claim
# that is bigger than zero)
# There are different possible ways. Here is one:
# Check help(length)