##########################################################
#                                                        #
#                   Exercise - day 3                     #
#                                                        #
##########################################################

# First a selection of important commands and examples.
# Please take your time and work attentively through the examples;
# repeat them yourself with different values.

### Vectors
v <- c(13:19, 12, 13)   # this creates a vector v
length(v)               # returns the length of the vector v
rev(v)                  # returns the 'rev'ersed vector
sort(v)                 # returns the sorted vector
indexvec <- order(v)    # the index vector for sorting: v[indexvec] equals sort(v)
indexvec
duplicated(v)           # identifies multiple elements
unique(v)               # returns vector without multiple elements
some <- (v > 13)        # returns a vector with "TRUE" for those values
                        # fulfilling the condition
some
which.max(v)            # returns the index of the maximum (first such index)
which.min(v)            # returns the index of the minimum (first such index)

###
## Create and manipulate a vector with 1000 entries and random numbers
## between -1 and 1 and manipulate it the same way as in the sample above.
# help(runif)

# Matrices are usually created with 'matrix', by converting a vector into a
# matrix, or by binding vectors together.
m <- matrix(data = 1:8, nrow = 4, ncol = 2)
m
matrix(1:8, 4, 2)       # same as matrix(data = 1:8, nrow = 4, ncol = 2)

### Remember: indexing is first "row", then "column".
m[3, 2]                 # entry in the third row and second column
m[2, ]                  # second row (since the field behind the comma is empty,
                        # i.e. no column is specified, the whole row is taken
                        # with all columns)
m[, 2]                  # second whole column
m[2:3, 1:2]             # submatrix

### Create some matrices yourself with different dimensions and values
### (e.g. random or gaussian) and extract different aspects (rows, columns,
### submatrices or single values).
# Try to find out how to change different values in the matrix
# (e.g. change value matrix[2, 3] to 60).

############
# Loops
###
# A for-loop in R is used to iterate over a vector;
# in each iteration a statement is executed.
# You only need to specify how many times or upon which conditions
# those operations need execution.
# Before the first run, you need to assign an initial value to the variable
# that is updated inside the loop, then perform the loop.
# See what the following loop is doing with the data.
Summe <- 0
for (a in c(2, 4, 6, 8)) {
  # R is case-sensitive: the accumulator must be spelled exactly like the
  # variable initialised above ("Summe", not "summe").
  Summe <- Summe + a
  cat("a has now the value ", a, "\n")  ## '\n' inserts a line break
}
Summe

# Please write now a loop yourself to calculate the sum of squares of the
# first 20 numbers.
Summe_Square <- 0
for (n in 1:20) {
  Summe_Square <- Summe_Square + n^2
}
Summe_Square

sum(c(2, 4, 6, 8))      # same result with usage of the "sum()" command
sum((1:20)^2)

########## The commands apply(), lapply(), sapply() and tapply()
# The command apply() applies a function over the rows or columns of a matrix.
# Also existent: lapply() for vectors, lists and data frames;
#                sapply() same as lapply(), but sapply() tries to
#                's'implify its output.
# In the following are some examples for lapply(), sapply() and apply():
v <- 1:4
v
lapply(v, factorial)    # returns a list
sapply(v, factorial)    # returns a vector

L <- list(0:3, 5:8, -1:2)
L
sapply(L, mean)         # mean of each vector in the list

m <- cbind(0:3, 5:8, -1:2)
apply(m, 2, mean)       # mean of each column of the matrix (same values as above)

### Let's do some examples with the command tapply().
# The command tapply() is typically applied to data frames.
# This command is frequently used and therefore important.
# In the following data frame each individual is either smoker or non-smoker
# and belongs to one of the three weight classes 1, 2 or 3.
riskfactors <- data.frame(
  individual = 1:12,
  weightcls  = rep(3:1, c(4, 4, 4)),
  smoker     = rep(c(0, 0, 1), 4),
  lifespan   = seq(50, 72, 2)
)
riskfactors

attach(riskfactors)  ## this makes life easier since we don't have to type
                     ## "riskfactors$individual" any more but can refer now
                     ## to "individual"
# NOTE(review): attach() is convenient interactively but puts the columns on
# the search path; in scripts prefer riskfactors$lifespan or with(riskfactors, ...).

## Question: what is the average lifespan for smokers and non-smokers in our
## self-generated data? This could be done as follows.
mean(lifespan[smoker == 0])
mean(lifespan[smoker == 1])
## However, this becomes inconvenient if the factor has many values.
# More elegant is the command tapply().
# The following command applies the function 'mean' to the two subvectors of
# lifespan which are determined by the vector 'smoker'.
# The columns are addressed explicitly through riskfactors$..., so the calls
# work regardless of whether the data frame is attached.
tapply(riskfactors$lifespan, riskfactors$smoker, mean)
tapply(riskfactors$lifespan, riskfactors$weightcls, mean)  # group lifespan according to weightcls

#####################
# Use again the data set "birthwt" from the first lecture.
library(MASS)
data(birthwt)
help("birthwt")
# birthwt is deliberately NOT attached: the columns are renamed below, and an
# attached copy would keep the old names (so "birthwt.grams" would not be found).
str(birthwt)
dim(birthwt)

table(birthwt$age)
summary(birthwt$age)
boxplot(birthwt$age)
boxplot(birthwt$age, main = "Boxplot of Mothers Age")

type.freq <- table(birthwt$age)
type.freq
barplot(type.freq, main = "Barplot of age frequencys ")
sd(birthwt$age)

# Replace the short original column names by descriptive ones.
colnames(birthwt)
colnames(birthwt) <- c(
  "birthwt.below.2500", "mother.age", "mother.weight", "race",
  "mother.smokes", "previous.prem.labor", "hypertension", "uterine.irr",
  "physician.visits", "birthwt.grams"
)
head(birthwt)
hist(birthwt$birthwt.grams)

# Mean birth weight grouped by smoking status and by race
# (same tapply() pattern as for the riskfactors data above).
tapply(birthwt$birthwt.grams, birthwt$mother.smokes, mean)
tapply(birthwt$birthwt.grams, birthwt$race, mean)

##
# Use the t-test to examine whether smoking during pregnancy and low birth
# weight are related.
t.test(birthwt.grams ~ mother.smokes, mu = 0, data = birthwt)
# What is the p-value of the test telling you?
##

##############
#
# Now continue with the exercise from yesterday - health insurance data
#
Patients <- read.delim("https://cbdm.uni-mainz.de/files/2016/02/Patients.txt")
# View() opens a data viewer; only call it in an interactive session.
if (interactive()) View(Patients)
head(Patients)

# Have a look at the data in "Patients":
# plot them from different perspectives and create a summary over the two
# variables "Age" and "Contribution".
plot(Patients$Contribution)
plot(sort(Patients$Contribution))
plot(Patients$Age)
plot(sort(Patients$Age))
plot(Patients$Age, Patients$Contribution)
# Pass the title via 'main'; calling title(main = ...) after hist() would
# draw it on top of the default histogram title.
hist(Patients$Age, main = "Histogram des Alters von 1000 Patienten")
summary(Patients)
boxplot(Patients$Age)
# Number of patients, taken from the data (the original script hard-coded 1000).
Nr_Patients <- nrow(Patients)

##
# Now you'd like to plot a histogram of age with a self-chosen partitioning.
# Please choose the following breakpoints:
# 0, 5, 10, 15, ..., 100
# hint: help(seq)
#       help(hist) -> check here the information about "breaks"
# and compare the outcome to the German population census data
## from http://www.indexmundi.com/germany/age_structure.html
# Is the sample representative (at a first glance)?
breakpoints <- seq(from = 0, to = 100, by = 5)
hist(Patients$Age, breaks = breakpoints)

## Load the data "Claims" and "Fever" from the website:
### https://cbdm.uni-mainz.de/mb16/
# Both files are read with header = FALSE, so R names the columns V1, V2, ...
Claims <- read.delim("https://cbdm.uni-mainz.de/files/2016/02/Claims.txt",
                     header = FALSE)
# View(Claims)
Fever <- read.delim("https://cbdm.uni-mainz.de/files/2016/02/Fever.txt",
                    header = FALSE)
# View(Fever)

# Have a look at the data in "Claims":
# claims per patient and month for one year.
summary(Claims)

# Change the column names to show that V1-V12 are the 12 months.
colnames(Claims) <- c("Jan", "Feb", "March", "April", "May", "June",
                      "July", "Aug", "Sept", "Oct", "Nov", "Dec")
summary(Claims)

## Plot the claims from January (column 1).
plot(Claims[1:1000, 1])

## Plot the sorted claims.
# hint: help("sort")
plot(sort(Claims[1:1000, 1]))

##### Plot the claims from patient Nr 1 over the whole year ######
# Extract the first row from the table:
# row 1 is patient Nr. 1 and columns 1:12 are the claims over the whole year.
Claim_P1 <- Claims[1, 1:12]
# Remove row numbers: convert the one-row data frame into a numeric matrix.
d <- data.matrix(Claim_P1, rownames.force = NA)
d

#### Make the plot nicer (add lines between the points and add labels to the axes)
##
# hint: "help(plot)" and "help(title)"
# type = "b" draws both points and connecting lines.
plot(1:12, d[1:12], type = "b",
     main = "Claims of Patient # 1",
     xlab = "month", ylab = "Claims in Euro", col = "blue")

## Plot a histogram of the claims in January (column 1).
hist(Claims[1:1000, 1])
# Add an x-label.
hist(Claims[1:1000, 1], xlab = "Claims in Euro")

## Plot a histogram of the claims in August (column 8).
hist(Claims[1:1000, 8], xlab = "Claims in Euro")

# Compute the total claims per patient (rows) and per month (columns).
# hint: "help(apply)", or the specialised help(rowSums) / help(colSums)
row.sums_Claims <- rowSums(Claims)  # one total per patient over the year
col.sums_Claims <- colSums(Claims)  # one total per month over all patients

## Create a bar plot for the monthly claims.
barplot(col.sums_Claims,
        main = "Monthly Claims of all Patients",
        xlab = "month", ylab = "Claims in Euro", col = "blue")

##### Now have a look at the data in "Fever" ########
#
# Fever is only recorded in discrete values (yes = 1), (no = 0).
#
## Plot the events of patients having fever from January.

#### Looks strange ... what is wrong?

## Plot the sorted events.
# hint: help("sort")

## Obviously there are not only 0 and 1 but also strongly negative values in
## the table: -99 was here the value to flag missing values.
# Data sets are often not complete.
# There might be values which are simply not known. These missing values are
# recorded as NA (= not available). R deals quite well with missing data.
# Many commands have arguments to tell the command how to deal with NAs.
# The command for detecting missing values is is.na().
# However, R does not know that the missing values in THIS dataset are
# called -99.
# R knows by default only "NA".
# Recode -99 to missing (NA) in table Fever.
Fever[Fever == -99] <- NA
Fever

# Fever per patient and month for one year.
# Change the column names to show that V1-V12 are the 12 months;
# make afterwards a summary over the data.

# In the following, you will have to build the sums over the columns and rows.
## hint:
# help(rowSums)
# help(colSums)

## How many patients had fever in January?

# How often had the patient with the ID 765 fever over the whole year?

## Create a bar plot for the monthly report of fever.
# The sums must skip the NA entries created by the recoding above
# (na.rm = TRUE); otherwise any row/column containing an NA sums to NA.
col.sums_Fever <- colSums(Fever, na.rm = TRUE)  # fever events per month
row.sums_Fever <- rowSums(Fever, na.rm = TRUE)  # fever events per patient
barplot(col.sums_Fever)        # all patients having fever per month
barplot(sort(row.sums_Fever))  ## all events of having fever over the year, per patient

##### Plot the fever curve from patient Nr 5 over the whole year ######
# Extract row Nr. 5 from the table.

##### Plot the claims curve from patient Nr 5 over the whole year ######

#### Plot both curves together in one figure.
# Hint: use par(mfrow = c(2, 1))

## Compute how much the insurance company had to pay for its clients and how
## much it earned via the patients' contributions.

## Compute a probability for a random person to get sick in a particular month
## (illness is interpreted as an insurance claim that is bigger than zero).
# There are different possible ways. Here is one:
# Check help(length)