#########################################################
#
#
#
#
# Exercise - day 3
#
#
#
##########################################################
# First a selection of important commands and examples.
# Please take your time and go through the examples attentively.
# Repeat them yourself with different values.
### vectors
# Build a small example vector: the values 13..19 followed by 12 and 13.
v <- c(13:19, 12, 13)
v
length(v)      # returns the length of the vector v
rev(v)         # returns the 'rev'ersed vector
sort(v)        # returns the sorted vector
order(v)       # the index vector for sorting: v[order(v)] equals sort(v)
duplicated(v)  # flags every element that already occurred earlier in v
unique(v)      # returns the vector without repeated elements
# Logical vector with TRUE for those values fulfilling the condition v > 13.
some <- (v > 13)
some
which.max(v)   # returns the index of the maximum (first such index)
which.min(v)   # returns the index of the minimum (first such index)
###
## Create and manipulate a vector with
# 1000 entries of random numbers between -1 and 1, and manipulate it in the same way as in the example above.
# help(runif)
# Matrices are usually created with ’matrix’, by converting a vector into a matrix or by binding
# vectors together
# A 4 x 2 matrix holding 1..8; matrix() fills column by column by default.
m <- matrix(nrow = 4, ncol = 2, data = 1:8)
m
# Positional arguments give the same matrix as the named call above.
matrix(1:8, 4, 2)
### Remember: indexing is first "row", then "column".
m[3, 2]  # single entry: third row, second column
# Leaving the field after the comma empty selects every column,
# so this returns the whole second row.
m[2, ]
m[, 2]  # the whole second column
m[2:3, 1:2]  # submatrix made of rows 2-3 and columns 1-2
### Create some matrices yourself with different dimensions and values (e.g. random or Gaussian)
# and extract different aspects (rows, columns, submatrices or single values).
# Try to find out how to change different values in the matrix (e.g. change value matrix[2,3] to 60 )
############
# Loops ###
# A for-loop in R is used to iterate over a vector:
# in each iteration the statements in the loop body are executed once.
# You only need to specify how many times or upon which conditions
# those operations need execution.
# For the first run, you need to assign an initial value to the variable that the loop updates,
# and then perform the loop.
# See what the following loop is doing with the data
# Add up the values 2, 4, 6 and 8 and report the loop variable in every round.
# R is case-sensitive, so the accumulator must be spelled 'Summe' everywhere;
# updating a differently-cased name fails because that object does not exist.
Summe <- 0
for (a in c(2, 4, 6, 8)) {
  Summe <- Summe + a
  cat("a has now the value ", a, "\n")
  ## '\n' inserts a line break
}
Summe
# Please now write a loop yourself to calculate the sum of squares of the first 20 numbers
# Sum of the squares 1^2 + 2^2 + ... + 20^2, accumulated step by step.
Summe_Square <- 0
for (n in seq_len(20)) {
  Summe_Square <- Summe_Square + n^2
}
Summe_Square
# The vectorised sum() gives the same results as the two loops without looping.
sum(c(2, 4, 6, 8))
sum((1:20)^2)
########## The commands apply(), lapply(), sapply() and tapply()
# The command apply() applies a function to the rows or columns of a matrix.
# Also available: lapply() for vectors, lists and data frames
# sapply() same as lapply() but sapply() tries to ’s’implify its output
# The following are some examples for lapply(), sapply() and apply():
# Integer vector 1, 2, 3, 4 used as input for the apply-family examples.
v <- seq_len(4)
v
lapply(v, FUN = factorial)  # returns a list with one element per input value
sapply(v, FUN = factorial)  # returns a vector (the list result is simplified)
# A list of three integer vectors of equal length.
L <- list(0:3, 5:8, -1:2)
L
sapply(L, FUN = mean)  # mean of each vector in the list
# The same three vectors bound together as the columns of a 4 x 3 matrix.
m <- cbind(0:3, 5:8, -1:2)
### let's do some example with the command tapply
# The command tapply() is typically applied to data frames.
# This command is frequently used and therefore important.
# In the following data-frame each individual is either smoker or non-smoker
# and belongs to one of the three weight classes 1, 2 or 3.
#
# Self-generated example data: 12 individuals, three weight classes with four
# individuals each, every third individual a smoker, lifespans 50, 52, ..., 72.
riskfactors <- data.frame(
  individual = 1:12,
  weightcls = rep(3:1, each = 4),
  smoker = rep(c(0, 0, 1), times = 4),
  lifespan = seq(from = 50, to = 72, by = 2)
)
riskfactors
## attach() puts the columns on the search path, so "individual" can be typed
# instead of "riskfactors$individual". Note that the attached columns are a
# copy: later changes to riskfactors are not visible through the short names.
attach(riskfactors)
## Question: what is the average lifespan for smokers and non-smokers in our
# self-generated data? One way is to subset by hand for every group:
mean(lifespan[smoker == 0])
mean(lifespan[smoker == 1])
## This becomes inconvenient if the factor has many values.
# More elegant is tapply(): it applies the function ’mean’ to the subvectors of
# lifespan that are determined by the grouping vector.
tapply(X = lifespan, INDEX = smoker, FUN = mean)
tapply(X = lifespan, INDEX = weightcls, FUN = mean)  # group lifespan by weight class
#####################
# Use again the data-set "birthwt" from the first lecture
# Explore the 'birthwt' data set that ships with the MASS package.
library(MASS)
data(birthwt)
help("birthwt")
# No attach(birthwt) here: attach() copies the columns under their current
# names, so after the renaming below the new names would not be found on the
# search path. All columns are therefore referenced through the data frame.
str(birthwt)
dim(birthwt)
# Distribution of the mothers' age: frequency table, summary and box plots.
table(birthwt$age)
summary(birthwt$age)
boxplot(birthwt$age)
boxplot(birthwt$age, main = "Boxplot of Mothers Age")
type.freq <- table(birthwt$age)
type.freq
barplot(type.freq, main = "Barplot of age frequencys ")
sd(birthwt$age)
# Replace the terse original column names by descriptive ones.
colnames(birthwt)
colnames(birthwt) <- c("birthwt.below.2500", "mother.age",
                       "mother.weight", "race",
                       "mother.smokes", "previous.prem.labor",
                       "hypertension", "uterine.irr",
                       "physician.visits", "birthwt.grams")
head(birthwt)
# Explicit labels keep the short column name in the title and on the x axis.
hist(birthwt$birthwt.grams,
     main = "Histogram of birthwt.grams", xlab = "birthwt.grams")
# Group means of the self-generated risk-factor data, referenced explicitly.
tapply(riskfactors$lifespan, riskfactors$smoker, mean)
tapply(riskfactors$lifespan, riskfactors$weightcls, mean)  # group lifespan according to weightcls
# Mean birth weight of the children of non-smoking (0) and smoking (1) mothers.
tapply(birthwt$birthwt.grams, birthwt$mother.smokes, mean)
##
# Use the t-test to examine whether smoking during pregnancy and low birth weight are related.
t.test(birthwt.grams ~ mother.smokes, mu = 0, data = birthwt)
# what is the p-value of the test telling you ?
##
##############
#
#
# Now continue with the exercise from yesterday - health insurance data
#
#
# Download the tab-delimited patient table (first line holds the column names).
Patients <- read.delim("https://cbdm.uni-mainz.de/files/2016/02/Patients.txt")
# View() opens a spreadsheet-style viewer and is only usable in an interactive
# session, so it is skipped when the script runs in batch mode.
if (interactive()) {
  View(Patients)
}
head(Patients)
# Have a look at the data in "Patients":
# plot them from different perspectives and create a summary over the two variables "Age" and "Contribution"
plot(Patients$Contribution)
plot(sort(Patients$Contribution))
plot(Patients$Age)
plot(sort(Patients$Age))
plot(Patients$Age, Patients$Contribution)
# The title is passed to hist() directly: a separate title() call would be
# printed on top of the default title that hist() already draws.
hist(Patients$Age, main = "Histogram des Alters von 1000 Patienten")
summary(Patients)
boxplot(Patients$Age)
Nr_Patients <- 1000
##
# Now you'd like to plot a histogram of age with a self-chosen partitioning.
# Please choose the following breakpoints
# 0,5,10,15,...,100
# hint: help(seq)
# help(hist) -> check here the information about "breaks"
# and compare the outcome to the German population census data
## from http://www.indexmundi.com/germany/age_structure.html
# is the sample representative (at a first glance)?
# Class limits 0, 5, 10, ..., 100 for the age histogram (20 classes of width 5).
breakpoints <- seq(0, 100, by = 5)
hist(Patients$Age, breaks = breakpoints)
## Load the data "Claims" and "Fever" from the website:
### https://cbdm.uni-mainz.de/mb16/
# Both files are read without a header line, so the columns are first named
# V1, V2, ... by read.delim().
Claims <- read.delim(
  file = "https://cbdm.uni-mainz.de/files/2016/02/Claims.txt",
  header = FALSE
)
#View(Claims)
Fever <- read.delim(
  file = "https://cbdm.uni-mainz.de/files/2016/02/Fever.txt",
  header = FALSE
)
#View(Fever)
# Have a look at the data in "Claims":
# claims per patient and month for one year.
summary(Claims)
# Rename the columns to show that V1-V12 are the 12 months
# (for a data frame, names<- and colnames<- are equivalent).
names(Claims) <- c("Jan", "Feb", "March", "April", "May", "June",
                   "July", "Aug", "Sept", "Oct", "Nov", "Dec")
summary(Claims)
## Plot the claims from January
# Plot the first column of Claims (January) for rows 1 to 1000 in row order;
# with a single vector argument the x axis is the index of the value.
plot(Claims[1:1000,1])
## Plot the same values sorted in increasing order # hint: help("sort")
plot(sort(Claims[1:1000,1]))
##### Plot the claims from patient Nr 1 over the whole year ######
# extract the first row from the table
# Row 1 is patient no. 1; columns 1 to 12 hold the claims of the twelve months.
Claim_P1 <- Claims[1, 1:12]
# Convert the one-row data frame into a plain numeric matrix.
d <- data.matrix(Claim_P1, rownames.force = NA)
d
#### Make the plot nicer (add lines between the points and add labels to the axes) ##
# hint: "help(plot)" and "help(title)"
# type = "b" draws both points and the connecting lines.
plot(
  x = 1:12, y = d[1:12], type = "b",
  main = "Claims of Patient # 1",
  xlab = "month", ylab = "Claims in Euro",
  col = "blue"
)
## Plot a histogram of the Claims of in January
# Histogram of the first column of Claims (January), rows 1 to 1000,
# with the default title and axis labels.
hist(Claims[1:1000,1])
# Same histogram with an explicit x-axis label.
hist(Claims[1:1000,1],xlab = "Claims in Euro")
## Plot a histogram of the claims in August (column 8)
hist(Claims[1:1000,8],xlab = "Claims in Euro")
# Plot a Histogram of the total Claims per month over the whole year
# hint: "help(apply)"
# Total claims per patient (one value per row) and per month (one value per
# column). rowSums()/colSums() are the dedicated functions for this and accept
# a numeric data frame directly, so no apply() over the margins is needed.
row.sums_Claims <- rowSums(Claims)
col.sums_Claims <- colSums(Claims)
## Create a bar-plot for the monthly claims
# The title describes all patients; the "# 1" suffix belonged to the plot of
# the single patient no. 1 and was wrong here.
barplot(col.sums_Claims, main = "Monthly Claims of all Patients",
        xlab = "month", ylab = "Claims in Euro", col = "blue")
##### Now have a look at the data in "Fever" ########
#
# Fever is only recorded in discrete values (yes = 1), (no = 0)
#
## Plot the events of patients having fever from January
#### looks strange,... what is wrong ????
## Plot the sorted fever values # hint: help("sort")
## obviously there are not only 0 and 1 but also strongly negative values in the table
## -99 was used here as the value to flag missing values
# Data sets are often not complete.
# There might be values which are simply not known. These missing values are recorded as
# NA (= not available). R deals quite well with missing data. Many commands have arguments
# to tell the command how to deal with NAs. The command for detecting missing values is is.na().
# However, R does not know that the missing values in THIS dataset are coded as -99.
# R knows by default only "NA"
# recode -99 to missing (NA) in table Fever
# Mark every cell that holds the missing-value code -99 as NA, then show the
# recoded table.
is.na(Fever) <- Fever == -99
Fever
# Fever per Patient and Month for one year
# Change the Column-Names to show that V1-V12 are the 12 Month
# make afterwards a summary over the data
# In the following, you will have to build the sums over the columns and rows
## hint:
# help(rowSums)
# help(colSums)
## How many patients had Fever in January ?
# How much had patient with the ID 765 Fever over the whole year ?
## Create a bar-plot for the monthly report of fever
# Fever events per month (column sums) and per patient (row sums).
# na.rm = TRUE is required because the missing-value code -99 was recoded to
# NA above; without it every month or patient with a missing entry sums to NA.
col.sums_Fever <- colSums(Fever, na.rm = TRUE)
row.sums_Fever <- rowSums(Fever, na.rm = TRUE)
barplot(col.sums_Fever) # All patients having fever per month
barplot(sort(row.sums_Fever)) ## All fever events over the year per patient, sorted
##### Plot the Fever-Curve from patient Nr 5 over the whole year ######
# extract the row Nr. 5 from the table
##### Plot the Claims-Curve from patient Nr 5 over the whole year ######
#### Plot both curves together in one figure
# Hint: use par(mfrow=c(2,1))
## Compute how much the insurance company had to pay for its clients and how much it earned via the patients' contributions
## Compute a probability for a random person to get sick in a particular month (illness is interpreted as an insurance claim
# that is bigger than zero)
# There are different possible ways. Here is one:
# Check help(length)