solutions
Main Practical
Import Data
Load the rio package and import the infant data as a tibble:
library(rio)
# Read the Excel workbook with rio and request a tibble as the result class.
infant <- import(file = "infant.xlsx", setclass = "tibble")
Subset Data
Remove redundant variables
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Drop the column ranges id..outcome and drace..dwt, plus sex, in a single
# negated selection; every other column is kept.
infant2 <- infant %>%
  select(-c(id:outcome, sex, drace:dwt))
names(infant2)
## [1] "date" "gestation" "bwt" "parity" "race"
## [6] "age" "ed" "ht" "wt" "marital"
## [11] "inc" "smoke" "time" "number"
Filter Data
Filter the data to exclude extremely premature babies (gestation less than 28 weeks) and extremely late babies (gestation more than 52 weeks). The thresholds are multiplied by 7 in the code because gestation is recorded in days.
# Keep gestations between 28 and 52 weeks inclusive; each bound is multiplied
# by 7 to match the scale of `gestation`. Rows where the comparison is NA are
# dropped by filter().
infant2 <- infant2 %>%
  filter(gestation >= 28 * 7, gestation <= 52 * 7)
Set Missing Value Indicator to NA
# 999 is the missing-value indicator for wt; turn it into a proper NA.
infant2 <- infant2 %>%
  mutate(wt = na_if(wt, 999))
Plot child’s weight against the mother’s pre-pregnancy weight
# Scatter plot from a formula: bwt (left-hand side) goes on the y-axis and
# wt (right-hand side) on the x-axis, both taken from infant2.
plot(bwt ~ wt, data = infant2)
Convert birthweight to a factor
# Scale bwt by 28.35 (presumably ounces to grams -- confirm against the data
# dictionary) and bin it into five right-closed intervals. Values outside
# (1500, 5000] fall in no interval and become NA.
infant2 <- infant2 %>%
  mutate(
    bwtCat = cut(bwt * 28.35,
                 breaks = c(seq(1500, 3500, by = 500), 5000))
  )
Create infant2 all in one go
# The whole preparation as one pipeline: drop the unused columns, keep
# gestations within 28-52 weeks (bounds multiplied by 7 to match the scale of
# `gestation`), recode wt == 999 as NA, and add the binned birthweight bwtCat.
infant2 <- infant %>%
  select(-c(id:outcome, sex, drace:dwt)) %>%
  filter(gestation >= 28 * 7, gestation <= 52 * 7) %>%
  mutate(
    wt = na_if(wt, 999),
    bwtCat = cut(bwt * 28.35,
                 breaks = c(seq(1500, 3500, by = 500), 5000))
  )
Summarise the number in each birthweight category
# One row per birthweight category with the number of infants in it.
infant2 %>%
  count(bwtCat, name = "Count")
## # A tibble: 5 x 2
## bwtCat Count
## <fctr> <int>
## 1 (1.5e+03,2e+03] 8
## 2 (2e+03,2.5e+03] 54
## 3 (2.5e+03,3e+03] 202
## 4 (3e+03,3.5e+03] 446
## 5 (3.5e+03,5e+03] 511
Extra tasks
Convert the smoke variable to a factor
library(forcats)
# Recode smoke as a factor: level "0" becomes nonsmoker, levels "1" to "3" are
# merged into smoker, and level "9" is removed so those observations become NA.
infant2 <- infant2 %>%
  mutate(
    smoke = fct_collapse(factor(smoke),
                         nonsmoker = "0",
                         smoker = c("1", "2", "3"),
                         NULL = "9")
  )
Tabulate by birthweight and smoking category
“Simple” table
# Count the rows for each observed combination of bwtCat and smoke; missing
# smoke values form their own <NA> rows. summarise() peels off only the last
# grouping variable, so the printed result is still grouped by bwtCat.
infant2 %>%
group_by(bwtCat, smoke) %>%
summarise(Count = n())
## # A tibble: 13 x 3
## # Groups: bwtCat [?]
## bwtCat smoke Count
## <fctr> <fctr> <int>
## 1 (1.5e+03,2e+03] nonsmoker 2
## 2 (1.5e+03,2e+03] smoker 6
## 3 (2e+03,2.5e+03] nonsmoker 14
## 4 (2e+03,2.5e+03] smoker 40
## 5 (2.5e+03,3e+03] nonsmoker 57
## 6 (2.5e+03,3e+03] smoker 144
## 7 (2.5e+03,3e+03] <NA> 1
## 8 (3e+03,3.5e+03] nonsmoker 193
## 9 (3e+03,3.5e+03] smoker 250
## 10 (3e+03,3.5e+03] <NA> 3
## 11 (3.5e+03,5e+03] nonsmoker 270
## 12 (3.5e+03,5e+03] smoker 235
## 13 (3.5e+03,5e+03] <NA> 6
Neater table spreading smoke along the columns
library(tidyr)
# Cross-tabulate birthweight category by smoking status: count each
# combination, drop the rows with unknown smoking status, then create one
# column per smoke level holding the counts.
# pivot_wider() is used in place of spread(), which tidyr has superseded.
# The result is deliberately left grouped by bwtCat, as in the printed output.
infant2 %>%
  group_by(bwtCat, smoke) %>%
  summarise(Count = n()) %>%
  filter(!is.na(smoke)) %>%
  pivot_wider(names_from = smoke, values_from = Count)
## # A tibble: 5 x 3
## # Groups: bwtCat [5]
## bwtCat nonsmoker smoker
## * <fctr> <int> <int>
## 1 (1.5e+03,2e+03] 2 6
## 2 (2e+03,2.5e+03] 14 40
## 3 (2.5e+03,3e+03] 57 144
## 4 (3e+03,3.5e+03] 193 250
## 5 (3.5e+03,5e+03] 270 235