solutions

Main Practical

Import Data

Load rio package and import infant data as a tibble:

library(rio)
# Read the Excel workbook; setclass = "tibble" requests a tibble result.
infant <- import(file = "infant.xlsx", setclass = "tibble")

Subset Data

Remove redundant variables

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Drop the id:outcome and drace:dwt column ranges, plus sex, in one negated
# selection, then list the names of the columns that remain.
infant2 <- infant %>%
    select(-c(id:outcome, sex, drace:dwt))
names(infant2)

##  [1] "date"      "gestation" "bwt"       "parity"    "race"     
##  [6] "age"       "ed"        "ht"        "wt"        "marital"  
## [11] "inc"       "smoke"     "time"      "number"

Filter Data

Filter the data to exclude extremely premature babies (gestation less than 28 weeks) and extremely late babies (gestation more than 52 weeks). Gestation is recorded in days, so each threshold in weeks is multiplied by 7.

# Keep rows whose gestation lies between 28 and 52 weeks inclusive; the week
# limits are multiplied by 7 before being compared with `gestation`.
infant2 <- infant2 %>%
    filter(gestation >= 28 * 7,
           gestation <= 52 * 7)

Set Missing Value Indicator (999) in `wt` to NA

# 999 is the missing-value code used in `wt`; turn every 999 into NA.
infant2 <- infant2 %>%
    mutate(wt = na_if(wt, 999))

Plot child’s weight against the mother’s pre-pregnancy weight

# Formula interface: bwt (left-hand side) goes on the y-axis and wt
# (right-hand side) on the x-axis, both taken from infant2.
plot(bwt ~ wt, data = infant2)

Convert birthweight to factor

# Scale bwt by 28.35 (presumably ounces to grams -- confirm against the data
# dictionary) and bin it into right-closed intervals: 500-wide steps from
# 1500 to 3500, then a final (3500, 5000] band. Values outside the breaks
# become NA.
infant2 <- infant2 %>%
    mutate(bwtCat = cut(bwt * 28.35,
                        breaks = c(seq(1500, 3500, by = 500), 5000)))

Create `infant2` all in one go

# Build infant2 from the raw import in a single pipeline:
#   1. drop the columns that are not needed,
#   2. keep gestations between 28 and 52 weeks (limits multiplied by 7),
#   3. recode the 999 missing-value code in wt to NA,
#   4. derive the birthweight category from bwt * 28.35.
infant2 <- infant %>%
    select(-c(id:outcome, sex, drace:dwt)) %>%
    filter(gestation >= 28 * 7,
           gestation <= 52 * 7) %>%
    mutate(wt = na_if(wt, 999),
           bwtCat = cut(bwt * 28.35,
                        breaks = c(seq(1500, 3500, by = 500), 5000)))

Summarise the number in each birthweight category

# One row per birthweight category, with the number of rows in `Count`.
infant2 %>%
    count(bwtCat, name = "Count")

## # A tibble: 5 x 2
##            bwtCat Count
##            <fctr> <int>
## 1 (1.5e+03,2e+03]     8
## 2 (2e+03,2.5e+03]    54
## 3 (2.5e+03,3e+03]   202
## 4 (3e+03,3.5e+03]   446
## 5 (3.5e+03,5e+03]   511

Extra tasks

Convert `smoke` variable to a factor

library(forcats)
# Turn the smoke codes into a two-level factor:
#   "0" -> nonsmoker; "1", "2", "3" -> smoker; "9" -> NA (level removed).
infant2 <- infant2 %>%
    mutate(smoke = fct_collapse(factor(smoke),
                                nonsmoker = "0",
                                smoker = c("1", "2", "3"),
                                NULL = "9"))

Tabulate by birthweight and smoking category

“Simple” table

# Count the rows for every bwtCat/smoke combination (NA smoke included).
# summarise() removes only the last grouping variable, so the result is
# still grouped by bwtCat.
infant2 %>% 
    group_by(bwtCat, smoke) %>%
    summarise(Count = n())

## # A tibble: 13 x 3
## # Groups:   bwtCat [?]
##             bwtCat     smoke Count
##             <fctr>    <fctr> <int>
##  1 (1.5e+03,2e+03] nonsmoker     2
##  2 (1.5e+03,2e+03]    smoker     6
##  3 (2e+03,2.5e+03] nonsmoker    14
##  4 (2e+03,2.5e+03]    smoker    40
##  5 (2.5e+03,3e+03] nonsmoker    57
##  6 (2.5e+03,3e+03]    smoker   144
##  7 (2.5e+03,3e+03]      <NA>     1
##  8 (3e+03,3.5e+03] nonsmoker   193
##  9 (3e+03,3.5e+03]    smoker   250
## 10 (3e+03,3.5e+03]      <NA>     3
## 11 (3.5e+03,5e+03] nonsmoker   270
## 12 (3.5e+03,5e+03]    smoker   235
## 13 (3.5e+03,5e+03]      <NA>     6

Neater table spreading smoke along the columns

library(tidyr)
# Cross-tabulate birthweight category (rows) by smoking status (columns).
# Rows with unknown smoking status are removed before reshaping so that no
# NA column is created. pivot_wider() is used in place of the superseded
# spread(); the result remains grouped by bwtCat.
infant2 %>%
    group_by(bwtCat, smoke) %>%
    summarise(Count = n()) %>%
    filter(!is.na(smoke)) %>%
    pivot_wider(names_from = smoke, values_from = Count)

## # A tibble: 5 x 3
## # Groups:   bwtCat [5]
##            bwtCat nonsmoker smoker
## *          <fctr>     <int>  <int>
## 1 (1.5e+03,2e+03]         2      6
## 2 (2e+03,2.5e+03]        14     40
## 3 (2.5e+03,3e+03]        57    144
## 4 (3e+03,3.5e+03]       193    250
## 5 (3.5e+03,5e+03]       270    235