## ----dir-----------------------------------------------------------------
## rio provides import(); it has to be attached before the first import()
## call in this chunk, not only in the chunk that follows.
library(rio)

## an example using a directory on linux
dir <- "/media/Repos/wdsi-vacation-school/Lecture_1_Data_Handling/"

## an example using a directory on Windows
## (this second assignment overwrites the first; keep whichever one matches
## the machine the script is run on)
dir <- "E:/wdsi-vacation-school/Lecture_1_Data_Handling/"

## list files in directory
files <- list.files(dir)

## import file from directory
cyclist <- import(file.path(dir, "cyclist.xlsx"))

## ----import--------------------------------------------------------------
## same files, this time resolved relative to the working directory
compsci <- import("compsci.csv")
cyclist <- import("cyclist.xlsx")

## ----recycle-------------------------------------------------------------
library(tibble)

## the length-one `y` is recycled to the length of `x`;
## tibble() replaces the deprecated data_frame() alias
dat <- tibble(x = 1:3, y = TRUE)

## ----print_tbl-----------------------------------------------------------
## as_tibble() replaces the deprecated as_data_frame() alias
compsci <- as_tibble(compsci)
print(compsci, n = 2, width = 100)

## ----gather--------------------------------------------------------------
library(tidyr)

## stack every column except Year into key/value pairs (wide -> long)
compsci2 <- gather(compsci,
                   key = "Student Group",
                   value = "Number of students",
                   -Year)
print(compsci2, n = 3)

## ----separate------------------------------------------------------------
## split the "<Degree> - <Gender>" labels into two columns
compsci2 <- separate(compsci2,
                     col = `Student Group`,
                     into = c("Degree", "Gender"),
                     sep = " - ")
print(compsci2, n = 3)

## ----filter--------------------------------------------------------------
library(dplyr)

filter(compsci2, Gender == "Males" & `Number of students` > 40000)

## ----arrange-------------------------------------------------------------
print(arrange(compsci2, desc(Year), Gender), n = 4)

## ----select--------------------------------------------------------------
## four ways of choosing columns: by name, by range, and by exclusion
select(compsci2, Year, Gender, `Number of students`)
select(compsci2, Year:Degree, `Number of students`)
select(compsci2, -Degree, -Gender)
select(compsci2, -(Year:Gender))

## ----starts_with---------------------------------------------------------
print(select(compsci, starts_with("Bachelor's")), n = 2)
## ----select-rename-------------------------------------------------------
## select() can rename while selecting; unlisted columns are dropped
select(compsci2, `Academic Year` = Year, Gender, `Number of students`)

## ----rename--------------------------------------------------------------
## rename() keeps every column
rename(compsci2, `Academic Year` = Year)

## ----distinct------------------------------------------------------------
distinct(compsci2, Degree, Gender)

## ----mutate--------------------------------------------------------------
## add Postgrad and tidy Year: drop every "." and the first space
dat <- mutate(compsci2,
              Postgrad = Degree != "Bachelor's",
              Year = gsub(".", "", Year, fixed = TRUE),
              Year = sub(" ", "", Year, fixed = TRUE))
print(dat, n = 2)

## ----transmute-----------------------------------------------------------
## same expressions as above, but only the named columns are kept
dat <- transmute(compsci2,
                 Postgrad = Degree != "Bachelor's",
                 Year = gsub(".", "", Year, fixed = TRUE),
                 Year = sub(" ", "", Year, fixed = TRUE))
print(dat, n = 2)

## ----summarise-----------------------------------------------------------
summarise(compsci2,
          Average = mean(`Number of students`),
          Total = sum(`Number of students`))

## ----summarise_if--------------------------------------------------------
## the count column is given a syntactic name first (original note: "bug
## hack" -- presumably summarise_if() mishandled the back-ticked name)
dat <- rename(compsci2, nStudents = `Number of students`)
summarise_if(dat, is.numeric, mean)

## ----steps---------------------------------------------------------------
## step-by-step version using an intermediate object
dat <- mutate(compsci2,
              Year = gsub(".", "", Year, fixed = TRUE),
              Year = sub(" ", "", Year, fixed = TRUE))
dat <- filter(dat, Year == "2013-14" & Degree != "Bachelor's")
select(dat, -Year)

## ----chain---------------------------------------------------------------
## the same three steps written as a pipeline
compsci2 %>%
  mutate(Year = gsub(".", "", Year, fixed = TRUE),
         Year = sub(" ", "", Year, fixed = TRUE)) %>%
  filter(Year == "2013-14" & Degree != "Bachelor's") %>%
  select(-Year)

## ----pipe-aware----------------------------------------------------------
## the tidyr verbs take the data as first argument, so they pipe as well
compsci %>%
  gather(key = "Student Group", value = "Number of students", -Year) %>%
  separate(col = `Student Group`,
           into = c("Degree", "Gender"),
           sep = " - ") %>%
  mutate(Year = gsub(".", "", Year, fixed = TRUE),
         Year = sub(" ", "", Year, fixed = TRUE)) %>%
  filter(Year == "2013-14" & Degree != "Bachelor's") %>%
  select(-Year)

## ----group_by------------------------------------------------------------
compsci2 %>%
  filter(grepl("2013-14", Year)) %>%
  group_by(Gender) %>%
  select(Degree, `Number of students`) %>%
  arrange(`Number of students`)

## ----grouped_mutate------------------------------------------------------
## counts as a percentage of the largest count within each Gender group
compsci2 %>%
  filter(grepl("2013-14", Year)) %>%
  group_by(Gender) %>%
  mutate(
    `Relative number` =
      100 * `Number of students` / max(`Number of students`)
  )

## ----grouped_summarise---------------------------------------------------
compsci2 %>%
  filter(grepl("2013-14", Year)) %>%
  group_by(Gender) %>%
  summarise(Total = sum(`Number of students`))

## ----factor--------------------------------------------------------------
dat <- compsci2 %>%
  mutate(Year = factor(Year),
         Degree = factor(Degree),
         Gender = factor(Gender))
summary(select(dat, -`Number of students`))

## ----lev-----------------------------------------------------------------
## `levels` must match the stored values exactly, otherwise every entry
## becomes NA. The filter chunk matches Gender == "Males", so the values are
## plural; "Females" is assumed by analogy -- TODO confirm against the data.
mutate(compsci2,
       Gender = factor(Gender,
                       levels = c("Males", "Females"),
                       labels = c("M", "F")))

## ----forcats-------------------------------------------------------------
library(forcats)

## the level passed to fct_relevel() is a string, and the result has to be
## assigned back or summary() would report the unmodified factors
dat <- dat %>%
  mutate(Degree = fct_inorder(Degree),
         Gender = fct_relevel(Gender, "Males"))
summary(dat)

## ----export--------------------------------------------------------------
export(compsci2, "compsci2.csv")

## ----rds-----------------------------------------------------------------
saveRDS(compsci2, "compsci_tidy.rds")
genderbalance <- readRDS("compsci_tidy.rds")

## the row limit is passed by name; a bare positional 2 is not read as `n`
print(genderbalance, n = 2)