Introduction to facthelpeR

Introduction

facthelpeR is a set of functions that are used to read in and do a first pass at cleaning up data.

Load facthelpeR

library(facthelpeR)
library(here)

List the file structure of the data

auxiliary_files is a folder that contains helper files that are used in the curation process. Generally, the *_aux.csvs are manually edited to match controlled vocabularies. They live in a separate folder than the output so that they don’t get written over accidentally.

main_folder <- "inst/extdata/OSU_2019_example"
data_folder <- here::here(main_folder, "data-raw")
auxiliary_files <- here::here(main_folder, "auxiliary_files") 
knitroutput_folder <- here::here(main_folder, "output")

Manual edits

If there are any mistakes in numeric values or one-off typos, sometimes it is faster to fix it in the raw files. Note any manual changes here in the .Rmd for reproducibility.

Read in raw data

Generate a list of all of the files and sheets

If the header is not in the first row (or spans multiple rows), write.csv(sheet_info, ...) and manually fill in the number of rows to skip in each sheet. This data.frame will be read into read_multsheets()

files_csv <- list.files(data_folder, recursive = TRUE, pattern = ".csv")

sheet_info_csv <- data.frame(filename = files_csv) %>% 
  filter(!str_detect(filename, "curation_files")) %>%
  mutate(list_names = basename(filename)) %>%
  mutate(list_names = str_remove(list_names, ".csv")) 

knitr::kable(sheet_info_csv)

filename	list_names
HWEYT_Condon.csv	HWEYT_Condon
HWEYT_Dufur.csv	HWEYT_Dufur

How to handle sheets in .xls files

This section is just to demonstrate how to handle excel files with multiple sheets. The data won’t be processed in the rest of this vignette.

files_xls <- list.files(data_folder, recursive = TRUE, pattern = ".xls")

sheet_info_xls <- list_sheetnames(data_folder, files_xls) 

knitr::kable(sheet_info_xls)

filename	sheets
HWEYT_Dufur_copy.xlsx	HWEYT_Dufur
HWEYT_Dufur_copy.xlsx	extra_sheet

Use read.multsheets()

data_raw1 <- read.multsheets(data_folder, sheet_info_csv, na = c("NA"), col_names = TRUE)
#> This function name is being retained for backwards compatibility.
#>           Please use read_multsheets()
#> Adding header_end
#> [1] "Using read_csv"
#> Rows: 100 Columns: 11
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr  (1): Program ID
#> dbl (10): Plot, Rep, Length_Feet, Length_Inches, Height, Test, Protein, Weig...
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#> [1] "Using read_csv"
#> Warning: One or more parsing issues, call `problems()` on your data frame for details,
#> e.g.:
#>   dat <- vroom(...)
#>   problems(dat)
#> Rows: 100 Columns: 11
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr  (1): Program ID
#> dbl (10): Plot, Length_Feet, Length_Inches, Height, Test, Protein, Weight, M...
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_raw2 <- imap(data_raw1, function(x, y){ x %>% mutate(sourcefile = y)})


knitr::kable(head(data_raw2[[1]]))

Plot	Rep	Program ID	Length_Feet	Length_Inches	Height	Test	Protein	Weight	Moisture	rep_dup	sourcefile
1	1	HRW20077	15	9	38	61.1	9.2	3.067	15.4	1	HWEYT_Condon
2	1	HRW20023	15	10	36	63.2	12.5	2.240	14.0	1	HWEYT_Condon
3	1	HRW20020	15	10	35	62.2	9.5	2.762	15.1	1	HWEYT_Condon
4	1	HRW20021	15	5	34	61.9	8.9	3.222	16.0	1	HWEYT_Condon
5	1	HRW20019	15	5	33	62.9	11.2	2.489	15.1	1	HWEYT_Condon
6	1	HRW20001	15	4	37	61.9	10.4	2.138	14.0	1	HWEYT_Condon

knitr::kable(head(data_raw2[[2]]))

Plot	Length_Feet	Length_Inches	Height	Test	Protein	Weight	Moisture	Entry	Rep	Program ID	sourcefile
1	15	0	36	62.7	10.4	3.081	14.3	1	1	HRW20077	HWEYT_Dufur
2	14	1	36	64.3	11.7	2.325	12.3	2	1	HRW20023	HWEYT_Dufur
3	14	11	30	63.9	11.5	2.633	14.8	3	1	HRW20020	HWEYT_Dufur
4	13	10	32	63.6	10.5	3.387	15.7	4	1	HRW20021	HWEYT_Dufur
5	14	11	31	64.2	12.1	2.525	13.8	5	1	HRW20019	HWEYT_Dufur
6	15	0	37	63.6	12.3	2.678	14.3	6	1	HRW20001	HWEYT_Dufur

#save(data_raw2,
 #    file = here::here(knitroutput_folder, "data_raw2.RData"))

Remove duplicate columns

Use facthelpeR::rm_dup_col() because it can detect and remove duplicates when: - column names are not exact, but column contents are - if the only difference between two column contents are that one has some NAs, then that column is removed and the other is retained

data_raw3 <- imap(data_raw2, function(x,y){
    print(y)
    rm_dup_col(x, FALSE) %>% 
    select(where(~!all(is.na(.))))
})
#> [1] "HWEYT_Condon"
#> [1] "Removing: rep_dup"
#> [1] "HWEYT_Dufur"
#> [1] "No columns removed"

Curate column names:

Get column names

colnames <- summarize_variables(data_raw3, reg_ex = NULL)
colnames_summary <- colnames %>%
  group_by(colname) %>%
  summarize(across(everything(), ~ paste(na.omit(unique(.x)), collapse = "; ")))

write.csv(colnames_summary, here::here(knitroutput_folder, "colnames_summary.csv"), row.names = FALSE)

knitr::kable(colnames_summary)

colname	type	example	file_sheet	instance
Entry	numeric	1	HWEYT_Dufur	9
Height	numeric	38; 36	HWEYT_Condon; HWEYT_Dufur	6; 4
Length_Feet	numeric	15	HWEYT_Condon; HWEYT_Dufur	4; 2
Length_Inches	numeric	9; 0	HWEYT_Condon; HWEYT_Dufur	5; 3
Moisture	numeric	15.4; 14.3	HWEYT_Condon; HWEYT_Dufur	10; 8
Plot	numeric	1	HWEYT_Condon; HWEYT_Dufur	1
Program ID	character	HRW20077	HWEYT_Condon; HWEYT_Dufur	3; 11
Protein	numeric	9.2; 10.4	HWEYT_Condon; HWEYT_Dufur	8; 6
Rep	numeric	1	HWEYT_Condon; HWEYT_Dufur	2; 10
Test	numeric	61.1; 62.7	HWEYT_Condon; HWEYT_Dufur	7; 5
Weight	numeric	3.067; 3.081	HWEYT_Condon; HWEYT_Dufur	9; 7
sourcefile	character	HWEYT_Condon; HWEYT_Dufur	HWEYT_Condon; HWEYT_Dufur	11; 12

Rename columns using auxiliary files

First, understand what each column means (description and units) by:

checking through annual reports
checking with Julia
lastly, asking the PI

Then, fill out the auxiliary file with the appropriate controlled vocabulary.

If there isn’t a corresponding controlled vocabulary, then need to:

decide if the column can be captured by current controlled vocabulary. If so, standardize to the controlled vocabulary
decide whether we need to keep the variable at all (can fill in auxiliary file with “rm_col” in the controlled_vocab column and rename.col() will remove them if rm_col argument = TRUE)
decide whether a new controlled vocabulary should be added. If so, add to controlled vocabulary using the correct formatting, which is described in the README (I usually get Julia’s opinion on this, especially before adding new vocabularies)

colnames_curation <- 
  read.csv(here::here(auxiliary_files, "colnames_summary_aux.csv")) %>%
  # If a column name still needs to be checked/confirmed, leave the controlled_vocab 
  # column blank in the "colnames_summary_aux.csv" file.  
  # Before the renaming step, just set the unknowns to the 
  # original name.
  mutate(controlled_vocab = 
           ifelse(controlled_vocab == "", colname, controlled_vocab)) 

knitr::kable(colnames_curation)

controlled_vocab	colname	type	example	file_sheet	instance
entry	Entry	numeric	1	HWEYT_Dufur	9
height	Height	numeric	38; 36	HWEYT_Condon; HWEYT_Dufur	6; 4
Length_Feet	Length_Feet	numeric	15	HWEYT_Condon; HWEYT_Dufur	4; 2
Length_Inches	Length_Inches	numeric	9; 0	HWEYT_Condon; HWEYT_Dufur	5; 3
moisture	Moisture	numeric	15.4; 14.3	HWEYT_Condon; HWEYT_Dufur	10; 8
plot	Plot	numeric	1	HWEYT_Condon; HWEYT_Dufur	1
Program ID	Program ID	character	HRW20077	HWEYT_Condon; HWEYT_Dufur	3; 11
Protein	Protein	numeric	9.2; 10.4	HWEYT_Condon; HWEYT_Dufur	8; 6
rep	Rep	numeric	1	HWEYT_Condon; HWEYT_Dufur	2; 10
sourcefile	sourcefile	character	HWEYT_Condon; HWEYT_Dufur	HWEYT_Condon; HWEYT_Dufur	11; 12
test	Test	numeric	61.1; 62.7	HWEYT_Condon; HWEYT_Dufur	7; 5
Weight	Weight	numeric	3.067; 3.081	HWEYT_Condon; HWEYT_Dufur	9; 7


data_raw_rename1 <- rename_col(data_raw3, 
                              rename_df = colnames_curation, 
                              rename_col = controlled_vocab, 
                              old_col = colname, 
                              rm_col = TRUE)
#> [1] "HWEYT_Condon"
#> Renaming all data.frames with the same rename values
#> Columns renamed 'rm_col' will be removed
#> [1] "HWEYT_Dufur"
#> Renaming all data.frames with the same rename values
#> Columns renamed 'rm_col' will be removed

knitr::kable(head(data_raw_rename1[[1]]))

plot	rep	Program ID	Length_Feet	Length_Inches	height	test	Protein	Weight	moisture	sourcefile
1	1	HRW20077	15	9	38	61.1	9.2	3.067	15.4	HWEYT_Condon
2	1	HRW20023	15	10	36	63.2	12.5	2.240	14.0	HWEYT_Condon
3	1	HRW20020	15	10	35	62.2	9.5	2.762	15.1	HWEYT_Condon
4	1	HRW20021	15	5	34	61.9	8.9	3.222	16.0	HWEYT_Condon
5	1	HRW20019	15	5	33	62.9	11.2	2.489	15.1	HWEYT_Condon
6	1	HRW20001	15	4	37	61.9	10.4	2.138	14.0	HWEYT_Condon

knitr::kable(head(data_raw_rename1[[2]]))

plot	Length_Feet	Length_Inches	height	test	Protein	Weight	moisture	entry	rep	Program ID	sourcefile
1	15	0	36	62.7	10.4	3.081	14.3	1	1	HRW20077	HWEYT_Dufur
2	14	1	36	64.3	11.7	2.325	12.3	2	1	HRW20023	HWEYT_Dufur
3	14	11	30	63.9	11.5	2.633	14.8	3	1	HRW20020	HWEYT_Dufur
4	13	10	32	63.6	10.5	3.387	15.7	4	1	HRW20021	HWEYT_Dufur
5	14	11	31	64.2	12.1	2.525	13.8	5	1	HRW20019	HWEYT_Dufur
6	15	0	37	63.6	12.3	2.678	14.3	6	1	HRW20001	HWEYT_Dufur


save(data_raw_rename1,
     file = here::here(knitroutput_folder, "data_raw_rename1.RData"))

Some functions that may help explore the columns within the list of data

There are often instances where column names are not used consistently between files or there are similar column names within one dataset (test vs. testwt). The following functions may help to summarize and explore the data.

See summarize_variables() used above.
summarize_ncol() and summarize_colnames summarizes the information collected from summarize_variables()

List the number of columns that match a given regex per dataset.

cols_summary <- summarize_ncol(data_raw_rename1, c("entry", "length"))
knitr::kable(cols_summary)

file_sheet	entry	length
HWEYT_Dufur	1	2
HWEYT_Condon	NA	2

List the proportion of files that contain a given

cols_prop <- summarize_colnames(data_raw_rename1, c("entry", "length"))
knitr::kable(cols_prop)

colname	prop_files	col_var
entry	0.5	entry
Length_Feet	1.0	length
Length_Inches	1.0	length

Pull all the columns that match a given regex. This is useful for checking to see whether column names have been used consistently within and across datasets.

cols <- select_colsfromlist(data_raw_rename1,  c("entry", "length"))
knitr::kable(cols)

Length_Feet HWEYT_Condon 1	Length_Inches HWEYT_Condon 1	entry HWEYT_Dufur 2	Length_Feet HWEYT_Dufur 2	Length_Inches HWEYT_Dufur 2
15	9	1	15	0
15	10	2	14	1
15	10	3	14	11
15	5	4	13	10
15	5	5	14	11
15	4	6	15	0

Bind all data.frames

data_all1 <- bind_rows(data_raw_rename1)
save(data_all1,
     file = here::here(knitroutput_folder,"data_all1.RData"))

Curate column contents

See the factcuratoR package for validating columns and column contents

Jacqueline Tay

2024-01-16