df_codebook_sources = compiled_codebooks %>%
count(domain,subdomain, file, v2_codebooks) %>%
mutate(codebook = convert_full_path_to_file_name(file),
v2_codebooks = ifelse(is.na(v2_codebooks),F,v2_codebooks)) %>%
drop_na() %>%
select(domain, subdomain, codebook,v2_codebooks) %>%
mutate(domain = ifelse( str_detect(codebook,'child'), paste0("Child ", domain), domain))
df_datasets_sources = compiled_datasets %>%
count(domain,subdomain,file) %>%
drop_na() %>%
select(domain, subdomain, file)%>%
mutate(domain = ifelse( str_detect(file,'child'), paste0("Child ", domain), domain))
.x = df_datasets_sources %>%
left_join(df_codebook_sources) %>%
filter(file == 'BEC_L1AD_20210824.csv')
df_sources = df_datasets_sources %>%
left_join(df_codebook_sources) %>%
mutate(grouper = paste(file, codebook)) %>%
group_by(file, codebook) %>%
group_modify(~{
n_domain_tmp = length(unique(.x$domain))
n_subdomain_tmp = nrow(.x)
name_tmp = case_when(
unique(.x$domain) == 'Child Health Risk Factors' ~'Child Health Risk Factors' %>% list(),
n_subdomain_tmp ==1 ~ unique(.x$subdomain) %>% list(),
n_domain_tmp > 1 ~ unique(.x$domain) %>% list(),
TRUE~unique(.x$domain) %>% list()) %>%
unique() %>%
unlist()
tibble(data = name_tmp,
file = unique(.x$file),
codebook = unique(.x$codebook),
v2_codebooks = unique(.x$v2_codebooks))
}) %>%
ungroup() %>%
select(Data = data,
`Codebook V2` = v2_codebooks,
File = file,
Codebook = codebook) %>%
arrange(Data)
df_sources %>%
reactable(
defaultPageSize = 20,
style = list( fontSize = "11px"),
columns = list(
`Codebook V2` = colDef(
cell = function(value) qc_icon_formatter(value),
width = 100)),
)FAIR SALURBAL Database Summary
1. Source
1.1 Files
1.2 V2 Codebooks
SALURBAL codebooks will be updated to include system level metadata. Below compares v1 (old) and v2 (new) codebooks.
df_v1 = tibble(
field = codebook__bec %>% select(-file) %>% names(),
v1 = T
)
df_v2 = tibble(
field = codebook__air_pollution %>% select(-file,-v2_codebooks) %>% names(),
v2 = T
)
df_example = codebook__air_pollution %>%
filter(var_name == 'APSPM25MEAN') %>%
mutate_all(~as.character(.x)) %>%
pivot_longer(cols = everything(), names_to = 'field', values_to = 'example')
df_codebook_versions = df_v2 %>%
left_join(df_v1) %>%
left_join(df_example) %>%
select(Field = field, v1, v2, `v2 example` = example) %>%
mutate(v1 = ifelse(is.na(v1),F, v1))
df_codebook_versions %>%
reactable(
defaultPageSize = 20,
style = list( fontSize = "11px"),
columns = list(
`v1` = colDef(
cell = function(value) qc_icon_formatter(value),
width = 100),
`v2` = colDef(
cell = function(value) qc_icon_formatter(value),
width = 100))
)1.3 Stratification Scheme
The table below defines how we will organiation stratification within the data. There are three columns: 1) attribute - grouping varaible (e.g. Sex) 2) attribute_value are the actual strata (e.g. Female, Males…) 3. default is to signal which strata to display in a non-stratified context (e.g. data catalog table or city profiles viz)
df_strata %>%
select(-default) %>%
arrange(attribute) %>%
reactable(groupBy = "attribute")2. Quality Control
2.1 Tests
The below displays the results of our individual QC tests. If the test has failed you can click each row to see the a summary of the variables that failed.
tests = ls(pattern = 'test_')
results = map_df(tests, ~{
result = get(.x)(compiled_datasets,compiled_codebooks,role = 'test')
tibble(
Test = result$test,
Result = result$pass,
Description = result$desc,
faulty = list(result$faulty))})%>%
arrange(desc(Result))
index_faulty = which(!results$Result)
results %>%
select(-faulty) %>%
reactable(
columns = list(
Test = colDef(width = 100),
Result = colDef(
cell = function(value) qc_icon_formatter(value),
width = 100)),
details = function(index){
if (index%in%index_faulty){
nested_data <- results %>% slice(index) %>% pull(faulty) %>% .[[1]]
is_nestable = nested_data %>% count(var_name) %>% pull(n)>1
index_name = nested_data %>% count(var_name) %>% pull(var_name)
index_nestable = which(is_nestable)
if (any(is_nestable)){
nested_data1= nested_data %>%
select(file, var_name) %>%
distinct()
} else {
nested_data1= nested_data
}
htmltools::div(style = "padding: 16px",
reactable(nested_data1,
outlined = TRUE,
resizable = TRUE,
wrap = FALSE,
bordered = TRUE,
details = function(index){
if (index%in%index_nestable){
data_nested2 = nested_data %>%
filter(var_name == index_name[index]) %>%
select(-file, -var_name)
htmltools::div(style = "padding: 16px",
reactable(data_nested2,
resizable = TRUE,
bordered = F,
wrap = FALSE)) }
}) ) } })2.2 Generate Outputs
We keep subset data to those that have passed all QC tests then merge with codebook metadata for. This object ‘cleaned_datasets’ will be used for downstream datastore generation.
## Get outputs from each test
outputs = map(tests, ~{get(.x)(compiled_datasets,compiled_codebooks,role = 'output')})
## Keep only those rows that passed all tests and operationalize other columns
cleaned_datasets = purrr::reduce(outputs, dplyr::inner_join) %>%
select(-pass) %>%
## operationalize value_type (categorical/discrete/continuous)
group_by(var_name) %>%
group_modify(~{
values = .x$value
.x %>% mutate( value_type = case_when(
any(str_detect(values,"[:alpha:]"))~"categorical",
any(str_detect(values,"\\."))~"continuous",
any(str_detect(var_label,c("Number","Minimum",'Minimim',
"Maximum","GDP per capita",
'City foundation year')))~"continuous",
TRUE~"discrete"
))
}) %>%
ungroup() %>%
## Add var_name_base_labels
mutate(var_name_base_label = case_when(
n_attr == 0 ~ NA_character_,
n_attr == 1 ~ attribute1,
n_attr == 2 ~ glue("{attribute2} ({attribute1_value})") %>% paste0()
)) %>%
select(var_name, var_raw, geo, salid1,country, city,year, value,value_type,
everything())
# {## Check value_type
# ## Distribution of value_type
# cleaned_datasets %>%
# count(var_name,value_type) %>%
# count(value_type)
# ## Get list of discrete variables
# cleaned_datasets %>%
# filter(value_type == 'discrete') %>%
# count(var_name, var_label) %>% View()
# }
#' Add longitudinal coding labels. This function will categorize variables based on
#' what year and country available. THere will be four categories:
#' 1. Longitudinal - when variable is longitudinal
#' 2. Completely non-longitudinal - when a variable only has a single year value.
#' 3. Functionally non-longitudinal - when a variable only has a single year value per country;
#' this is for when data source differ by a few years across country. For example life expectancy is
#' '2010-2014' for El Salvador but '2012-2016' for all other countries.
#' 4. Mixed by Country - some countries non-longitudinal but some countries have dta vailable for multiple years.
{ ## Prepare longitudinal_status
xwalk_years_available = cleaned_datasets %>%
select(var_name, subdomain,longitudinal, country, year) %>%
distinct() %>%
# filter(!longitudinal) %>%
group_by(subdomain, var_name) %>%
group_modify(~{
# .x = cleaned_datasets %>%
# select(var_name, subdomain,longitudinal, country, year) %>%
# distinct() %>% filter(var_name == "LEALE")
# .x = cleaned_datasets %>%
# select(var_name, subdomain,longitudinal, country, year) %>%
# distinct() %>% filter(var_name == "CNSMIGR_FORBORN")
years_all = .x %>% count(year)
years = unique(.x$year)
table_year_country = .x %>% count(country )
statusTmp = case_when(
length(years)==1~ "Completely non-longitudinal",
all(table_year_country$n == 1) ~"Functionally non-longitudinal",
length(unique(.x$longitudinal))>1~"Mixed by Country",
unique(.x$longitudinal)~"Longitudinal",
TRUE~"Mixed by Country"
) %>%
unique()
## Check for non-numeric outliers
if (is.na( min(as.numeric(years)))|is.na( max(as.numeric(years)))){
stop("ERROR: Non-numeric year value!")
}
## standardize year values for 'functionally non-longitudinal values'
# year_min_tmp = ifelse(
# str_detect(statusTmp, 'non-longitudinal'),
# years_all %>% filter(n == max(n)) %>% slice(1) %>% pull(year) %>% as.numeric(),
# min(as.numeric(years))
# )
# year_max_tmp = ifelse(
# str_detect(statusTmp, 'non-longitudinal'),
# years_all %>% filter(n == max(n)) %>% slice(1) %>% pull(year)%>% as.numeric(),
# min(as.numeric(years))
# )
tibble(
n_years = length(years),
year_min = min(as.numeric(years)),
year_max = max(as.numeric(years)),
n_countries = length(unique(.x$country)),
chr_years = paste0(years, collapse = ', '),
longitudinal_status = statusTmp
) %>%
mutate(years = list(as.numeric(unique(.x$year))))
}) %>%
ungroup() %>%
select(var_name, longitudinal_status, n_years, years, year_min, year_max)
}
{
xwalk_var_name_nested = cleaned_datasets %>% select(var_merge, var_name_nested) %>% distinct() %>%
filter(!is.na(var_merge))
}
cleaned_codebooks = compiled_codebooks %>%
filter(var_name%in%cleaned_datasets$var_name ) %>%
left_join(xwalk_years_available) %>%
left_join(xwalk_var_name_nested)
## Check for that nested var_name codebook entries have var_name_nested
checkCodebookNested = cleaned_codebooks %>%
add_count(var_name) %>%
select(var_name, var_merge, var_name_nested, n) %>%
filter(n>1) %>%
arrange(desc(n)) %>%
filter(is.na(var_merge)| is.na(var_name_nested)) %>%
nrow() > 0
if (checkCodebookNested){
stop("ERROR: Some neste codebook entries are missing var_merge/var_name_nested")
}
cleaned_datasets %>%
slice(1:10) %>%
reactable( resizable = TRUE, wrap = FALSE, bordered = TRUE)2.3 Validation of Outputs
The another round of QC is run to validate the quality of our final output. The results of the validation are shown in the table below.
## Make sure that all tests pass
validaiton_results = map_df(tests, ~{
result = get(.x)(cleaned_datasets,cleaned_codebooks,role = 'test')
tibble(
Test = result$test,
Result = result$pass,
Description = result$desc,
faulty = list(result$faulty))})
validaiton_results %>%
select(-faulty) %>%
reactable(
columns = list(
Test = colDef(width = 100),
Result = colDef(cell = function(value) qc_icon_formatter(value), width = 100)
))3. Database Summary
n_variables = cleaned_datasets %>% count(var_name) %>% nrow()
n_datasets = nrow(df_sources)From 16 datasets/codebook pairs we have cleaned 194 variables. Below are some metrics which summarize what is available in our compiled database.
3.1 Variables Count
How many variables for each dataset?
## Sankey
df_sankey_count = cleaned_datasets %>%
count(domain,subdomain, var_name) %>%
mutate(all = glue("Cleaned Variables ({n_variables})")) %>%
select(all,domain, subdomain)
hchart(data_to_sankey(df_sankey_count), "sankey", name = "Variable Count")3.2 Stratification
How many is our data stratified?
df2a = cleaned_datasets %>%
count(var_name, subdomain, domain, attribute1) %>%
mutate(attribute1 = ifelse(is.na(attribute1),"Not Stratified", "Stratification Available")) %>%
select(stratified = attribute1, domain )
# hchart(data_to_sankey(df2a), "sankey", name = "Variable Count")
table_strata_by_file = cleaned_datasets %>%
count(domain, var_name, attribute1, attribute2) %>%
mutate(stratified = ifelse(!is.na(attribute1), 'Available', 'Not Available')) %>%
count(domain,stratified) %>%
## Order domain
group_by(domain) %>%
mutate(n_domain = sum(n))%>%
ungroup() %>%
mutate(domain = fct_reorder(as.factor(domain), (n_domain)))
{ ## b. Examine strata levels
## All strata vars
dfa = cleaned_datasets %>%
count(file, var_name, attribute1, attribute2) %>%
filter(!is.na(attribute1))%>%
select(-n)
## Single attribute variables
df1 = dfa %>%
filter(is.na(attribute2)) %>%
left_join(cleaned_datasets) %>%
count(domain, var_name, attribute1,attribute1_value)
# df1
## Double attribute variables
df2 = dfa %>%
filter(!is.na(attribute2)) %>%
left_join(cleaned_datasets) %>%
count(domain, var_name,
attribute1,attribute1_value,
attribute2,attribute2_value )
# df2
}
## Plot
dfb = cleaned_datasets %>%
mutate(stratified2 =case_when(
is.na(attribute1)~'Not Stratified',
is.na(attribute2)~'Single Stratification',
TRUE~'Double Stratification'
) )%>%
count(domain,var_name,stratified2,attribute1, attribute2) %>%
rowwise() %>%
mutate(attribute = paste(attribute1,attribute2, sep = '+') %>%
str_remove_all('NA') %>%
str_trim() %>%
ifelse(str_sub(.,-1L)=='+',str_sub(.,0,-1L-1),. ) %>%
ifelse(.=='',NA,.),
stratified = ifelse(stratified2=='Not Stratified','Not Stratified','Stratified')) %>%
ungroup() %>%
select(domain,stratified2, attribute)
hchart(data_to_sankey(dfb), "sankey", name = "Stratification Count"){## Operationalize statistics
n_var_with_strata = table_strata_by_file %>% filter(stratified == "Available") %>% pull(n) %>% sum()
n_var_without_strata = table_strata_by_file %>% filter(stratified != "Available") %>% pull(n) %>% sum()
n_var_with_1_strata = df1 %>% count(var_name) %>% nrow()
n_var_with_2_strata = df2 %>% count(var_name) %>% nrow()
}Out of 194 variables, only 23 variables are have stratification available. The different stratas available cane be seen below.
3.3 Value Type
What type of data is available?
dfa = cleaned_datasets %>%
count(var_name, value_type) %>%
count(value_type)
dfa %>%
reactable(
details = function(index){
dfa_tmp = cleaned_datasets %>%
count(var_name, var_label, value_type) %>%
filter(value_type ==dfa %>% slice(index) %>% pull(value_type) )
htmltools::div(style = "padding: 16px",
reactable(dfa_tmp,
outlined = TRUE,
resizable = TRUE,
wrap = FALSE,
bordered = TRUE) ) }
)3.4 Longitudinal Data
EDA of longitudinality in SALURBAL data. Note that whether variables are longitudinal is not available in v1 codebook. pending v2 codebooks they will be assigned by groups themselves and thus should be more accurate. Right now only 9 variables from v2 codebooks (Air Pollution have DMC assigned longitudinal value).
cleaned_codebooks %>%
count(var_name, longitudinal) %>%
tabyl(longitudinal) %>%
reactable()But we can back-calculate longitudinality from the data it self, for now we will use these ‘longitudinal_status’ as a metadata field for our datastore. Roughly 65% of the variables are not longitudinal. Here we document the four possible cases of longitudinality which were coded for each variable based on year availability by year.
cleaned_codebooks %>%
select(var_name, subdomain,longitudinal_status) %>%
distinct() %>%
tabyl(longitudinal_status) %>%
arrange(desc(percent)) %>%
reactable()Longitudinal is when variable is longitudinal
Completely non-longitudinal is when a variable only has a single year value.
Functionally non-longitudinal is when a variable only has a single year value per country; this is for when data source differ by a few years across country. For example life expectancy is ‘2010-2014’ for El Salvador but ‘2012-2016’ for all other countries.
Mixed When the variable identified as non-longitudinal but does not fall into the expected two cases above. It has single years for some countries but multiple years for other countries. These come from two subdomains 1) migration and 2) segregation. The distribution of data availabl by year and country for these two datasets are displayed below.
Segregation
Migration
Moving forward we need to to make some decisions about how to address each of these cases during the check out selection and downloads steps.