df_codebook_sources = compiled_codebooks %>%
  count(domain, subdomain, file, v2_codebooks) %>%
  mutate(codebook = convert_full_path_to_file_name(file),
         v2_codebooks = ifelse(is.na(v2_codebooks), F, v2_codebooks)) %>%
  drop_na() %>%
  select(domain, subdomain, codebook, v2_codebooks) %>%
  mutate(domain = ifelse(str_detect(codebook, 'child'), paste0("Child ", domain), domain))
df_datasets_sources = compiled_datasets %>%
  count(domain, subdomain, file) %>%
  drop_na() %>%
  select(domain, subdomain, file) %>%
  mutate(domain = ifelse(str_detect(file, 'child'), paste0("Child ", domain), domain))
.x = df_datasets_sources %>%
  left_join(df_codebook_sources) %>%
  filter(file == 'BEC_L1AD_20210824.csv')
df_sources = df_datasets_sources %>%
  left_join(df_codebook_sources) %>%
  mutate(grouper = paste(file, codebook)) %>%
  group_by(file, codebook) %>%
  group_modify(~{
    n_domain_tmp = length(unique(.x$domain))
    n_subdomain_tmp = nrow(.x)
    name_tmp = case_when(
      unique(.x$domain) == 'Child Health Risk Factors' ~ 'Child Health Risk Factors' %>% list(),
      n_subdomain_tmp == 1 ~ unique(.x$subdomain) %>% list(),
      n_domain_tmp > 1 ~ unique(.x$domain) %>% list(),
      TRUE ~ unique(.x$domain) %>% list()) %>%
      unique() %>%
      unlist()
    tibble(data = name_tmp,
           file = unique(.x$file),
           codebook = unique(.x$codebook),
           v2_codebooks = unique(.x$v2_codebooks))
  }) %>%
  ungroup() %>%
  select(Data = data,
         `Codebook V2` = v2_codebooks,
         File = file,
         Codebook = codebook) %>%
  arrange(Data)
df_sources %>%
  reactable(
    defaultPageSize = 20,
    style = list(fontSize = "11px"),
    columns = list(
      `Codebook V2` = colDef(
        cell = function(value) qc_icon_formatter(value),
        width = 100))
  )
FAIR SALURBAL Database Summary
1. Source
1.1 Files
1.2 V2 Codebooks
SALURBAL codebooks will be updated to include system-level metadata. The table below compares the v1 (old) and v2 (new) codebooks.
df_v1 = tibble(
  field = codebook__bec %>% select(-file) %>% names(),
  v1 = T
)
df_v2 = tibble(
  field = codebook__air_pollution %>% select(-file, -v2_codebooks) %>% names(),
  v2 = T
)
df_example = codebook__air_pollution %>%
  filter(var_name == 'APSPM25MEAN') %>%
  mutate_all(~as.character(.x)) %>%
  pivot_longer(cols = everything(), names_to = 'field', values_to = 'example')
df_codebook_versions = df_v2 %>%
  left_join(df_v1) %>%
  left_join(df_example) %>%
  select(Field = field, v1, v2, `v2 example` = example) %>%
  mutate(v1 = ifelse(is.na(v1), F, v1))
df_codebook_versions %>%
  reactable(
    defaultPageSize = 20,
    style = list(fontSize = "11px"),
    columns = list(
      `v1` = colDef(
        cell = function(value) qc_icon_formatter(value),
        width = 100),
      `v2` = colDef(
        cell = function(value) qc_icon_formatter(value),
        width = 100))
  )
1.3 Stratification Scheme
The table below defines how we will organize stratification within the data. There are three columns: 1) attribute, the grouping variable (e.g., Sex); 2) attribute_value, the actual strata (e.g., Female, Male...); and 3) default, which signals which stratum to display in a non-stratified context (e.g., the data catalog table or city profile visualizations).
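For illustration, here is a minimal sketch of the scheme with hypothetical rows; these example values are made up for illustration and are not necessarily the actual contents of df_strata.

## Hypothetical rows illustrating the three-column stratification scheme
tibble(
  attribute       = c('Sex', 'Sex', 'Sex'),
  attribute_value = c('Both', 'Female', 'Male'),
  default         = c(TRUE, FALSE, FALSE)   # assumed: 'Both' shown in non-stratified contexts
)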
df_strata %>%
  select(-default) %>%
  arrange(attribute) %>%
  reactable(groupBy = "attribute")
2. Quality Control
2.1 Tests
The table below displays the results of our individual QC tests. If a test has failed, you can click its row to see a summary of the variables that failed.
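Each test_* function appears to share a common interface: it takes the compiled datasets and codebooks plus a role argument, returning a results list when role = 'test' and the passing rows when role = 'output' (used in section 2.2). A minimal sketch of that assumed contract, with placeholder test logic:

## Hypothetical test skeleton; the check itself is a made-up placeholder
test_no_missing_values = function(compiled_datasets, compiled_codebooks, role = 'test'){
  faulty = compiled_datasets %>% filter(is.na(value))
  if (role == 'test'){
    list(test   = 'no_missing_values',
         pass   = nrow(faulty) == 0,
         desc   = 'All rows have a non-missing value column (placeholder).',
         faulty = faulty)
  } else {
    ## role = 'output': return rows that pass, flagged for the downstream inner_join
    compiled_datasets %>% filter(!is.na(value)) %>% mutate(pass = TRUE)
  }
}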
tests = ls(pattern = 'test_')
results = map_df(tests, ~{
  result = get(.x)(compiled_datasets, compiled_codebooks, role = 'test')
  tibble(
    Test = result$test,
    Result = result$pass,
    Description = result$desc,
    faulty = list(result$faulty))}) %>%
  arrange(desc(Result))
index_faulty = which(!results$Result)
results %>%
  select(-faulty) %>%
  reactable(
    columns = list(
      Test = colDef(width = 100),
      Result = colDef(
        cell = function(value) qc_icon_formatter(value),
        width = 100)),
    details = function(index){
      if (index %in% index_faulty){
        nested_data <- results %>% slice(index) %>% pull(faulty) %>% .[[1]]
        is_nestable = nested_data %>% count(var_name) %>% pull(n) > 1
        index_name = nested_data %>% count(var_name) %>% pull(var_name)
        index_nestable = which(is_nestable)
        if (any(is_nestable)){
          nested_data1 = nested_data %>%
            select(file, var_name) %>%
            distinct()
        } else {
          nested_data1 = nested_data
        }
        htmltools::div(style = "padding: 16px",
          reactable(nested_data1,
            outlined = TRUE,
            resizable = TRUE,
            wrap = FALSE,
            bordered = TRUE,
            details = function(index){
              if (index %in% index_nestable){
                data_nested2 = nested_data %>%
                  filter(var_name == index_name[index]) %>%
                  select(-file, -var_name)
                htmltools::div(style = "padding: 16px",
                  reactable(data_nested2,
                    resizable = TRUE,
                    bordered = F,
                    wrap = FALSE)) }
            }) ) } })
2.2 Generate Outputs
We subset the data to rows that have passed all QC tests, then merge with codebook metadata. The resulting object 'cleaned_datasets' will be used for downstream datastore generation.
## Get outputs from each test
outputs = map(tests, ~{get(.x)(compiled_datasets, compiled_codebooks, role = 'output')})

## Keep only those rows that passed all tests and operationalize other columns
cleaned_datasets = purrr::reduce(outputs, dplyr::inner_join) %>%
  select(-pass) %>%
  ## operationalize value_type (categorical/discrete/continuous)
  group_by(var_name) %>%
  group_modify(~{
    values = .x$value
    .x %>% mutate(value_type = case_when(
      any(str_detect(values, "[:alpha:]")) ~ "categorical",
      any(str_detect(values, "\\.")) ~ "continuous",
      ## 'Minimim' (sic) retained to match a typo present in some source labels
      any(str_detect(var_label, c("Number", "Minimum", 'Minimim',
                                  "Maximum", "GDP per capita",
                                  'City foundation year'))) ~ "continuous",
      TRUE ~ "discrete"
    ))
  }) %>%
  ungroup() %>%
  ## Add var_name_base_labels
  mutate(var_name_base_label = case_when(
    n_attr == 0 ~ NA_character_,
    n_attr == 1 ~ attribute1,
    n_attr == 2 ~ glue("{attribute2} ({attribute1_value})") %>% paste0()
  )) %>%
  select(var_name, var_raw, geo, salid1, country, city, year, value, value_type,
         everything())
# {## Check value_type
# ## Distribution of value_type
# cleaned_datasets %>%
# count(var_name,value_type) %>%
# count(value_type)
# ## Get list of discrete variables
# cleaned_datasets %>%
# filter(value_type == 'discrete') %>%
# count(var_name, var_label) %>% View()
# }
#' Add longitudinal coding labels. This function categorizes variables based on
#' which years and countries are available. There are four categories:
#' 1. Longitudinal - when a variable is longitudinal.
#' 2. Completely non-longitudinal - when a variable only has a single year value.
#' 3. Functionally non-longitudinal - when a variable only has a single year value per country;
#'    this is for when data sources differ by a few years across countries. For example, life expectancy is
#'    '2010-2014' for El Salvador but '2012-2016' for all other countries.
#' 4. Mixed by Country - some countries are non-longitudinal but others have data available for multiple years.
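## As a toy illustration (hypothetical data) of how year availability maps onto
## these four categories, following the same rules as the case_when below.
## Note the real classification also consults the codebook's `longitudinal` flag.
tribble(
  ~var_name,  ~country, ~year,
  'v_long',   'AR',     2010,
  'v_long',   'AR',     2014,  # multiple years per country -> Longitudinal
  'v_single', 'AR',     2010,
  'v_single', 'MX',     2010,  # one year overall -> Completely non-longitudinal
  'v_func',   'AR',     2012,
  'v_func',   'MX',     2014,  # one year per country, differing years -> Functionally non-longitudinal
  'v_mixed',  'AR',     2010,
  'v_mixed',  'AR',     2014,
  'v_mixed',  'MX',     2010   # multiple years only for AR -> Mixed by Country
)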
## Prepare longitudinal_status
{
  xwalk_years_available = cleaned_datasets %>%
    select(var_name, subdomain, longitudinal, country, year) %>%
    distinct() %>%
    # filter(!longitudinal) %>%
    group_by(subdomain, var_name) %>%
    group_modify(~{
      # .x = cleaned_datasets %>%
      #   select(var_name, subdomain, longitudinal, country, year) %>%
      #   distinct() %>% filter(var_name == "LEALE")
      # .x = cleaned_datasets %>%
      #   select(var_name, subdomain, longitudinal, country, year) %>%
      #   distinct() %>% filter(var_name == "CNSMIGR_FORBORN")
      years_all = .x %>% count(year)
      years = unique(.x$year)
      table_year_country = .x %>% count(country)
      statusTmp = case_when(
        length(years) == 1 ~ "Completely non-longitudinal",
        all(table_year_country$n == 1) ~ "Functionally non-longitudinal",
        length(unique(.x$longitudinal)) > 1 ~ "Mixed by Country",
        unique(.x$longitudinal) ~ "Longitudinal",
        TRUE ~ "Mixed by Country"
      ) %>%
        unique()
      ## Check for non-numeric outliers
      if (is.na(min(as.numeric(years))) | is.na(max(as.numeric(years)))){
        stop("ERROR: Non-numeric year value!")
      }
      ## standardize year values for 'functionally non-longitudinal values'
      # year_min_tmp = ifelse(
      #   str_detect(statusTmp, 'non-longitudinal'),
      #   years_all %>% filter(n == max(n)) %>% slice(1) %>% pull(year) %>% as.numeric(),
      #   min(as.numeric(years))
      # )
      # year_max_tmp = ifelse(
      #   str_detect(statusTmp, 'non-longitudinal'),
      #   years_all %>% filter(n == max(n)) %>% slice(1) %>% pull(year) %>% as.numeric(),
      #   min(as.numeric(years))
      # )
      tibble(
        n_years = length(years),
        year_min = min(as.numeric(years)),
        year_max = max(as.numeric(years)),
        n_countries = length(unique(.x$country)),
        chr_years = paste0(years, collapse = ', '),
        longitudinal_status = statusTmp
      ) %>%
        mutate(years = list(as.numeric(unique(.x$year))))
    }) %>%
    ungroup() %>%
    select(var_name, longitudinal_status, n_years, years, year_min, year_max)
}
{
  xwalk_var_name_nested = cleaned_datasets %>% select(var_merge, var_name_nested) %>% distinct() %>%
    filter(!is.na(var_merge))
}
cleaned_codebooks = compiled_codebooks %>%
  filter(var_name %in% cleaned_datasets$var_name) %>%
  left_join(xwalk_years_available) %>%
  left_join(xwalk_var_name_nested)
## Check that nested var_name codebook entries have var_name_nested
checkCodebookNested = cleaned_codebooks %>%
  add_count(var_name) %>%
  select(var_name, var_merge, var_name_nested, n) %>%
  filter(n > 1) %>%
  arrange(desc(n)) %>%
  filter(is.na(var_merge) | is.na(var_name_nested)) %>%
  nrow() > 0
if (checkCodebookNested){
  stop("ERROR: Some nested codebook entries are missing var_merge/var_name_nested")
}
cleaned_datasets %>%
  slice(1:10) %>%
  reactable(resizable = TRUE, wrap = FALSE, bordered = TRUE)
2.3 Validation of Outputs
Another round of QC is run to validate the quality of our final output. The results of the validation are shown in the table below.
## Make sure that all tests pass
validation_results = map_df(tests, ~{
  result = get(.x)(cleaned_datasets, cleaned_codebooks, role = 'test')
  tibble(
    Test = result$test,
    Result = result$pass,
    Description = result$desc,
    faulty = list(result$faulty))})
validation_results %>%
  select(-faulty) %>%
  reactable(
    columns = list(
      Test = colDef(width = 100),
      Result = colDef(cell = function(value) qc_icon_formatter(value), width = 100)
    ))
3. Database Summary
n_variables = cleaned_datasets %>% count(var_name) %>% nrow()
n_datasets = nrow(df_sources)
From 16 dataset/codebook pairs we have cleaned 194 variables. Below are some metrics that summarize what is available in our compiled database.
3.1 Variable Count
How many variables are there for each dataset?
## Sankey
df_sankey_count = cleaned_datasets %>%
  count(domain, subdomain, var_name) %>%
  mutate(all = glue("Cleaned Variables ({n_variables})")) %>%
  select(all, domain, subdomain)
hchart(data_to_sankey(df_sankey_count), "sankey", name = "Variable Count")
3.2 Stratification
How much of our data is stratified?
df2a = cleaned_datasets %>%
  count(var_name, subdomain, domain, attribute1) %>%
  mutate(attribute1 = ifelse(is.na(attribute1), "Not Stratified", "Stratification Available")) %>%
  select(stratified = attribute1, domain)
# hchart(data_to_sankey(df2a), "sankey", name = "Variable Count")
table_strata_by_file = cleaned_datasets %>%
  count(domain, var_name, attribute1, attribute2) %>%
  mutate(stratified = ifelse(!is.na(attribute1), 'Available', 'Not Available')) %>%
  count(domain, stratified) %>%
  ## Order domain
  group_by(domain) %>%
  mutate(n_domain = sum(n)) %>%
  ungroup() %>%
  mutate(domain = fct_reorder(as.factor(domain), n_domain))
## b. Examine strata levels
{
  ## All strata vars
  dfa = cleaned_datasets %>%
    count(file, var_name, attribute1, attribute2) %>%
    filter(!is.na(attribute1)) %>%
    select(-n)
  ## Single attribute variables
  df1 = dfa %>%
    filter(is.na(attribute2)) %>%
    left_join(cleaned_datasets) %>%
    count(domain, var_name, attribute1, attribute1_value)
  # df1
  ## Double attribute variables
  df2 = dfa %>%
    filter(!is.na(attribute2)) %>%
    left_join(cleaned_datasets) %>%
    count(domain, var_name,
          attribute1, attribute1_value,
          attribute2, attribute2_value)
  # df2
}
## Plot
dfb = cleaned_datasets %>%
  mutate(stratified2 = case_when(
    is.na(attribute1) ~ 'Not Stratified',
    is.na(attribute2) ~ 'Single Stratification',
    TRUE ~ 'Double Stratification'
  )) %>%
  count(domain, var_name, stratified2, attribute1, attribute2) %>%
  rowwise() %>%
  mutate(attribute = paste(attribute1, attribute2, sep = '+') %>%
           str_remove_all('NA') %>%
           str_trim() %>%
           ifelse(str_sub(., -1L) == '+', str_sub(., 0, -2L), .) %>%
           ifelse(. == '', NA, .),
         stratified = ifelse(stratified2 == 'Not Stratified', 'Not Stratified', 'Stratified')) %>%
  ungroup() %>%
  select(domain, stratified2, attribute)
hchart(data_to_sankey(dfb), "sankey", name = "Stratification Count")
## Operationalize statistics
{
  n_var_with_strata = table_strata_by_file %>% filter(stratified == "Available") %>% pull(n) %>% sum()
  n_var_without_strata = table_strata_by_file %>% filter(stratified != "Available") %>% pull(n) %>% sum()
  n_var_with_1_strata = df1 %>% count(var_name) %>% nrow()
  n_var_with_2_strata = df2 %>% count(var_name) %>% nrow()
}
Out of 194 variables, only 23 have stratification available. The different strata available can be seen below.
3.3 Value Type
What type of data is available?
dfa = cleaned_datasets %>%
  count(var_name, value_type) %>%
  count(value_type)
dfa %>%
  reactable(
    details = function(index){
      dfa_tmp = cleaned_datasets %>%
        count(var_name, var_label, value_type) %>%
        filter(value_type == (dfa %>% slice(index) %>% pull(value_type)))
      htmltools::div(style = "padding: 16px",
        reactable(dfa_tmp,
          outlined = TRUE,
          resizable = TRUE,
          wrap = FALSE,
          bordered = TRUE) ) }
  )
3.4 Longitudinal Data
EDA of longitudinality in the SALURBAL data. Note that whether a variable is longitudinal is not available in the v1 codebooks. Pending the v2 codebooks, these values will be assigned by the groups themselves and thus should be more accurate. Right now only 9 variables from v2 codebooks (Air Pollution) have a DMC-assigned longitudinal value.
cleaned_codebooks %>%
  count(var_name, longitudinal) %>%
  tabyl(longitudinal) %>%
  reactable()
But we can back-calculate longitudinality from the data itself; for now we will use this 'longitudinal_status' as a metadata field for our datastore. Roughly 65% of the variables are not longitudinal. Here we document the four possible cases of longitudinality, which were coded for each variable based on year availability by country.
cleaned_codebooks %>%
  select(var_name, subdomain, longitudinal_status) %>%
  distinct() %>%
  tabyl(longitudinal_status) %>%
  arrange(desc(percent)) %>%
  reactable()
Longitudinal is when a variable is longitudinal (multiple years of data are available).
Completely non-longitudinal is when a variable has only a single year value.
Functionally non-longitudinal is when a variable has only a single year value per country; this covers cases where data sources differ by a few years across countries. For example, life expectancy is '2010-2014' for El Salvador but '2012-2016' for all other countries.
Mixed by Country is when a variable is identified as non-longitudinal but does not fall into the two expected cases above: it has a single year for some countries but multiple years for others. These come from two subdomains: 1) migration and 2) segregation. The distribution of data available by year and country for these two datasets is displayed below.
Segregation
Migration
Moving forward we need to make some decisions about how to address each of these cases during the checkout selection and download steps.