--- title: "SamplingStrata workflow with drake" subtitle: "Continuous method" author: "Giulio Barcaroli" date: "2025-03-02" # output: rmarkdown::html_vignette output: bookdown::html_document2: df_print: kable highlight: tango number_sections: yes theme: flatly toc: yes toc_depth: 2 toc_float: collapsed: no smooth_scroll: yes toc_title: spsur vs spse # code_folding: hide vignette: > %\VignetteIndexEntry{SamplingStrata workflow with drake} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} \usepackage[utf8]{inputenc} --- # Workflow definition First we define packages: ``` r library(drake) library(SamplingStrata) ``` then 'swissmunicipalities' example data (with only 2 domains): ``` r data(swissmunicipalities) swissmun <- swissmunicipalities[swissmunicipalities$REG < 3, c("REG","COM","Nom","HApoly", "Surfacesbois","Surfacescult", "Airbat","POPTOT")] head(swissmun) #> REG COM Nom HApoly Surfacesbois Surfacescult Airbat POPTOT #> 2 1 6621 Geneve 1593 67 31 773 177964 #> 4 2 351 Bern 5162 1726 1041 1070 128634 #> 5 1 5586 Lausanne 4136 1635 714 856 124914 #> 9 2 371 Biel (BE) 2123 976 196 463 48655 #> 10 2 942 Thun 2158 425 694 523 40377 #> 11 2 355 Koniz 5099 1567 2621 515 37782 ``` Finally, the plan: ``` r plan <- drake_plan( # Definition of the sampling frame frame = buildFrameDF( df = swissmun, id = "COM", domainvalue= "REG", X = c("HApoly","POPTOT"), Y =c("Surfacesbois", "Airbat")), # Definition of precision constraints cv = as.data.frame(list( DOM = rep("DOM1",length(unique(swissmun$REG))), CV1 = rep(0.10,length(unique(swissmun$REG))), CV2 = rep(0.10,length(unique(swissmun$REG))), domainvalue= c(1:length(unique(swissmun$REG))))), # Solution with kmeans kmean = KmeansSolution2(frame=frame, errors=cv, maxclusters=5), # Determination of number of strata for each domain nstrat = tapply(kmean$suggestions, kmean$domainvalue, FUN=function(x) length(unique(x))), # Optimisation solution = optimStrata(method="continuous", framesamp=frame, errors= cv, nStrat=nstrat, iter = 25, pops= 10), # Optimal strata strata_structure = summaryStrata(solution$framenew, solution$aggr_strata, progress=FALSE), # Expected CVs with optimal strata expected_CV1 = expected_CV(solution$aggr_strata), # Adjust allocation with affordable sample size sample_size = 150, adjustedStrata = adjustSize(size=sample_size,strata=solution$aggr_strata,cens=NULL), # New expected CVs expected_CV2 = expected_CV(adjustedStrata), # Evaluation by simulation eval = evalSolution(frame = solution$framenew, outstrata = adjustedStrata, nsampl = 500, progress = FALSE), # Final selection of the sample sample = selectSample(solution$framenew,adjustedStrata) ) ``` # Execution of the workflow We have just defined this plan: ``` r print(plan) #> # A tibble: 12 × 2 #> target command #> #> 1 frame buildFrameDF(df = swissmun, id = "COM", domainvalue = "REG", X = c("HApoly", "POPTOT"), Y = c("Surfacesbois", "Airbat")) #> 2 cv as.data.frame(list(DOM = rep("DOM1", length(unique(swissmun$REG))), CV1 = rep(0.1, length(unique(swissmun$REG))), CV2 = rep(0.1, length(unique(swissmun$REG))), domainvalue = c(1:length(unique(swissmun$REG))))) #> 3 kmean KmeansSolution2(frame = frame, errors = cv, maxclusters = 5) #> 4 nstrat tapply(kmean$suggestions, kmean$domainvalue, FUN = function(x) length(unique(x))) #> 5 solution optimStrata(method = "continuous", framesamp = frame, errors = cv, nStrat = nstrat, iter = 25, pops = 10) #> 6 strata_structure summaryStrata(solution$framenew, solution$aggr_strata, progress = FALSE) #> 7 expected_CV1 expected_CV(solution$aggr_strata) #> 8 sample_size 150 #> 9 adjustedStrata adjustSize(size = sample_size, strata = solution$aggr_strata, cens = NULL) #> 10 expected_CV2 expected_CV(adjustedStrata) #> 11 eval evalSolution(frame = solution$framenew, outstrata = adjustedStrata, nsampl = 500, progress = FALSE) #> 12 sample selectSample(solution$framenew, adjustedStrata) ``` visualised in this way: ``` r vis_drake_graph(plan, ncol_legend = NULL, main="Optimization with 'continuous' method") #> ℹ unloading 3 targets from environment ``` ![plot of chunk graph](figure/graph-1.png) and now we execute: ``` r make(plan) #> ✔ All targets are already up to date. ``` # Output inspection First of all, the total size of the optimized sample: ``` r options(width = 999) loadd(solution) sum(solution$aggr_strata$SOLUZ) #> [1] 97.61437 ``` Then, the structure of the optimized strata: ``` r loadd(strata_structure) strata_structure #> Domain Stratum Population Allocation SamplingRate Lower_X1 Upper_X1 Lower_X2 Upper_X2 #> 1 1 1 369 15 0.040455 32 1212 27 2171 #> 2 1 2 89 12 0.137139 129 1604 195 10955 #> 3 1 3 118 24 0.203822 239 9692 78 29559 #> 4 1 4 9 3 0.324100 9922 20994 255 7515 #> 5 1 5 4 3 0.778272 1593 28225 5988 177964 #> 6 2 1 544 10 0.019213 34 972 30 1841 #> 7 2 2 277 13 0.045605 152 2417 84 6798 #> 8 2 3 55 6 0.109353 421 4609 332 16757 #> 9 2 4 37 11 0.305442 930 20079 272 128634 ``` The expected CVs given the optimized strata: ``` r loadd(expected_CV1) expected_CV1 #> cv(Y1) cv(Y2) #> DOM1 0.1 0.1 #> DOM2 0.1 0.1 ``` and after the adjustment of the allocation in the strata based on the affordable sample size: ``` r loadd(expected_CV2) expected_CV2 #> cv(Y1) cv(Y2) #> DOM1 0.0762015 0.0755376 #> DOM2 0.0799250 0.0780134 ``` The results of simulation with the adjusted allocation: ``` r loadd(eval) eval$coeff_var #> CV1 CV2 dom #> 1 0.0762 0.0771 DOM1 #> 2 0.0828 0.0797 DOM2 eval$rel_bias #> y1 y2 dom #> 1 0.0019 0.0045 DOM1 #> 2 -0.0097 0.0019 DOM2 ``` Finally, the selection of the sample from the adjusted strata: ``` r loadd(sample) head(sample) #> DOMAINVALUE STRATO ID X1 X2 Y1 Y2 LABEL WEIGHTS FPC #> 1 1 1 5494 1108 828 411 24 1 16.04348 0.06233062 #> 2 1 1 5527 368 888 73 36 1 16.04348 0.06233062 #> 3 1 1 6061 502 290 194 13 1 16.04348 0.06233062 #> 4 1 1 5923 360 170 100 9 1 16.04348 0.06233062 #> 5 1 1 5501 385 813 49 28 1 16.04348 0.06233062 #> 6 1 1 5858 269 322 34 9 1 16.04348 0.06233062 ```