In [1]:
import sys
import os
from pathlib import Path
import narwhals as nw
import polars as pl

from survey_kit.utilities.dataframe import summary
from survey_kit.imputation.srmi import SRMI
from survey_kit import logger, config

from survey_kit.statistics.multiple_imputation import MultipleImputation, mi_ses_from_function
from survey_kit.statistics.calculator import StatCalculator
from survey_kit.statistics.statistics import Statistics
from survey_kit.statistics.replicates import Replicates
from survey_kit.utilities.random import RandomData
from survey_kit.utilities.dataframe import safe_height
from survey_kit.statistics.bootstrap import bayes_bootstrap
from survey_kit.utilities.random import set_seed, generate_seed
In [2]:
logger.info("Load the SRMI results from that tutorial")
path_model = f"{config.path_temp_files}/py_srmi_test_gbm"
srmi = SRMI.load(path_model)
df_implicates = srmi.df_implicates
Load the SRMI results from that tutorial
In [3]:
logger.info("Draw bootstrap weights")
set_seed(8345)
n_rows = safe_height(df_implicates[0])
n_replicates = 10

df_weights = (
        bayes_bootstrap(
        n_rows=n_rows,
        n_draws=n_replicates+1,
        seed=generate_seed(),
        prefix="weight_",
        initial_weight_index=0
    )
    .with_row_index("index")
)
Draw bootstrap weights
In [4]:
logger.info("What's the data look like")
_ = df_implicates.pipe(summary)
_ = summary(df_weights)
What's the data look like
┌─────────────┬────────┬─────────────┬────────────┬─────────────┬────────────┬──────────────┐
│    Variable ┆      n ┆ n (missing) ┆       mean ┆         std ┆        min ┆          max │
╞═════════════╪════════╪═════════════╪════════════╪═════════════╪════════════╪══════════════╡
│       index ┆ 10,000 ┆           0 ┆    4,999.5 ┆ 2,886.89568 ┆        0.0 ┆      9,999.0 │
│ _row_index_ ┆ 10,000 ┆           0 ┆    4,999.5 ┆ 2,886.89568 ┆        0.0 ┆      9,999.0 │
│        year ┆ 10,000 ┆           0 ┆ 2,017.9851 ┆    1.415937 ┆    2,016.0 ┆      2,020.0 │
│       month ┆ 10,000 ┆           0 ┆     6.5137 ┆    3.432141 ┆        1.0 ┆         12.0 │
│        var2 ┆ 10,000 ┆           0 ┆     4.9782 ┆    3.154508 ┆        0.0 ┆         10.0 │
│        var3 ┆ 10,000 ┆           0 ┆    25.1084 ┆   14.752302 ┆        0.0 ┆         50.0 │
│        var4 ┆ 10,000 ┆           0 ┆   0.505666 ┆    0.287861 ┆   0.000027 ┆     0.999997 │
│        var5 ┆ 10,000 ┆           0 ┆     0.4999 ┆    0.500025 ┆        0.0 ┆          1.0 │
│ unrelated_1 ┆ 10,000 ┆           0 ┆   0.502449 ┆    0.288359 ┆   0.000119 ┆     0.999997 │
│ unrelated_2 ┆ 10,000 ┆           0 ┆   0.500105 ┆    0.287638 ┆   0.000049 ┆     0.999539 │
│ unrelated_3 ┆ 10,000 ┆           0 ┆   0.499175 ┆     0.28876 ┆   0.000129 ┆      0.99994 │
│ unrelated_4 ┆ 10,000 ┆           0 ┆   0.500655 ┆    0.288698 ┆   0.000133 ┆     0.999972 │
│ unrelated_5 ┆ 10,000 ┆           0 ┆    0.49876 ┆    0.288979 ┆   0.000071 ┆     0.999867 │
│    repeat_1 ┆ 10,000 ┆           0 ┆   0.502449 ┆    0.288359 ┆   0.000119 ┆     0.999997 │
│    var_gbm3 ┆ 10,000 ┆       1,212 ┆  -2.721713 ┆   10.901329 ┆ -55.354108 ┆    26.213084 │
│    var_gbm2 ┆ 10,000 ┆           0 ┆  -2.361046 ┆   10.164599 ┆ -55.354108 ┆    26.213084 │
│    var_gbm1 ┆ 10,000 ┆           0 ┆     0.5278 ┆    0.499252 ┆        0.0 ┆          1.0 │
│   var_gbm12 ┆ 10,000 ┆           0 ┆  -2.361046 ┆   10.164599 ┆ -55.354108 ┆    26.213084 │
│ var_gbm2_sq ┆ 10,000 ┆           0 ┆ 108.883286 ┆   269.40042 ┆        0.0 ┆ 3,064.077326 │
└─────────────┴────────┴─────────────┴────────────┴─────────────┴────────────┴──────────────┘
┌─────────────┬────────┬─────────────┬────────────┬─────────────┬────────────┬──────────────┐
│    Variable ┆      n ┆ n (missing) ┆       mean ┆         std ┆        min ┆          max │
╞═════════════╪════════╪═════════════╪════════════╪═════════════╪════════════╪══════════════╡
│       index ┆ 10,000 ┆           0 ┆    4,999.5 ┆ 2,886.89568 ┆        0.0 ┆      9,999.0 │
│ _row_index_ ┆ 10,000 ┆           0 ┆    4,999.5 ┆ 2,886.89568 ┆        0.0 ┆      9,999.0 │
│        year ┆ 10,000 ┆           0 ┆ 2,017.9851 ┆    1.415937 ┆    2,016.0 ┆      2,020.0 │
│       month ┆ 10,000 ┆           0 ┆     6.5137 ┆    3.432141 ┆        1.0 ┆         12.0 │
│        var2 ┆ 10,000 ┆           0 ┆     4.9782 ┆    3.154508 ┆        0.0 ┆         10.0 │
│        var3 ┆ 10,000 ┆           0 ┆    25.1084 ┆   14.752302 ┆        0.0 ┆         50.0 │
│        var4 ┆ 10,000 ┆           0 ┆   0.505666 ┆    0.287861 ┆   0.000027 ┆     0.999997 │
│        var5 ┆ 10,000 ┆           0 ┆     0.4999 ┆    0.500025 ┆        0.0 ┆          1.0 │
│ unrelated_1 ┆ 10,000 ┆           0 ┆   0.502449 ┆    0.288359 ┆   0.000119 ┆     0.999997 │
│ unrelated_2 ┆ 10,000 ┆           0 ┆   0.500105 ┆    0.287638 ┆   0.000049 ┆     0.999539 │
│ unrelated_3 ┆ 10,000 ┆           0 ┆   0.499175 ┆     0.28876 ┆   0.000129 ┆      0.99994 │
│ unrelated_4 ┆ 10,000 ┆           0 ┆   0.500655 ┆    0.288698 ┆   0.000133 ┆     0.999972 │
│ unrelated_5 ┆ 10,000 ┆           0 ┆    0.49876 ┆    0.288979 ┆   0.000071 ┆     0.999867 │
│    repeat_1 ┆ 10,000 ┆           0 ┆   0.502449 ┆    0.288359 ┆   0.000119 ┆     0.999997 │
│    var_gbm3 ┆ 10,000 ┆       1,219 ┆  -2.692408 ┆   10.886586 ┆ -55.354108 ┆    26.213084 │
│    var_gbm2 ┆ 10,000 ┆           0 ┆  -2.299835 ┆   10.146886 ┆ -55.354108 ┆    26.213084 │
│    var_gbm1 ┆ 10,000 ┆           0 ┆     0.5284 ┆    0.499218 ┆        0.0 ┆          1.0 │
│   var_gbm12 ┆ 10,000 ┆           0 ┆  -2.299835 ┆   10.146886 ┆ -55.354108 ┆    26.213084 │
│ var_gbm2_sq ┆ 10,000 ┆           0 ┆ 108.238244 ┆  271.231778 ┆        0.0 ┆ 3,064.077326 │
└─────────────┴────────┴─────────────┴────────────┴─────────────┴────────────┴──────────────┘
┌───────────┬────────┬─────────────┬──────────┬─────────────┬──────────┬───────────┐
│  Variable ┆      n ┆ n (missing) ┆     mean ┆         std ┆      min ┆       max │
╞═══════════╪════════╪═════════════╪══════════╪═════════════╪══════════╪═══════════╡
│     index ┆ 10,000 ┆           0 ┆  4,999.5 ┆ 2,886.89568 ┆      0.0 ┆   9,999.0 │
│  weight_0 ┆ 10,000 ┆           0 ┆ 1.010169 ┆    1.023118 ┆ 0.000036 ┆ 10.815955 │
│  weight_1 ┆ 10,000 ┆           0 ┆ 1.004312 ┆    0.993021 ┆ 0.000056 ┆  9.189042 │
│  weight_2 ┆ 10,000 ┆           0 ┆ 0.998768 ┆    1.004974 ┆ 0.000021 ┆  8.742044 │
│  weight_3 ┆ 10,000 ┆           0 ┆ 1.007674 ┆    1.005936 ┆ 0.000021 ┆  9.887328 │
│  weight_4 ┆ 10,000 ┆           0 ┆ 0.994573 ┆    0.994362 ┆ 0.000085 ┆ 10.624148 │
│  weight_5 ┆ 10,000 ┆           0 ┆ 0.993123 ┆    0.989687 ┆ 0.000082 ┆  8.626601 │
│  weight_6 ┆ 10,000 ┆           0 ┆ 1.009294 ┆    1.007408 ┆ 0.000073 ┆ 10.097413 │
│  weight_7 ┆ 10,000 ┆           0 ┆ 0.976332 ┆    0.965082 ┆ 0.000392 ┆  9.029346 │
│  weight_8 ┆ 10,000 ┆           0 ┆ 1.006129 ┆    1.000921 ┆ 0.000003 ┆  9.427705 │
│  weight_9 ┆ 10,000 ┆           0 ┆ 1.002251 ┆    0.997762 ┆ 0.000133 ┆ 10.577949 │
│ weight_10 ┆ 10,000 ┆           0 ┆ 1.018912 ┆    1.016059 ┆ 0.000344 ┆ 12.969568 │
└───────────┴────────┴─────────────┴──────────┴─────────────┴──────────┴───────────┘
In [5]:
logger.info("What statistics do I want:")
logger.info("   In this case, the mean of all variables that start with var_")
stats = Statistics(
    stats=["mean"],
    columns="var_*",
)
What statistics do I want:
   In this case, the mean of all variables that start with var_
In [6]:
logger.info("Define the 'replicate' object, which tell is what the weight variables are")
replicates = Replicates(weight_stub="weight_", n_replicates=n_replicates)
Define the 'replicate' object, which tell is what the weight variables are
In [7]:
logger.info("Arguments that are getting passed to StatCalculator at each run")
arguments = dict(
    statistics=stats,
    replicates=replicates
)
Arguments that are getting passed to StatCalculator at each run
In [8]:
logger.info("Get the multiple imputation standard errofs by calling StatCalculator")
logger.info("   for each implicate and each replicate factor")
logger.info("   If you had 100 bootstrap weights and 5 imputation draws (implicates)")
logger.info("   the StatCalculator calculation would run 5*100=500 times")
mi_results_seq = mi_ses_from_function(
    delegate=StatCalculator,
    df_implicates=df_implicates,
    df_noimputes=df_weights,
    index=["index"],
    arguments=arguments,
    join_on=["Variable"],
    parallel=False,
)
Get the multiple imputation standard errofs by calling StatCalculator
   for each implicate and each replicate factor
   If you had 100 bootstrap weights and 5 imputation draws (implicates)
   the StatCalculator calculation would run 5*100=500 times
Implicate #1
0
.
.
.
.
5
.
.
.
.
10
┌─────────────┬────────────┐
│    Variable ┆       mean │
╞═════════════╪════════════╡
│    var_gbm3 ┆  -2.549985 │
│             ┆   0.396227 │
│    var_gbm2 ┆  -2.161733 │
│             ┆   0.423325 │
│    var_gbm1 ┆   0.521376 │
│             ┆   0.017829 │
│   var_gbm12 ┆  -2.161733 │
│             ┆   0.423325 │
│ var_gbm2_sq ┆ 107.251698 │
│             ┆   6.245331 │
└─────────────┴────────────┘
Implicate #2
0
.
.
.
.
5
.
.
.
.
10
┌─────────────┬────────────┐
│    Variable ┆       mean │
╞═════════════╪════════════╡
│    var_gbm3 ┆   -2.49392 │
│             ┆   0.431025 │
│    var_gbm2 ┆  -2.083911 │
│             ┆   0.437701 │
│    var_gbm1 ┆   0.520732 │
│             ┆   0.021626 │
│   var_gbm12 ┆  -2.083911 │
│             ┆   0.437701 │
│ var_gbm2_sq ┆ 106.290287 │
│             ┆   6.783125 │
└─────────────┴────────────┘

In [9]:
logger.info("Do it again, but run each implicate in it's own process and collect the results")
logger.info("   This will split up the job and ")
logger.info("   use all your cpus (or what you set in config.cpus), dividing them up among the jobs")
mi_results = mi_ses_from_function(
    delegate=StatCalculator,
    df_implicates=df_implicates,
    df_noimputes=df_weights,
    index=["index"],
    arguments=arguments,
    join_on=["Variable"],
    parallel=True,
)
Do it again, but run each implicate in it's own process and collect the results
   This will split up the job and 
   use all your cpus (or what you set in config.cpus), dividing them up among the jobs
Defaulting to shell for parallel estimation

Starting call:

Starting call at 0 seconds
      Because:    RunBecause.SetToRun
Function: _mi_ses_from_function_one_implicate
language: Languages.Python

BEGIN CODE

from survey_kit.orchestration.config import Config
Config().cpus = 1
from survey_kit.utilities.logging import set_logging, PrintLogger
set_logging(path_log='.',force=True,to_console=False,append_to_file=True,name='.')
capture_prints_log = PrintLogger('.',to_console=False)
from survey_kit.statistics.multiple_imputation import _mi_ses_from_function_one_implicate
from survey_kit.serializable import SerializableDictionary
d_implicate = SerializableDictionary.load(path='C:/Users/jonro/OneDrive/Documents/Coding/survey_kit/.scratch/temp_files/ac2i3ibe')
_mi_ses_from_function_one_implicate(implicate_number=0,path_save='C:/Users/jonro/OneDrive/Documents/Coding/survey_kit/.scratch/temp_files/ac2i3ibe_implicate_0',**d_implicate)


END CODE



Starting call at 0 seconds
      Because:    RunBecause.SetToRun
Function: _mi_ses_from_function_one_implicate
language: Languages.Python

BEGIN CODE

from survey_kit.orchestration.config import Config
Config().cpus = 1
from survey_kit.utilities.logging import set_logging, PrintLogger
set_logging(path_log='.',force=True,to_console=False,append_to_file=True,name='.')
capture_prints_log = PrintLogger('.',to_console=False)
from survey_kit.statistics.multiple_imputation import _mi_ses_from_function_one_implicate
from survey_kit.serializable import SerializableDictionary
d_implicate = SerializableDictionary.load(path='C:/Users/jonro/OneDrive/Documents/Coding/survey_kit/.scratch/temp_files/ac2i3ibe')
_mi_ses_from_function_one_implicate(implicate_number=1,path_save='C:/Users/jonro/OneDrive/Documents/Coding/survey_kit/.scratch/temp_files/ac2i3ibe_implicate_1',**d_implicate)


END CODE


.
.
.
.
.
.
.
.
.
.
.
.
.
LOGGING INFORMATION FOR:
    FILE:       C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files\001__mi_ses_from_function_one_implicate_utvdwij4.py
PRINT OUTPUT BEGIN



PRINT OUTPUT END


LOGGING INFORMATION FOR:
    FILE:       C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files\002__mi_ses_from_function_one_implicate_eynb07th.py
PRINT OUTPUT BEGIN



PRINT OUTPUT END



Complete=True
Removing existing directory C:/Users/jonro/OneDrive/Documents/Coding/survey_kit/.scratch/temp_files/ac2i3ibe_implicate_0.replicate_stats
Removing existing directory C:/Users/jonro/OneDrive/Documents/Coding/survey_kit/.scratch/temp_files/ac2i3ibe_implicate_1.replicate_stats
Removing existing directory C:/Users/jonro/OneDrive/Documents/Coding/survey_kit/.scratch/temp_files/ac2i3ibe.dict
In [10]:
logger.info("The sequential (non-parallel) job results")
mi_results.print()
The sequential (non-parallel) job results
┌─────────────┬────────────┐
│    Variable ┆       mean │
╞═════════════╪════════════╡
│    var_gbm3 ┆  -2.521952 │
│             ┆   0.416829 │
│    var_gbm2 ┆  -2.122822 │
│             ┆   0.435816 │
│    var_gbm1 ┆   0.521054 │
│             ┆   0.019827 │
│   var_gbm12 ┆  -2.122822 │
│             ┆   0.435816 │
│ var_gbm2_sq ┆ 106.770992 │
│             ┆   6.572724 │
└─────────────┴────────────┘
In [11]:
logger.info("The parallel job results")
mi_results_seq.print()
The parallel job results
┌─────────────┬────────────┐
│    Variable ┆       mean │
╞═════════════╪════════════╡
│    var_gbm3 ┆  -2.521952 │
│             ┆   0.416829 │
│    var_gbm2 ┆  -2.122822 │
│             ┆   0.435816 │
│    var_gbm1 ┆   0.521054 │
│             ┆   0.019827 │
│   var_gbm12 ┆  -2.122822 │
│             ┆   0.435816 │
│ var_gbm2_sq ┆ 106.770992 │
│             ┆   6.572724 │
└─────────────┴────────────┘
In [12]:
logger.info("Save them for later")
mi_results.save(f"{config.path_temp_files}/mi_sequential")
mi_results_seq.save(f"{config.path_temp_files}/mi_parallel")

del mi_results
del mi_results_seq
Save them for later
Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/mi_sequential.mi
Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/mi_parallel.mi
In [13]:
logger.info("Load them back up")
mi_results = MultipleImputation.load(f"{config.path_temp_files}/mi_sequential")
mi_results_seq = MultipleImputation.load(f"{config.path_temp_files}/mi_parallel")
Load them back up
In [14]:
logger.info("Compare them")
d_comparison = mi_results.compare(mi_results_seq)
logger.info("   Print the mi_results_seq - mi_results")
d_comparison["difference"].print()
logger.info("   Print the (mi_results_seq-mi_results)/mi_results")
d_comparison["ratio"].print()
Compare them
   Print the mi_results_seq - mi_results
┌─────────────┬──────┐
│    Variable ┆ mean │
╞═════════════╪══════╡
│    var_gbm3 ┆  0.0 │
│             ┆  0.0 │
│    var_gbm2 ┆  0.0 │
│             ┆  0.0 │
│    var_gbm1 ┆  0.0 │
│             ┆  0.0 │
│   var_gbm12 ┆  0.0 │
│             ┆  0.0 │
│ var_gbm2_sq ┆  0.0 │
│             ┆  0.0 │
└─────────────┴──────┘
   Print the (mi_results_seq-mi_results)/mi_results
┌─────────────┬──────┐
│    Variable ┆ mean │
╞═════════════╪══════╡
│    var_gbm3 ┆  0.0 │
│             ┆  0.0 │
│    var_gbm2 ┆  0.0 │
│             ┆  0.0 │
│    var_gbm1 ┆  0.0 │
│             ┆  0.0 │
│   var_gbm12 ┆  0.0 │
│             ┆  0.0 │
│ var_gbm2_sq ┆  0.0 │
│             ┆  0.0 │
└─────────────┴──────┘