import sys
import os
from pathlib import Path
import narwhals as nw
import polars as pl

from survey_kit.utilities.dataframe import summary
from survey_kit.imputation.srmi import SRMI
from survey_kit import logger, config

from survey_kit.statistics.multiple_imputation import (
    MultipleImputation,
    mi_ses_from_function,
)
from survey_kit.statistics.calculator import StatCalculator
from survey_kit.statistics.statistics import Statistics
from survey_kit.statistics.replicates import Replicates
from survey_kit.utilities.random import RandomData
from survey_kit.utilities.dataframe import safe_height
from survey_kit.statistics.bootstrap import bayes_bootstrap
from survey_kit.utilities.random import set_seed, generate_seed

logger.info("Load the SRMI results from that tutorial")
path_model = f"{config.path_temp_files}/py_srmi_test_gbm"
srmi = SRMI.load(path_model)
df_implicates = srmi.df_implicates

Load the SRMI results from that tutorial

logger.info("Draw bootstrap weights")
set_seed(8345)
n_rows = safe_height(df_implicates[0])
n_replicates = 10

df_weights = bayes_bootstrap(
    n_rows=n_rows,
    n_draws=n_replicates + 1,
    seed=generate_seed(),
    prefix="weight_",
    initial_weight_index=0,
).with_row_index("index")

Draw bootstrap weights

logger.info("What's the data look like")
_ = df_implicates.pipe(summary)
_ = summary(df_weights)

What's the data look like

┌─────────────┬────────┬─────────────┬────────────┬─────────────┬────────────┬──────────────┐
│    Variable ┆      n ┆ n (missing) ┆       mean ┆         std ┆        min ┆          max │
╞═════════════╪════════╪═════════════╪════════════╪═════════════╪════════════╪══════════════╡
│       index ┆ 10,000 ┆           0 ┆    4,999.5 ┆ 2,886.89568 ┆        0.0 ┆      9,999.0 │
│ _row_index_ ┆ 10,000 ┆           0 ┆    4,999.5 ┆ 2,886.89568 ┆        0.0 ┆      9,999.0 │
│        year ┆ 10,000 ┆           0 ┆ 2,017.9851 ┆    1.415937 ┆    2,016.0 ┆      2,020.0 │
│       month ┆ 10,000 ┆           0 ┆     6.5137 ┆    3.432141 ┆        1.0 ┆         12.0 │
│        var2 ┆ 10,000 ┆           0 ┆     4.9782 ┆    3.154508 ┆        0.0 ┆         10.0 │
│        var3 ┆ 10,000 ┆           0 ┆    25.1084 ┆   14.752302 ┆        0.0 ┆         50.0 │
│        var4 ┆ 10,000 ┆           0 ┆   0.505666 ┆    0.287861 ┆   0.000027 ┆     0.999997 │
│ unrelated_1 ┆ 10,000 ┆           0 ┆   0.502449 ┆    0.288359 ┆   0.000119 ┆     0.999997 │
│ unrelated_2 ┆ 10,000 ┆           0 ┆   0.500105 ┆    0.287638 ┆   0.000049 ┆     0.999539 │
│ unrelated_3 ┆ 10,000 ┆           0 ┆   0.499175 ┆     0.28876 ┆   0.000129 ┆      0.99994 │
│ unrelated_4 ┆ 10,000 ┆           0 ┆   0.500655 ┆    0.288698 ┆   0.000133 ┆     0.999972 │
│ unrelated_5 ┆ 10,000 ┆           0 ┆    0.49876 ┆    0.288979 ┆   0.000071 ┆     0.999867 │
│    repeat_1 ┆ 10,000 ┆           0 ┆   0.502449 ┆    0.288359 ┆   0.000119 ┆     0.999997 │
│    var_gbm2 ┆ 10,000 ┆           0 ┆  -2.405997 ┆   10.199102 ┆ -55.354108 ┆    26.213084 │
│    var_gbm3 ┆ 10,000 ┆           0 ┆  -2.362588 ┆    10.14679 ┆ -55.354108 ┆    26.213084 │
│   var_gbm12 ┆ 10,000 ┆           0 ┆  -2.405997 ┆   10.199102 ┆ -55.354108 ┆    26.213084 │
│ var_gbm2_sq ┆ 10,000 ┆           0 ┆ 109.800094 ┆  271.148843 ┆        0.0 ┆ 3,064.077326 │
│        var5 ┆ 10,000 ┆           0 ┆     0.4999 ┆    0.500025 ┆        0.0 ┆          1.0 │
│    var_gbm1 ┆ 10,000 ┆           0 ┆     0.5272 ┆    0.499285 ┆        0.0 ┆          1.0 │
└─────────────┴────────┴─────────────┴────────────┴─────────────┴────────────┴──────────────┘

┌─────────────┬────────┬─────────────┬────────────┬─────────────┬────────────┬──────────────┐
│    Variable ┆      n ┆ n (missing) ┆       mean ┆         std ┆        min ┆          max │
╞═════════════╪════════╪═════════════╪════════════╪═════════════╪════════════╪══════════════╡
│       index ┆ 10,000 ┆           0 ┆    4,999.5 ┆ 2,886.89568 ┆        0.0 ┆      9,999.0 │
│ _row_index_ ┆ 10,000 ┆           0 ┆    4,999.5 ┆ 2,886.89568 ┆        0.0 ┆      9,999.0 │
│        year ┆ 10,000 ┆           0 ┆ 2,017.9851 ┆    1.415937 ┆    2,016.0 ┆      2,020.0 │
│       month ┆ 10,000 ┆           0 ┆     6.5137 ┆    3.432141 ┆        1.0 ┆         12.0 │
│        var2 ┆ 10,000 ┆           0 ┆     4.9782 ┆    3.154508 ┆        0.0 ┆         10.0 │
│        var3 ┆ 10,000 ┆           0 ┆    25.1084 ┆   14.752302 ┆        0.0 ┆         50.0 │
│        var4 ┆ 10,000 ┆           0 ┆   0.505666 ┆    0.287861 ┆   0.000027 ┆     0.999997 │
│ unrelated_1 ┆ 10,000 ┆           0 ┆   0.502449 ┆    0.288359 ┆   0.000119 ┆     0.999997 │
│ unrelated_2 ┆ 10,000 ┆           0 ┆   0.500105 ┆    0.287638 ┆   0.000049 ┆     0.999539 │
│ unrelated_3 ┆ 10,000 ┆           0 ┆   0.499175 ┆     0.28876 ┆   0.000129 ┆      0.99994 │
│ unrelated_4 ┆ 10,000 ┆           0 ┆   0.500655 ┆    0.288698 ┆   0.000133 ┆     0.999972 │
│ unrelated_5 ┆ 10,000 ┆           0 ┆    0.49876 ┆    0.288979 ┆   0.000071 ┆     0.999867 │
│    repeat_1 ┆ 10,000 ┆           0 ┆   0.502449 ┆    0.288359 ┆   0.000119 ┆     0.999997 │
│    var_gbm2 ┆ 10,000 ┆           0 ┆  -2.334157 ┆   10.148004 ┆ -55.354108 ┆    26.213084 │
│    var_gbm3 ┆ 10,000 ┆           0 ┆  -2.322527 ┆   10.098587 ┆ -55.354108 ┆    26.213084 │
│   var_gbm12 ┆ 10,000 ┆           0 ┆  -2.334157 ┆   10.148004 ┆ -55.354108 ┆    26.213084 │
│ var_gbm2_sq ┆ 10,000 ┆           0 ┆ 108.419985 ┆  266.593764 ┆        0.0 ┆ 3,064.077326 │
│        var5 ┆ 10,000 ┆           0 ┆     0.4999 ┆    0.500025 ┆        0.0 ┆          1.0 │
│    var_gbm1 ┆ 10,000 ┆           0 ┆     0.5276 ┆    0.499263 ┆        0.0 ┆          1.0 │
└─────────────┴────────┴─────────────┴────────────┴─────────────┴────────────┴──────────────┘

┌───────────┬────────┬─────────────┬──────────┬─────────────┬──────────┬───────────┐
│  Variable ┆      n ┆ n (missing) ┆     mean ┆         std ┆      min ┆       max │
╞═══════════╪════════╪═════════════╪══════════╪═════════════╪══════════╪═══════════╡
│     index ┆ 10,000 ┆           0 ┆  4,999.5 ┆ 2,886.89568 ┆      0.0 ┆   9,999.0 │
│  weight_0 ┆ 10,000 ┆           0 ┆ 1.010169 ┆    1.023118 ┆ 0.000036 ┆ 10.815955 │
│  weight_1 ┆ 10,000 ┆           0 ┆ 1.004312 ┆    0.993021 ┆ 0.000056 ┆  9.189042 │
│  weight_2 ┆ 10,000 ┆           0 ┆ 0.998768 ┆    1.004974 ┆ 0.000021 ┆  8.742044 │
│  weight_3 ┆ 10,000 ┆           0 ┆ 1.007674 ┆    1.005936 ┆ 0.000021 ┆  9.887328 │
│  weight_4 ┆ 10,000 ┆           0 ┆ 0.994573 ┆    0.994362 ┆ 0.000085 ┆ 10.624148 │
│  weight_5 ┆ 10,000 ┆           0 ┆ 0.993123 ┆    0.989687 ┆ 0.000082 ┆  8.626601 │
│  weight_6 ┆ 10,000 ┆           0 ┆ 1.009294 ┆    1.007408 ┆ 0.000073 ┆ 10.097413 │
│  weight_7 ┆ 10,000 ┆           0 ┆ 0.976332 ┆    0.965082 ┆ 0.000392 ┆  9.029346 │
│  weight_8 ┆ 10,000 ┆           0 ┆ 1.006129 ┆    1.000921 ┆ 0.000003 ┆  9.427705 │
│  weight_9 ┆ 10,000 ┆           0 ┆ 1.002251 ┆    0.997762 ┆ 0.000133 ┆ 10.577949 │
│ weight_10 ┆ 10,000 ┆           0 ┆ 1.018912 ┆    1.016059 ┆ 0.000344 ┆ 12.969568 │
└───────────┴────────┴─────────────┴──────────┴─────────────┴──────────┴───────────┘

logger.info("What statistics do I want:")
logger.info("   In this case, the mean of all variables that start with var_")
stats = Statistics(
    stats=["mean"],
    columns="var_*",
)

What statistics do I want:

   In this case, the mean of all variables that start with var_

logger.info(
    "Define the 'replicate' object, which tell is what the weight variables are"
)
replicates = Replicates(weight_stub="weight_", n_replicates=n_replicates)

Define the 'replicate' object, which tell is what the weight variables are

logger.info("Arguments that are getting passed to StatCalculator at each run")
arguments = dict(statistics=stats, replicates=replicates)

Arguments that are getting passed to StatCalculator at each run

logger.info("Get the multiple imputation standard errofs by calling StatCalculator")
logger.info("   for each implicate and each replicate factor")
logger.info("   If you had 100 bootstrap weights and 5 imputation draws (implicates)")
logger.info("   the StatCalculator calculation would run 5*100=500 times")
mi_results_seq = mi_ses_from_function(
    delegate=StatCalculator,
    df_implicates=df_implicates,
    df_noimputes=df_weights,
    index=["index"],
    arguments=arguments,
    join_on=["Variable"],
    parallel=False,
)

Get the multiple imputation standard errofs by calling StatCalculator

   for each implicate and each replicate factor

   If you had 100 bootstrap weights and 5 imputation draws (implicates)

   the StatCalculator calculation would run 5*100=500 times

Implicate #1

0

.

.

.

.

logger.info(
    "Do it again, but run each implicate in it's own process and collect the results"
)
logger.info("   This will split up the job and ")
logger.info(
    "   use all your cpus (or what you set in config.cpus), dividing them up among the jobs"
)
mi_results = mi_ses_from_function(
    delegate=StatCalculator,
    df_implicates=df_implicates,
    df_noimputes=df_weights,
    index=["index"],
    arguments=arguments,
    join_on=["Variable"],
    parallel=True,
)

Do it again, but run each implicate in it's own process and collect the results

   This will split up the job and

   use all your cpus (or what you set in config.cpus), dividing them up among the jobs

Defaulting to shell for parallel estimation


Starting call:

Starting call at 0 seconds

      Because:    RunBecause.SetToRun

Function: _mi_ses_from_function_one_implicate
language: Languages.Python

BEGIN CODE

from survey_kit.orchestration.config import Config
Config().cpus = 1
from survey_kit.utilities.logging import set_logging, PrintLogger
set_logging(path_log='.',force=True,to_console=False,append_to_file=True,name='.')
capture_prints_log = PrintLogger('.',to_console=False)
from survey_kit.statistics.multiple_imputation import _mi_ses_from_function_one_implicate
from survey_kit.serializable import SerializableDictionary
d_implicate = SerializableDictionary.load(path='C:/Users/jonro/OneDrive/Documents/Coding/survey_kit/.scratch/temp_files/wsrqn3wx')
_mi_ses_from_function_one_implicate(implicate_number=0,path_save='C:/Users/jonro/OneDrive/Documents/Coding/survey_kit/.scratch/temp_files/wsrqn3wx_implicate_0',**d_implicate)


END CODE

logger.info("The sequential (non-parallel) job results")
mi_results.print()

The sequential (non-parallel) job results

┌─────────────┬────────────┐
│    Variable ┆       mean │
╞═════════════╪════════════╡
│    var_gbm2 ┆  -2.179618 │
│             ┆   0.388301 │
│    var_gbm3 ┆  -2.177784 │
│             ┆   0.359662 │
│   var_gbm12 ┆  -2.179618 │
│             ┆   0.388301 │
│ var_gbm2_sq ┆ 107.157191 │
│             ┆   6.795533 │
│    var_gbm1 ┆   0.520542 │
│             ┆   0.019674 │
└─────────────┴────────────┘

logger.info("The parallel job results")
mi_results_seq.print()

The parallel job results

┌─────────────┬────────────┐
│    Variable ┆       mean │
╞═════════════╪════════════╡
│    var_gbm2 ┆  -2.179618 │
│             ┆   0.388301 │
│    var_gbm3 ┆  -2.177784 │
│             ┆   0.359662 │
│   var_gbm12 ┆  -2.179618 │
│             ┆   0.388301 │
│ var_gbm2_sq ┆ 107.157191 │
│             ┆   6.795533 │
│    var_gbm1 ┆   0.520542 │
│             ┆   0.019674 │
└─────────────┴────────────┘

logger.info("Save them for later")
mi_results.save(f"{config.path_temp_files}/mi_sequential")
mi_results_seq.save(f"{config.path_temp_files}/mi_parallel")

del mi_results
del mi_results_seq

Save them for later

Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/mi_sequential.mi

Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/mi_parallel.mi

logger.info("Load them back up")
mi_results = MultipleImputation.load(f"{config.path_temp_files}/mi_sequential")
mi_results_seq = MultipleImputation.load(f"{config.path_temp_files}/mi_parallel")

Load them back up

logger.info("Compare them")
d_comparison = mi_results.compare(mi_results_seq)
logger.info("   Print the mi_results_seq - mi_results")
d_comparison["difference"].print()
logger.info("   Print the (mi_results_seq-mi_results)/mi_results")
d_comparison["ratio"].print()

Compare them

   Print the mi_results_seq - mi_results

┌─────────────┬──────┐
│    Variable ┆ mean │
╞═════════════╪══════╡
│    var_gbm2 ┆  0.0 │
│             ┆  0.0 │
│    var_gbm3 ┆  0.0 │
│             ┆  0.0 │
│   var_gbm12 ┆  0.0 │
│             ┆  0.0 │
│ var_gbm2_sq ┆  0.0 │
│             ┆  0.0 │
│    var_gbm1 ┆  0.0 │
│             ┆  0.0 │
└─────────────┴──────┘

   Print the (mi_results_seq-mi_results)/mi_results

┌─────────────┬──────┐
│    Variable ┆ mean │
╞═════════════╪══════╡
│    var_gbm2 ┆  0.0 │
│             ┆  0.0 │
│    var_gbm3 ┆  0.0 │
│             ┆  0.0 │
│   var_gbm12 ┆  0.0 │
│             ┆  0.0 │
│ var_gbm2_sq ┆  0.0 │
│             ┆  0.0 │
│    var_gbm1 ┆  0.0 │
│             ┆  0.0 │
└─────────────┴──────┘