from __future__ import annotations

import polars as pl

from survey_kit.utilities.dataframe import summary, join_wrapper
from survey_kit.imputation.srmi import SRMI
from survey_kit import logger, config

from survey_kit.statistics.multiple_imputation import mi_ses_from_function
from survey_kit.statistics.calculator import StatCalculator
from survey_kit.statistics.replicates import Replicates
from survey_kit.utilities.dataframe import safe_height
from survey_kit.statistics.bootstrap import bayes_bootstrap

logger.info("Load the data from the SRMI tutorial")
path_model = f"{config.path_temp_files}/py_srmi_test_gbm"
srmi = SRMI.load(path_model)
df_implicates = srmi.df_implicates

Load the data from the SRMI tutorial

logger.info("Make random weights for the bootstrap")
n_rows = safe_height(df_implicates[0])
n_replicates = 10

df_weights = bayes_bootstrap(
    n_rows=n_rows,
    n_draws=n_replicates + 1,
    seed=8345,
    prefix="weight_",
    initial_weight_index=0,
).with_row_index("index")

Make random weights for the bootstrap

logger.info("What's the data look like")
logger.info("   The SRMI data - both implicates")
_ = df_implicates.pipe(summary)
logger.info("   The weights")
_ = summary(df_weights)

What's the data look like

   The SRMI data - both implicates

┌─────────────┬────────┬─────────────┬────────────┬─────────────┬────────────┬──────────────┐
│    Variable ┆      n ┆ n (missing) ┆       mean ┆         std ┆        min ┆          max │
╞═════════════╪════════╪═════════════╪════════════╪═════════════╪════════════╪══════════════╡
│       index ┆ 10,000 ┆           0 ┆    4,999.5 ┆ 2,886.89568 ┆        0.0 ┆      9,999.0 │
│ _row_index_ ┆ 10,000 ┆           0 ┆    4,999.5 ┆ 2,886.89568 ┆        0.0 ┆      9,999.0 │
│        year ┆ 10,000 ┆           0 ┆ 2,017.9851 ┆    1.415937 ┆    2,016.0 ┆      2,020.0 │
│       month ┆ 10,000 ┆           0 ┆     6.5137 ┆    3.432141 ┆        1.0 ┆         12.0 │
│        var2 ┆ 10,000 ┆           0 ┆     4.9782 ┆    3.154508 ┆        0.0 ┆         10.0 │
│        var3 ┆ 10,000 ┆           0 ┆    25.1084 ┆   14.752302 ┆        0.0 ┆         50.0 │
│        var4 ┆ 10,000 ┆           0 ┆   0.505666 ┆    0.287861 ┆   0.000027 ┆     0.999997 │
│ unrelated_1 ┆ 10,000 ┆           0 ┆   0.502449 ┆    0.288359 ┆   0.000119 ┆     0.999997 │
│ unrelated_2 ┆ 10,000 ┆           0 ┆   0.500105 ┆    0.287638 ┆   0.000049 ┆     0.999539 │
│ unrelated_3 ┆ 10,000 ┆           0 ┆   0.499175 ┆     0.28876 ┆   0.000129 ┆      0.99994 │
│ unrelated_4 ┆ 10,000 ┆           0 ┆   0.500655 ┆    0.288698 ┆   0.000133 ┆     0.999972 │
│ unrelated_5 ┆ 10,000 ┆           0 ┆    0.49876 ┆    0.288979 ┆   0.000071 ┆     0.999867 │
│    repeat_1 ┆ 10,000 ┆           0 ┆   0.502449 ┆    0.288359 ┆   0.000119 ┆     0.999997 │
│    var_gbm2 ┆ 10,000 ┆           0 ┆  -2.405997 ┆   10.199102 ┆ -55.354108 ┆    26.213084 │
│    var_gbm3 ┆ 10,000 ┆           0 ┆  -2.362588 ┆    10.14679 ┆ -55.354108 ┆    26.213084 │
│   var_gbm12 ┆ 10,000 ┆           0 ┆  -2.405997 ┆   10.199102 ┆ -55.354108 ┆    26.213084 │
│ var_gbm2_sq ┆ 10,000 ┆           0 ┆ 109.800094 ┆  271.148843 ┆        0.0 ┆ 3,064.077326 │
│        var5 ┆ 10,000 ┆           0 ┆     0.4999 ┆    0.500025 ┆        0.0 ┆          1.0 │
│    var_gbm1 ┆ 10,000 ┆           0 ┆     0.5272 ┆    0.499285 ┆        0.0 ┆          1.0 │
└─────────────┴────────┴─────────────┴────────────┴─────────────┴────────────┴──────────────┘

┌─────────────┬────────┬─────────────┬────────────┬─────────────┬────────────┬──────────────┐
│    Variable ┆      n ┆ n (missing) ┆       mean ┆         std ┆        min ┆          max │
╞═════════════╪════════╪═════════════╪════════════╪═════════════╪════════════╪══════════════╡
│       index ┆ 10,000 ┆           0 ┆    4,999.5 ┆ 2,886.89568 ┆        0.0 ┆      9,999.0 │
│ _row_index_ ┆ 10,000 ┆           0 ┆    4,999.5 ┆ 2,886.89568 ┆        0.0 ┆      9,999.0 │
│        year ┆ 10,000 ┆           0 ┆ 2,017.9851 ┆    1.415937 ┆    2,016.0 ┆      2,020.0 │
│       month ┆ 10,000 ┆           0 ┆     6.5137 ┆    3.432141 ┆        1.0 ┆         12.0 │
│        var2 ┆ 10,000 ┆           0 ┆     4.9782 ┆    3.154508 ┆        0.0 ┆         10.0 │
│        var3 ┆ 10,000 ┆           0 ┆    25.1084 ┆   14.752302 ┆        0.0 ┆         50.0 │
│        var4 ┆ 10,000 ┆           0 ┆   0.505666 ┆    0.287861 ┆   0.000027 ┆     0.999997 │
│ unrelated_1 ┆ 10,000 ┆           0 ┆   0.502449 ┆    0.288359 ┆   0.000119 ┆     0.999997 │
│ unrelated_2 ┆ 10,000 ┆           0 ┆   0.500105 ┆    0.287638 ┆   0.000049 ┆     0.999539 │
│ unrelated_3 ┆ 10,000 ┆           0 ┆   0.499175 ┆     0.28876 ┆   0.000129 ┆      0.99994 │
│ unrelated_4 ┆ 10,000 ┆           0 ┆   0.500655 ┆    0.288698 ┆   0.000133 ┆     0.999972 │
│ unrelated_5 ┆ 10,000 ┆           0 ┆    0.49876 ┆    0.288979 ┆   0.000071 ┆     0.999867 │
│    repeat_1 ┆ 10,000 ┆           0 ┆   0.502449 ┆    0.288359 ┆   0.000119 ┆     0.999997 │
│    var_gbm2 ┆ 10,000 ┆           0 ┆  -2.334157 ┆   10.148004 ┆ -55.354108 ┆    26.213084 │
│    var_gbm3 ┆ 10,000 ┆           0 ┆  -2.322527 ┆   10.098587 ┆ -55.354108 ┆    26.213084 │
│   var_gbm12 ┆ 10,000 ┆           0 ┆  -2.334157 ┆   10.148004 ┆ -55.354108 ┆    26.213084 │
│ var_gbm2_sq ┆ 10,000 ┆           0 ┆ 108.419985 ┆  266.593764 ┆        0.0 ┆ 3,064.077326 │
│        var5 ┆ 10,000 ┆           0 ┆     0.4999 ┆    0.500025 ┆        0.0 ┆          1.0 │
│    var_gbm1 ┆ 10,000 ┆           0 ┆     0.5276 ┆    0.499263 ┆        0.0 ┆          1.0 │
└─────────────┴────────┴─────────────┴────────────┴─────────────┴────────────┴──────────────┘

   The weights

┌───────────┬────────┬─────────────┬──────────┬─────────────┬──────────┬───────────┐
│  Variable ┆      n ┆ n (missing) ┆     mean ┆         std ┆      min ┆       max │
╞═══════════╪════════╪═════════════╪══════════╪═════════════╪══════════╪═══════════╡
│     index ┆ 10,000 ┆           0 ┆  4,999.5 ┆ 2,886.89568 ┆      0.0 ┆   9,999.0 │
│  weight_0 ┆ 10,000 ┆           0 ┆ 0.996552 ┆    0.988544 ┆ 0.000027 ┆  9.548572 │
│  weight_1 ┆ 10,000 ┆           0 ┆ 0.994871 ┆    1.013359 ┆ 0.000087 ┆   9.40968 │
│  weight_2 ┆ 10,000 ┆           0 ┆ 1.013333 ┆    0.990934 ┆ 0.000017 ┆ 10.246139 │
│  weight_3 ┆ 10,000 ┆           0 ┆ 1.015993 ┆    1.017586 ┆ 0.000047 ┆ 10.149365 │
│  weight_4 ┆ 10,000 ┆           0 ┆ 1.000038 ┆    1.003878 ┆  0.00001 ┆  8.684298 │
│  weight_5 ┆ 10,000 ┆           0 ┆ 1.015135 ┆    1.023968 ┆ 0.000229 ┆  9.941725 │
│  weight_6 ┆ 10,000 ┆           0 ┆ 1.001943 ┆    0.990688 ┆ 0.000108 ┆  9.909087 │
│  weight_7 ┆ 10,000 ┆           0 ┆ 1.003338 ┆    0.992315 ┆  0.00015 ┆ 10.318354 │
│  weight_8 ┆ 10,000 ┆           0 ┆ 1.000687 ┆    1.001554 ┆ 0.000318 ┆  11.09299 │
│  weight_9 ┆ 10,000 ┆           0 ┆ 0.991478 ┆    1.005367 ┆ 0.000052 ┆  8.802607 │
│ weight_10 ┆ 10,000 ┆           0 ┆ 0.983393 ┆    0.980691 ┆ 0.000005 ┆  8.388088 │
└───────────┴────────┴─────────────┴──────────┴─────────────┴──────────┴───────────┘

logger.info("To run an arbitrary stat, you need")
logger.info(
    "   any function that takes a dataframe and a weight and returns a dataframe"
)
logger.info("The function will be run for each boostrap/replicate weight")
logger.info("   Note: you may need to import packages within the function")
logger.info("   if executed in parallel (it will get pickled and reloaded)")


def run_regression(
    df: pl.LazyFrame | pl.DataFrame, y: str, X: list[str], weight: str = ""
) -> pl.LazyFrame | pl.DataFrame:
    import polars as pl
    from sklearn.linear_model import LinearRegression

    df = df.lazy().collect()
    model = LinearRegression()
    d_weight = {}
    if weight != "":
        d_weight["sample_weight"] = df[weight]

    model.fit(X=df.select(X), y=df.select(y), **d_weight)

    df_betas = pl.DataFrame(
        dict(
            Variable=df.select(X).lazy().collect_schema().names() + ["_Intercept_"],
            Beta=[
                float(vali)
                for vali in [float(coefi) for coefi in model.coef_[0]]
                + [float(model.intercept_[0])]
            ],
        )
    )

    return df_betas

To run an arbitrary stat, you need

   any function that takes a dataframe and a weight and returns a dataframe

The function will be run for each boostrap/replicate weight

   Note: you may need to import packages within the function

   if executed in parallel (it will get pickled and reloaded)

logger.info("The betas from the regression (no weights, implicate 0)")
y = "var_gbm2"
X = ["var2", "var3", "var4", "var5"]
df_betas = run_regression(df=df_implicates[0], y=y, X=X)
logger.info(df_betas)

The betas from the regression (no weights, implicate 0)

shape: (5, 2)
┌─────────────┬───────────┐
│ Variable    ┆ Beta      │
│ ---         ┆ ---       │
│ str         ┆ f64       │
╞═════════════╪═══════════╡
│ var2        ┆ 0.583644  │
│ var3        ┆ -0.330792 │
│ var4        ┆ -13.05279 │
│ var5        ┆ 3.005503  │
│ _Intercept_ ┆ 8.092053  │
└─────────────┴───────────┘

logger.info("Confirming how to run it on one implicate")
arguments_sc = dict(
    y=y,
    X=X,
)
replicates = Replicates(
    weight_stub="weight_", n_replicates=n_replicates, bootstrap=True
)
sc_compare = StatCalculator.from_function(
    run_regression,
    df=join_wrapper(df_implicates[0], df_weights, on=["index"], how="left"),
    estimate_ids=["Variable"],
    arguments=arguments_sc,
    replicates=replicates,
    display=False,
)
sc_compare.print()

Confirming how to run it on one implicate

Running run_regression

0

.

.

.

.

5

.

.

logger.info("Now the full function")
logger.info("   Note the nested arguments...")
logger.info("       Maybe not the best API, but it does mean")
logger.info("       that you can run ANYTHING that takes in a")
logger.info("       dataframe and a weight and returns a dataframe of estiamtes")
logger.info("        and run it through the bootstrap + MI process")
arguments_sc = dict(
    y=y,
    X=X,
)
arguments_mi = dict(
    delegate=run_regression,
    arguments=arguments_sc,
    estimate_ids=["Variable"],
    replicates=replicates,
)

mi_results_seq = mi_ses_from_function(
    delegate=StatCalculator.from_function,
    df_implicates=df_implicates,
    path_srmi=path_model,
    df_noimputes=df_weights,
    index=["index"],
    arguments=arguments_mi,
    join_on=["Variable"],
    parallel=False,
)

mi_results_seq.print()

Now the full function

   Note the nested arguments...

       Maybe not the best API, but it does mean

       that you can run ANYTHING that takes in a

       dataframe and a weight and returns a dataframe of estiamtes

        and run it through the bootstrap + MI process

Implicate #1

Running run_regression

0

.

logger.info("For parallel, it's better to pass the path of the model")
logger.info("   To avoid unnecessary I/O")
logger.info("   (For sequential it doesn't really matter)")
mi_results = mi_ses_from_function(
    delegate=StatCalculator.from_function,
    # df_implicates=df_implicates,
    path_srmi=path_model,
    df_noimputes=df_weights,
    index=["index"],
    arguments=arguments_mi,
    join_on=["Variable"],
    parallel=True,
)

For parallel, it's better to pass the path of the model

   To avoid unnecessary I/O

   (For sequential it doesn't really matter)

Defaulting to shell for parallel estimation


Starting call:

Starting call at 0 seconds

      Because:    RunBecause.SetToRun

Function: _mi_ses_from_function_one_implicate
language: Languages.Python

BEGIN CODE

from survey_kit.orchestration.config import Config
Config().cpus = 1
from survey_kit.utilities.logging import set_logging, PrintLogger
set_logging(path_log='.',force=True,to_console=False,append_to_file=True,name='.')
capture_prints_log = PrintLogger('.',to_console=False)
from survey_kit.statistics.multiple_imputation import _mi_ses_from_function_one_implicate
from survey_kit.serializable import SerializableDictionary
d_implicate = SerializableDictionary.load(path='C:/Users/jonro/OneDrive/Documents/Coding/survey_kit/.scratch/temp_files/3wtpdirw')
_mi_ses_from_function_one_implicate(implicate_number=0,path_save='C:/Users/jonro/OneDrive/Documents/Coding/survey_kit/.scratch/temp_files/3wtpdirw_implicate_0',**d_implicate)


END CODE

logger.info("The results")
mi_results.print()
mi_results_seq.print()

The results

┌─────────────┬────────────┐
│    Variable ┆       Beta │
╞═════════════╪════════════╡
│        var2 ┆   0.573306 │
│             ┆   0.080237 │
│        var3 ┆  -0.333865 │
│             ┆   0.015985 │
│        var4 ┆ -13.123991 │
│             ┆   0.596763 │
│        var5 ┆   2.755537 │
│             ┆   0.707092 │
│ _Intercept_ ┆   8.422204 │
│             ┆   1.035693 │
└─────────────┴────────────┘

┌─────────────┬────────────┐
│    Variable ┆       Beta │
╞═════════════╪════════════╡
│        var2 ┆   0.573306 │
│             ┆   0.080237 │
│        var3 ┆  -0.333865 │
│             ┆   0.015985 │
│        var4 ┆ -13.123991 │
│             ┆   0.596763 │
│        var5 ┆   2.755537 │
│             ┆   0.707092 │
│ _Intercept_ ┆   8.422204 │
│             ┆   1.035693 │
└─────────────┴────────────┘

logger.info("Compare two sets of results")
d_comparison = mi_results.compare(mi_results_seq)
d_comparison["difference"].print()
d_comparison["ratio"].print()

Compare two sets of results

┌─────────────┬──────┐
│    Variable ┆ Beta │
╞═════════════╪══════╡
│        var2 ┆  0.0 │
│             ┆  0.0 │
│        var3 ┆  0.0 │
│             ┆  0.0 │
│        var4 ┆  0.0 │
│             ┆  0.0 │
│        var5 ┆  0.0 │
│             ┆  0.0 │
│ _Intercept_ ┆  0.0 │
│             ┆  0.0 │
└─────────────┴──────┘

┌─────────────┬──────┐
│    Variable ┆ Beta │
╞═════════════╪══════╡
│        var2 ┆  0.0 │
│             ┆  0.0 │
│        var3 ┆  0.0 │
│             ┆  0.0 │
│        var4 ┆  0.0 │
│             ┆  0.0 │
│        var5 ┆  0.0 │
│             ┆  0.0 │
│ _Intercept_ ┆  0.0 │
│             ┆  0.0 │
└─────────────┴──────┘