import sys
import os
from pathlib import Path

import narwhals as nw
import polars as pl
import polars.selectors as cs

from survey_kit.utilities.random import RandomData
from survey_kit.utilities.dataframe import summary

from survey_kit.imputation.variable import Variable
from survey_kit.imputation.parameters import Parameters
from survey_kit.imputation.srmi import SRMI
from survey_kit.imputation.selection import Selection
import survey_kit.imputation.utilities.lightgbm_wrapper as rep_lgbm
from survey_kit.imputation.utilities.lightgbm_wrapper import Tuner_optuna

from survey_kit import logger, config
from survey_kit.utilities.dataframe import summary, columns_from_list

# Draw some random data

n_rows = 10_000
impute_share = 0.25


df = (
    RandomData(n_rows=n_rows, seed=32565437)
    .index("index")
    .integer("year", 2016, 2020)
    .integer("month", 1, 12)
    .integer("var2", 0, 10)
    .integer("var3", 0, 50)
    .float("var4", 0, 1)
    .integer("var5", 0, 1)
    .float("unrelated_1", 0, 1)
    .float("unrelated_2", 0, 1)
    .float("unrelated_3", 0, 1)
    .float("unrelated_4", 0, 1)
    .float("unrelated_5", 0, 1)
    .np_distribution("epsilon_gbm1", "normal", scale=5)
    .np_distribution("epsilon_gbm2", "normal", scale=5)
    .np_distribution("epsilon_gbm3", "normal", scale=5)
    .float("missing_gbm1", 0, 1)
    .float("missing_gbm2", 0, 1)
    .float("missing_gbm3", 0, 1)
    .to_df()
)


#   Convenience references to them for creating dependent variables
c_var2 = pl.col("var2")
c_var3 = pl.col("var3")
c_var4 = pl.col("var4")
c_var5 = pl.col("var5")

c_e_gbm1 = pl.col("epsilon_gbm1")
c_e_gbm2 = pl.col("epsilon_gbm2")


#   Convenience references to them for creating dependent variables
c_var2 = pl.col("var2")
c_var3 = pl.col("var3")
c_var4 = pl.col("var4")
c_var5 = pl.col("var5")


logger.info("var_gbm1 is binary and conditional on other variables")
c_gbm1 = ((c_var2 * 2 - c_var3 * 3 * c_var5 + c_e_gbm1)  > 0).alias("var_gbm1")

logger.info("var_gbm2 is != 0 only if var_gbm1 == True")
c_gbm2 = (
    pl.when(pl.col("var_gbm1"))
      .then(((c_var2 * 1.5 - c_var3 * 1 * c_var4 + c_e_gbm2)))
      .otherwise(pl.lit(0))
      .alias("var_gbm2")
)

c_gbm3 = (
    pl.when(pl.col("var_gbm1"))
      .then(((c_var2 * 1.5 - c_var3 * 1 * c_var4 + c_e_gbm2)))
      .otherwise(pl.lit(0))
      .alias("var_gbm3")
)
#   Create a bunch of variables that are functions of the variables created above
df = (
    df.with_columns(c_gbm1)
    .with_columns(c_gbm2, c_gbm3)
    .drop(columns_from_list(df=df, columns="epsilon*"))
    .with_row_index(name="_row_index_")
)
df_original = df

#   Set variables to missing according to the uniform random variables missing_
clear_missing = []
for prefixi in ["gbm"]:
    for i in range(1, 4):
        vari = f"var_{prefixi}{i}"
        missingi = f"missing_{prefixi}{i}"

        clear_missing.append(
            pl.when(pl.col(missingi) < impute_share)
            .then(pl.lit(None))
            .otherwise(pl.col(vari))
            .alias(vari)
        )
df = df.with_columns(clear_missing).drop(cs.starts_with("missing_"))

#   Make a fully collinear var for testing
df = df.with_columns(pl.col("unrelated_1").alias("repeat_1"))


summary(df)

var_gbm1 is binary and conditional on other variables

var_gbm2 is != 0 only if var_gbm1 == True

┌─────────────┬────────┬─────────────┬────────────┬─────────────┬────────────┬───────────┐
│    Variable ┆      n ┆ n (missing) ┆       mean ┆         std ┆        min ┆       max │
╞═════════════╪════════╪═════════════╪════════════╪═════════════╪════════════╪═══════════╡
│ _row_index_ ┆ 10,000 ┆           0 ┆    4,999.5 ┆ 2,886.89568 ┆        0.0 ┆   9,999.0 │
│       index ┆ 10,000 ┆           0 ┆    4,999.5 ┆ 2,886.89568 ┆        0.0 ┆   9,999.0 │
│        year ┆ 10,000 ┆           0 ┆ 2,017.9851 ┆    1.415937 ┆    2,016.0 ┆   2,020.0 │
│       month ┆ 10,000 ┆           0 ┆     6.5137 ┆    3.432141 ┆        1.0 ┆      12.0 │
│        var2 ┆ 10,000 ┆           0 ┆     4.9782 ┆    3.154508 ┆        0.0 ┆      10.0 │
│        var3 ┆ 10,000 ┆           0 ┆    25.1084 ┆   14.752302 ┆        0.0 ┆      50.0 │
│        var4 ┆ 10,000 ┆           0 ┆   0.505666 ┆    0.287861 ┆   0.000027 ┆  0.999997 │
│        var5 ┆ 10,000 ┆           0 ┆     0.4999 ┆    0.500025 ┆        0.0 ┆       1.0 │
│ unrelated_1 ┆ 10,000 ┆           0 ┆   0.502449 ┆    0.288359 ┆   0.000119 ┆  0.999997 │
│ unrelated_2 ┆ 10,000 ┆           0 ┆   0.500105 ┆    0.287638 ┆   0.000049 ┆  0.999539 │
│ unrelated_3 ┆ 10,000 ┆           0 ┆   0.499175 ┆     0.28876 ┆   0.000129 ┆   0.99994 │
│ unrelated_4 ┆ 10,000 ┆           0 ┆   0.500655 ┆    0.288698 ┆   0.000133 ┆  0.999972 │
│ unrelated_5 ┆ 10,000 ┆           0 ┆    0.49876 ┆    0.288979 ┆   0.000071 ┆  0.999867 │
│    var_gbm1 ┆ 10,000 ┆       2,483 ┆   0.526008 ┆    0.499356 ┆        0.0 ┆       1.0 │
│    var_gbm2 ┆ 10,000 ┆       2,464 ┆  -2.393041 ┆   10.384672 ┆ -55.354108 ┆ 26.213084 │
│    var_gbm3 ┆ 10,000 ┆       2,596 ┆  -2.500425 ┆   10.365353 ┆ -55.354108 ┆ 26.213084 │
│    repeat_1 ┆ 10,000 ┆           0 ┆   0.502449 ┆    0.288359 ┆   0.000119 ┆  0.999997 │
└─────────────┴────────┴─────────────┴────────────┴─────────────┴────────────┴───────────┘

logger.info("Define some dummy functions to run after imputation of 2")
#   Test a simple pre-post function
#       These would get run gets run in each iteration (in each implicate)
#           before (preFunctions) or after (postFunctions) this variable is imputed
#   Notes for these functions:
#       1) No type hints on imported package types (will throw an error)
#           i.e. no df:pl.DataFrame or -> pl.DataFrame
#       2) Must be completely self-contained (i.e. all imports within the function)
#           This has to do with how it gets saved and loaded in async calls
#       3) Effectively, you have to assume it'll be called
#           in an environment with no imports before it
def square_var(df, var_to_square: str, name: str):
    import narwhals as nw

    return (
        nw.from_native(df)
        .with_columns((nw.col(var_to_square) ** 2).alias(name))
        .to_native()
    )


def recalculate_interaction(df, var1: str, var2: str, name: str):
    import narwhals as nw

    return (
        nw.from_native(df)
        .with_columns((nw.col(var1) * nw.col(var2)).alias(name))
        .to_native()
    )

Define some dummy functions to run after imputation of 2

logger.info("Set up hyperparameter tuning")
tuner = Tuner_optuna(
    n_trials=50, objective=rep_lgbm.Tuner.Objectives.mae, test_size=0.25
)

logger.info("   Set the tuner parameters to the defaults")
tuner.parameters()

logger.info("   Then specify ranges to check between as follow")
tuner.hyperparameters["num_leaves"] = [2, 256]
tuner.hyperparameters["max_depth"] = [2, 256]
tuner.hyperparameters["min_data_in_leaf"] = [10, 250]
tuner.hyperparameters["num_iterations"] = [25, 200]
tuner.hyperparameters["bagging_fraction"] = [0.5, 1]
tuner.hyperparameters["bagging_freq"] = [1, 5]




vars_impute = []

Set up hyperparameter tuning

   Set the tuner parameters to the defaults

Setting default optuna sampler: TPESampler

[I 2025-11-07 14:22:32,308] A new study created in memory with name: no-name-66ac4f39-3d4e-4306-a054-c37b1c0b6041

   Then specify ranges to check between as follow

logger.info("Impute the boolean variable (var_gbm1)")
logger.info("   to the default setup for predicted mean matching")
logger.info("   using lightgbm")
logger.info("   (you can pass a formula, but you don't need to)")

logger.info("First, set up the lightgbm parameters")
logger.info("   This says, do hyperparameter tuning first (tune)")
logger.info("   Redo it at each run (tune_overwrite)")
logger.info("   And sets the lightgbm parameter defaults (parameters) that the tuning can overwrite")
parameters_lgbm1 = Parameters.LightGBM(
    tune=True,
    tune_hyperparameter_path=f"{config.data_root}/tuner_outputs",
    tuner=tuner,
    tune_overwrite=True,
    parameters={
        "objective": "binary",
        "num_leaves": 32,
        "min_data_in_leaf": 20,
        "num_iterations": 100,
        "test_size": 0.2,
        "boosting": "gbdt",
        "categorical_feature": ["var5"],
        "verbose": -1,  # ,
    },
    error=Parameters.ErrorDraw.pmm,
)


logger.info("Actually define the variable and the model")
v_gbm1 = Variable(
    impute_var="var_gbm1",
    model=["var_*", "var4", "var3", "var5", "unrelated_*", "repeat_*"],
    modeltype=Variable.ModelType.LightGBM,
    parameters=parameters_lgbm1
)
logger.info("Add the variable to the list to be imputed")
vars_impute.append(v_gbm1)






logger.info("Impute the continuous variable (var_gbm2) ")
logger.info("   conditional on var_gbm1, using narwhals (nw.col('var_gbm1'))")
logger.info("   as well as a post-processing edit to set var_gbm2=0 when var_gbm1==0")
logger.info("   and some other random post-processing")
logger.info("Different parameters for the continuous variable")
parameters_lgbm2 = Parameters.LightGBM(
    tune=True,
    tune_hyperparameter_path=f"{config.data_root}/tuner_outputs",
    tuner=tuner,
    tune_overwrite=True,
    parameters={
        "objective": "regression",
        "num_leaves": 32,
        "min_data_in_leaf": 20,
        "num_iterations": 100,
        "test_size": 0.2,
        "boosting": "gbdt",
        "categorical_feature": ["var5"],
        "verbose": -1,  # ,
    },
    error=Parameters.ErrorDraw.pmm,
)

v_gbm2 = Variable(
    impute_var="var_gbm2",
    Where=nw.col("var_gbm1"),
    #   Needed in case var_gbm1 changes between iterations
    Where_predict=(nw.col("var_gbm2") != 0),
    model=["var_*", "var4", "var3", "var5", "unrelated_*", "repeat_*"],
    modeltype=Variable.ModelType.LightGBM,
    parameters=parameters_lgbm2,
    postFunctions=[
        (
            nw.when(nw.col("var_gbm1"))
            .then(nw.col("var_gbm2"))
            .otherwise(nw.lit(0))
            .alias("var_gbm2")
        ),
        Variable.PrePost.Function(
            recalculate_interaction,
            parameters=dict(
                var1="var_gbm1",
                var2="var_gbm2",
                name="var_gbm12"
            ),
        ),
        Variable.PrePost.Function(
            square_var,
            parameters=dict(
                var_to_square="var_gbm2", 
                name="var_gbm2_sq"
            )
        ),
    ]
)

vars_impute.append(v_gbm2)




logger.info("Now do one with the quantile-regression lightgbm")
logger.info("   To do this, pass quantiles and set objective='quantile'")
parameters_lgbm3 = Parameters.LightGBM(
    tune=True,
    tune_hyperparameter_path=f"{config.data_root}/tuner_outputs",
    tuner=tuner,
    tune_overwrite=True,
    quantiles=[0.25,0.5,0.75],
    parameters={
        "objective": "quantile",
        "num_leaves": 32,
        "min_data_in_leaf": 20,
        "num_iterations": 100,
        "test_size": 0.2,
        "boosting": "gbdt",
        "categorical_feature": ["var5"],
        "verbose": -1,  # ,
    },
    error=Parameters.ErrorDraw.pmm,
)

v_gbm3 = Variable(
    impute_var="var_gbm3",
    Where=nw.col("var_gbm1"),
    #   Needed in case var_gbm1 changes between iterations
    Where_predict=(nw.col("var_gbm3") != 0),
    model=["var_*", "var4", "var3", "var5", "unrelated_*", "repeat_*"],
    modeltype=Variable.ModelType.LightGBM,
    parameters=parameters_lgbm3,
    postFunctions=[
        (
            nw.when(nw.col("var_gbm1"))
            .then(nw.col("var_gbm3"))
            .otherwise(nw.lit(0))
            .alias("var_gbm3")
        )
    ]
)

vars_impute.append(v_gbm3)

Impute the boolean variable (var_gbm1)

   to the default setup for predicted mean matching

   using lightgbm

   (you can pass a formula, but you don't need to)

First, set up the lightgbm parameters

   This says, do hyperparameter tuning first (tune)

   Redo it at each run (tune_overwrite)

   And sets the lightgbm parameter defaults (parameters) that the tuning can overwrite

Actually define the variable and the model

Add the variable to the list to be imputed

logger.info("Set up the imputation")
srmi = SRMI(
    df=df,
    variables=vars_impute,
    n_implicates=2,
    n_iterations=2,
    parallel=False,
    index=["index"],
    modeltype=Variable.ModelType.pmm,
    bayesian_bootstrap=True,
    path_model=f"{config.path_temp_files}/py_srmi_test_gbm",
    force_start=True,
)

Set up the imputation

Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/py_srmi_test_gbm.srmi

logger.info("Run it")
srmi.run()

logger.info("It's automatically saved and can be loaded with (see path_model above):")
logger.info("path_model = f'{config.path_temp_files}/py_srmi_test_gbm'")
logger.info("srmi = SRMI.load(path_model)")

Run it

Variable selection before SRMI run, if necessary

     var_gbm1: Method.No

     var_gbm2: Method.No

     var_gbm3: Method.No

Hyperparameter tuning before SRMI run, if necessary

[I 2025-11-07 14:22:32,575] Trial 0 finished with value: 0.006293613988163579 and parameters: {'num_leaves': 38, 'max_depth': 177, 'min_data_in_leaf': 81, 'num_iterations': 94, 'bagging_fraction': 0.6071214413167514, 'bagging_freq': 2}. Best is trial 0 with value: 0.006293613988163579.

[I 2025-11-07 14:22:32,626] Trial 1 finished with value: 0.02030944315858492 and parameters: {'num_leaves': 83, 'max_depth': 27, 'min_data_in_leaf': 177, 'num_iterations': 41, 'bagging_fraction': 0.9889998163277448, 'bagging_freq': 3}. Best is trial 0 with value: 0.006293613988163579.

[I 2025-11-07 14:22:32,679] Trial 2 finished with value: 0.0079340733621239 and parameters: {'num_leaves': 12, 'max_depth': 158, 'min_data_in_leaf': 93, 'num_iterations': 77, 'bagging_fraction': 0.6507690841429046, 'bagging_freq': 1}. Best is trial 0 with value: 0.006293613988163579.

[I 2025-11-07 14:22:32,705] Trial 3 finished with value: 0.05234521224018576 and parameters: {'num_leaves': 82, 'max_depth': 230, 'min_data_in_leaf': 248, 'num_iterations': 30, 'bagging_fraction': 0.6575054481666343, 'bagging_freq': 5}. Best is trial 0 with value: 0.006293613988163579.

logger.info("Get the results")
_ = df_list = srmi.df_implicates

Get the results

logger.info("\n\nLook at the original")
_ = summary(df_original,detailed=True,drb_round=True)

logger.info("\n\nLook at the imputes")
_ = df_list.pipe(summary,detailed=True,drb_round=True)

logger.info("\n\nLook at the imputes | var_gbm1 == 0")
_ = df_list.filter(~nw.col("var_gbm1")).pipe(summary,detailed=True,drb_round=True)

logger.info("\n\nLook at the imputes | var_gbm1 == 1")
_ = df_list.filter(nw.col("var_gbm1")).pipe(summary,detailed=True,drb_round=True)


Look at the original

┌──────────────┬────────┬─────────────┬─────────┬─────────┬───────────┬─────────┬─────────┬─────────┬─────────┐
│     Variable ┆      n ┆ n (missing) ┆    mean ┆     std ┆       min ┆     q25 ┆     q50 ┆     q75 ┆     max │
╞══════════════╪════════╪═════════════╪═════════╪═════════╪═══════════╪═════════╪═════════╪═════════╪═════════╡
│  _row_index_ ┆ 10,000 ┆           0 ┆ 5,000.0 ┆ 2,887.0 ┆       0.0 ┆ 2,500.0 ┆ 5,000.0 ┆ 7,500.0 ┆ 9,999.0 │
│        index ┆ 10,000 ┆           0 ┆ 5,000.0 ┆ 2,887.0 ┆       0.0 ┆ 2,500.0 ┆ 5,000.0 ┆ 7,500.0 ┆ 9,999.0 │
│         year ┆ 10,000 ┆           0 ┆ 2,018.0 ┆   1.416 ┆   2,016.0 ┆ 2,017.0 ┆ 2,018.0 ┆ 2,019.0 ┆ 2,020.0 │
│        month ┆ 10,000 ┆           0 ┆   6.514 ┆   3.432 ┆       1.0 ┆     4.0 ┆     6.0 ┆     9.0 ┆    12.0 │
│         var2 ┆ 10,000 ┆           0 ┆   4.978 ┆   3.155 ┆       0.0 ┆     2.0 ┆     5.0 ┆     8.0 ┆    10.0 │
│         var3 ┆ 10,000 ┆           0 ┆   25.11 ┆   14.75 ┆       0.0 ┆    12.0 ┆    25.0 ┆    38.0 ┆    50.0 │
│         var4 ┆ 10,000 ┆           0 ┆  0.5057 ┆  0.2879 ┆  0.000027 ┆  0.2559 ┆  0.5086 ┆  0.7543 ┆     1.0 │
│         var5 ┆ 10,000 ┆           0 ┆  0.4999 ┆     0.5 ┆       0.0 ┆     0.0 ┆     0.0 ┆     1.0 ┆     1.0 │
│  unrelated_1 ┆ 10,000 ┆           0 ┆  0.5024 ┆  0.2884 ┆ 0.0001191 ┆   0.253 ┆  0.5037 ┆   0.753 ┆     1.0 │
│  unrelated_2 ┆ 10,000 ┆           0 ┆  0.5001 ┆  0.2876 ┆  0.000049 ┆  0.2531 ┆  0.4975 ┆  0.7487 ┆  0.9995 │
│  unrelated_3 ┆ 10,000 ┆           0 ┆  0.4992 ┆  0.2888 ┆  0.000129 ┆  0.2513 ┆  0.4961 ┆  0.7508 ┆  0.9999 │
│  unrelated_4 ┆ 10,000 ┆           0 ┆  0.5007 ┆  0.2887 ┆ 0.0001329 ┆  0.2501 ┆  0.5006 ┆   0.752 ┆     1.0 │
│  unrelated_5 ┆ 10,000 ┆           0 ┆  0.4988 ┆   0.289 ┆  0.000071 ┆  0.2496 ┆  0.4969 ┆    0.75 ┆  0.9999 │
│ missing_gbm1 ┆ 10,000 ┆           0 ┆  0.5026 ┆   0.289 ┆  0.000006 ┆  0.2517 ┆  0.5074 ┆  0.7523 ┆     1.0 │
│ missing_gbm2 ┆ 10,000 ┆           0 ┆  0.5032 ┆  0.2905 ┆  0.000005 ┆  0.2531 ┆  0.5041 ┆   0.756 ┆     1.0 │
│ missing_gbm3 ┆ 10,000 ┆           0 ┆  0.4939 ┆  0.2907 ┆ 0.0001079 ┆  0.2408 ┆  0.4907 ┆  0.7412 ┆  0.9998 │
│     var_gbm1 ┆ 10,000 ┆           0 ┆  0.5229 ┆  0.4995 ┆       0.0 ┆     0.0 ┆     1.0 ┆     1.0 ┆     1.0 │
│     var_gbm2 ┆ 10,000 ┆           0 ┆  -2.402 ┆   10.33 ┆    -55.35 ┆  -3.023 ┆     0.0 ┆     0.0 ┆   26.21 │
│     var_gbm3 ┆ 10,000 ┆           0 ┆  -2.402 ┆   10.33 ┆    -55.35 ┆  -3.023 ┆     0.0 ┆     0.0 ┆   26.21 │
└──────────────┴────────┴─────────────┴─────────┴─────────┴───────────┴─────────┴─────────┴─────────┴─────────┘


Look at the imputes

┌─────────────┬────────┬─────────────┬─────────┬─────────┬───────────┬─────────┬─────────┬─────────┬─────────┐
│    Variable ┆      n ┆ n (missing) ┆    mean ┆     std ┆       min ┆     q25 ┆     q50 ┆     q75 ┆     max │
╞═════════════╪════════╪═════════════╪═════════╪═════════╪═══════════╪═════════╪═════════╪═════════╪═════════╡
│       index ┆ 10,000 ┆           0 ┆ 5,000.0 ┆ 2,887.0 ┆       0.0 ┆ 2,500.0 ┆ 5,000.0 ┆ 7,500.0 ┆ 9,999.0 │
│ _row_index_ ┆ 10,000 ┆           0 ┆ 5,000.0 ┆ 2,887.0 ┆       0.0 ┆ 2,500.0 ┆ 5,000.0 ┆ 7,500.0 ┆ 9,999.0 │
│        year ┆ 10,000 ┆           0 ┆ 2,018.0 ┆   1.416 ┆   2,016.0 ┆ 2,017.0 ┆ 2,018.0 ┆ 2,019.0 ┆ 2,020.0 │
│       month ┆ 10,000 ┆           0 ┆   6.514 ┆   3.432 ┆       1.0 ┆     4.0 ┆     6.0 ┆     9.0 ┆    12.0 │
│        var2 ┆ 10,000 ┆           0 ┆   4.978 ┆   3.155 ┆       0.0 ┆     2.0 ┆     5.0 ┆     8.0 ┆    10.0 │
│        var3 ┆ 10,000 ┆           0 ┆   25.11 ┆   14.75 ┆       0.0 ┆    12.0 ┆    25.0 ┆    38.0 ┆    50.0 │
│        var4 ┆ 10,000 ┆           0 ┆  0.5057 ┆  0.2879 ┆  0.000027 ┆  0.2559 ┆  0.5086 ┆  0.7543 ┆     1.0 │
│        var5 ┆ 10,000 ┆           0 ┆  0.4999 ┆     0.5 ┆       0.0 ┆     0.0 ┆     0.0 ┆     1.0 ┆     1.0 │
│ unrelated_1 ┆ 10,000 ┆           0 ┆  0.5024 ┆  0.2884 ┆ 0.0001191 ┆   0.253 ┆  0.5037 ┆   0.753 ┆     1.0 │
│ unrelated_2 ┆ 10,000 ┆           0 ┆  0.5001 ┆  0.2876 ┆  0.000049 ┆  0.2531 ┆  0.4975 ┆  0.7487 ┆  0.9995 │
│ unrelated_3 ┆ 10,000 ┆           0 ┆  0.4992 ┆  0.2888 ┆  0.000129 ┆  0.2513 ┆  0.4961 ┆  0.7508 ┆  0.9999 │
│ unrelated_4 ┆ 10,000 ┆           0 ┆  0.5007 ┆  0.2887 ┆ 0.0001329 ┆  0.2501 ┆  0.5006 ┆   0.752 ┆     1.0 │
│ unrelated_5 ┆ 10,000 ┆           0 ┆  0.4988 ┆   0.289 ┆  0.000071 ┆  0.2496 ┆  0.4969 ┆    0.75 ┆  0.9999 │
│    repeat_1 ┆ 10,000 ┆           0 ┆  0.5024 ┆  0.2884 ┆ 0.0001191 ┆   0.253 ┆  0.5037 ┆   0.753 ┆     1.0 │
│    var_gbm1 ┆ 10,000 ┆           0 ┆  0.5272 ┆  0.4993 ┆       0.0 ┆     0.0 ┆     1.0 ┆     1.0 ┆     1.0 │
│    var_gbm2 ┆ 10,000 ┆           0 ┆  -2.406 ┆    10.2 ┆    -55.35 ┆  -2.753 ┆     0.0 ┆     0.0 ┆   26.21 │
│    var_gbm3 ┆ 10,000 ┆           0 ┆  -2.363 ┆   10.15 ┆    -55.35 ┆  -2.785 ┆     0.0 ┆     0.0 ┆   26.21 │
│   var_gbm12 ┆ 10,000 ┆           0 ┆  -2.406 ┆    10.2 ┆    -55.35 ┆  -2.753 ┆     0.0 ┆     0.0 ┆   26.21 │
│ var_gbm2_sq ┆ 10,000 ┆           0 ┆   109.8 ┆   271.1 ┆       0.0 ┆     0.0 ┆  0.2379 ┆   80.29 ┆ 3,064.0 │
└─────────────┴────────┴─────────────┴─────────┴─────────┴───────────┴─────────┴─────────┴─────────┴─────────┘

┌─────────────┬────────┬─────────────┬─────────┬─────────┬───────────┬─────────┬─────────┬─────────┬─────────┐
│    Variable ┆      n ┆ n (missing) ┆    mean ┆     std ┆       min ┆     q25 ┆     q50 ┆     q75 ┆     max │
╞═════════════╪════════╪═════════════╪═════════╪═════════╪═══════════╪═════════╪═════════╪═════════╪═════════╡
│       index ┆ 10,000 ┆           0 ┆ 5,000.0 ┆ 2,887.0 ┆       0.0 ┆ 2,500.0 ┆ 5,000.0 ┆ 7,500.0 ┆ 9,999.0 │
│ _row_index_ ┆ 10,000 ┆           0 ┆ 5,000.0 ┆ 2,887.0 ┆       0.0 ┆ 2,500.0 ┆ 5,000.0 ┆ 7,500.0 ┆ 9,999.0 │
│        year ┆ 10,000 ┆           0 ┆ 2,018.0 ┆   1.416 ┆   2,016.0 ┆ 2,017.0 ┆ 2,018.0 ┆ 2,019.0 ┆ 2,020.0 │
│       month ┆ 10,000 ┆           0 ┆   6.514 ┆   3.432 ┆       1.0 ┆     4.0 ┆     6.0 ┆     9.0 ┆    12.0 │
│        var2 ┆ 10,000 ┆           0 ┆   4.978 ┆   3.155 ┆       0.0 ┆     2.0 ┆     5.0 ┆     8.0 ┆    10.0 │
│        var3 ┆ 10,000 ┆           0 ┆   25.11 ┆   14.75 ┆       0.0 ┆    12.0 ┆    25.0 ┆    38.0 ┆    50.0 │
│        var4 ┆ 10,000 ┆           0 ┆  0.5057 ┆  0.2879 ┆  0.000027 ┆  0.2559 ┆  0.5086 ┆  0.7543 ┆     1.0 │
│        var5 ┆ 10,000 ┆           0 ┆  0.4999 ┆     0.5 ┆       0.0 ┆     0.0 ┆     0.0 ┆     1.0 ┆     1.0 │
│ unrelated_1 ┆ 10,000 ┆           0 ┆  0.5024 ┆  0.2884 ┆ 0.0001191 ┆   0.253 ┆  0.5037 ┆   0.753 ┆     1.0 │
│ unrelated_2 ┆ 10,000 ┆           0 ┆  0.5001 ┆  0.2876 ┆  0.000049 ┆  0.2531 ┆  0.4975 ┆  0.7487 ┆  0.9995 │
│ unrelated_3 ┆ 10,000 ┆           0 ┆  0.4992 ┆  0.2888 ┆  0.000129 ┆  0.2513 ┆  0.4961 ┆  0.7508 ┆  0.9999 │
│ unrelated_4 ┆ 10,000 ┆           0 ┆  0.5007 ┆  0.2887 ┆ 0.0001329 ┆  0.2501 ┆  0.5006 ┆   0.752 ┆     1.0 │
│ unrelated_5 ┆ 10,000 ┆           0 ┆  0.4988 ┆   0.289 ┆  0.000071 ┆  0.2496 ┆  0.4969 ┆    0.75 ┆  0.9999 │
│    repeat_1 ┆ 10,000 ┆           0 ┆  0.5024 ┆  0.2884 ┆ 0.0001191 ┆   0.253 ┆  0.5037 ┆   0.753 ┆     1.0 │
│    var_gbm1 ┆ 10,000 ┆           0 ┆  0.5276 ┆  0.4993 ┆       0.0 ┆     0.0 ┆     1.0 ┆     1.0 ┆     1.0 │
│    var_gbm2 ┆ 10,000 ┆           0 ┆  -2.334 ┆   10.15 ┆    -55.35 ┆  -2.737 ┆     0.0 ┆     0.0 ┆   26.21 │
│    var_gbm3 ┆ 10,000 ┆           0 ┆  -2.323 ┆    10.1 ┆    -55.35 ┆  -2.745 ┆     0.0 ┆     0.0 ┆   26.21 │
│   var_gbm12 ┆ 10,000 ┆           0 ┆  -2.334 ┆   10.15 ┆    -55.35 ┆  -2.737 ┆     0.0 ┆     0.0 ┆   26.21 │
│ var_gbm2_sq ┆ 10,000 ┆           0 ┆   108.4 ┆   266.6 ┆       0.0 ┆     0.0 ┆  0.1858 ┆   80.87 ┆ 3,064.0 │
└─────────────┴────────┴─────────────┴─────────┴─────────┴───────────┴─────────┴─────────┴─────────┴─────────┘


Look at the imputes | var_gbm1 == 0

┌─────────────┬───────┬─────────────┬─────────┬─────────┬───────────┬─────────┬─────────┬─────────┬─────────┐
│    Variable ┆     n ┆ n (missing) ┆    mean ┆     std ┆       min ┆     q25 ┆     q50 ┆     q75 ┆     max │
╞═════════════╪═══════╪═════════════╪═════════╪═════════╪═══════════╪═════════╪═════════╪═════════╪═════════╡
│       index ┆ 4,700 ┆           0 ┆ 5,060.0 ┆ 2,893.0 ┆       1.0 ┆ 2,572.0 ┆ 5,084.0 ┆ 7,631.0 ┆ 9,997.0 │
│ _row_index_ ┆ 4,700 ┆           0 ┆ 5,060.0 ┆ 2,893.0 ┆       1.0 ┆ 2,572.0 ┆ 5,084.0 ┆ 7,631.0 ┆ 9,997.0 │
│        year ┆ 4,700 ┆           0 ┆ 2,018.0 ┆   1.417 ┆   2,016.0 ┆ 2,017.0 ┆ 2,018.0 ┆ 2,019.0 ┆ 2,020.0 │
│       month ┆ 4,700 ┆           0 ┆   6.553 ┆   3.416 ┆       1.0 ┆     4.0 ┆     7.0 ┆     9.0 ┆    12.0 │
│        var2 ┆ 4,700 ┆           0 ┆   4.626 ┆   3.238 ┆       0.0 ┆     2.0 ┆     4.0 ┆     7.0 ┆    10.0 │
│        var3 ┆ 4,700 ┆           0 ┆   24.79 ┆   13.02 ┆       0.0 ┆    14.0 ┆    25.0 ┆    36.0 ┆    50.0 │
│        var4 ┆ 4,700 ┆           0 ┆  0.5086 ┆  0.2885 ┆  0.000027 ┆  0.2545 ┆  0.5123 ┆  0.7586 ┆     1.0 │
│        var5 ┆ 4,700 ┆           0 ┆  0.8997 ┆  0.3004 ┆       0.0 ┆     1.0 ┆     1.0 ┆     1.0 ┆     1.0 │
│ unrelated_1 ┆ 4,700 ┆           0 ┆  0.5033 ┆  0.2868 ┆ 0.0001191 ┆  0.2579 ┆  0.5035 ┆  0.7517 ┆     1.0 │
│ unrelated_2 ┆ 4,700 ┆           0 ┆   0.502 ┆  0.2866 ┆  0.000079 ┆   0.262 ┆  0.4973 ┆  0.7501 ┆  0.9995 │
│ unrelated_3 ┆ 4,700 ┆           0 ┆  0.5016 ┆  0.2895 ┆ 0.0001387 ┆  0.2521 ┆  0.4987 ┆   0.755 ┆  0.9999 │
│ unrelated_4 ┆ 4,700 ┆           0 ┆  0.5014 ┆  0.2898 ┆ 0.0001414 ┆  0.2506 ┆  0.5012 ┆  0.7568 ┆  0.9999 │
│ unrelated_5 ┆ 4,700 ┆           0 ┆  0.5018 ┆  0.2917 ┆  0.000071 ┆  0.2498 ┆  0.4999 ┆  0.7542 ┆  0.9998 │
│    repeat_1 ┆ 4,700 ┆           0 ┆  0.5033 ┆  0.2868 ┆ 0.0001191 ┆  0.2579 ┆  0.5035 ┆  0.7517 ┆     1.0 │
│    var_gbm1 ┆ 4,700 ┆           0 ┆     0.0 ┆     0.0 ┆       0.0 ┆     0.0 ┆     0.0 ┆     0.0 ┆     0.0 │
│    var_gbm2 ┆ 4,700 ┆           0 ┆     0.0 ┆     0.0 ┆       0.0 ┆     0.0 ┆     0.0 ┆     0.0 ┆     0.0 │
│    var_gbm3 ┆ 4,700 ┆           0 ┆     0.0 ┆     0.0 ┆       0.0 ┆     0.0 ┆     0.0 ┆     0.0 ┆     0.0 │
│   var_gbm12 ┆ 4,700 ┆           0 ┆     0.0 ┆     0.0 ┆       0.0 ┆     0.0 ┆     0.0 ┆     0.0 ┆     0.0 │
│ var_gbm2_sq ┆ 4,700 ┆           0 ┆     0.0 ┆     0.0 ┆       0.0 ┆     0.0 ┆     0.0 ┆     0.0 ┆     0.0 │
└─────────────┴───────┴─────────────┴─────────┴─────────┴───────────┴─────────┴─────────┴─────────┴─────────┘

┌─────────────┬───────┬─────────────┬─────────┬─────────┬───────────┬─────────┬─────────┬─────────┬─────────┐
│    Variable ┆     n ┆ n (missing) ┆    mean ┆     std ┆       min ┆     q25 ┆     q50 ┆     q75 ┆     max │
╞═════════════╪═══════╪═════════════╪═════════╪═════════╪═══════════╪═════════╪═════════╪═════════╪═════════╡
│       index ┆ 4,700 ┆           0 ┆ 5,066.0 ┆ 2,896.0 ┆       1.0 ┆ 2,576.0 ┆ 5,090.0 ┆ 7,645.0 ┆ 9,997.0 │
│ _row_index_ ┆ 4,700 ┆           0 ┆ 5,066.0 ┆ 2,896.0 ┆       1.0 ┆ 2,576.0 ┆ 5,090.0 ┆ 7,645.0 ┆ 9,997.0 │
│        year ┆ 4,700 ┆           0 ┆ 2,018.0 ┆   1.418 ┆   2,016.0 ┆ 2,017.0 ┆ 2,018.0 ┆ 2,019.0 ┆ 2,020.0 │
│       month ┆ 4,700 ┆           0 ┆   6.546 ┆   3.415 ┆       1.0 ┆     4.0 ┆     6.0 ┆     9.0 ┆    12.0 │
│        var2 ┆ 4,700 ┆           0 ┆   4.617 ┆   3.238 ┆       0.0 ┆     2.0 ┆     4.0 ┆     7.0 ┆    10.0 │
│        var3 ┆ 4,700 ┆           0 ┆   24.94 ┆   12.99 ┆       0.0 ┆    14.0 ┆    25.0 ┆    36.0 ┆    50.0 │
│        var4 ┆ 4,700 ┆           0 ┆  0.5088 ┆  0.2885 ┆  0.000027 ┆  0.2543 ┆  0.5137 ┆  0.7586 ┆     1.0 │
│        var5 ┆ 4,700 ┆           0 ┆  0.8984 ┆  0.3022 ┆       0.0 ┆     1.0 ┆     1.0 ┆     1.0 ┆     1.0 │
│ unrelated_1 ┆ 4,700 ┆           0 ┆  0.5032 ┆  0.2869 ┆ 0.0001191 ┆  0.2578 ┆  0.5029 ┆  0.7526 ┆     1.0 │
│ unrelated_2 ┆ 4,700 ┆           0 ┆  0.5017 ┆  0.2866 ┆  0.000079 ┆  0.2621 ┆  0.4961 ┆  0.7502 ┆  0.9995 │
│ unrelated_3 ┆ 4,700 ┆           0 ┆  0.5031 ┆  0.2897 ┆ 0.0001387 ┆  0.2534 ┆   0.501 ┆  0.7574 ┆  0.9999 │
│ unrelated_4 ┆ 4,700 ┆           0 ┆  0.5012 ┆  0.2898 ┆ 0.0001414 ┆  0.2504 ┆  0.5008 ┆  0.7558 ┆  0.9999 │
│ unrelated_5 ┆ 4,700 ┆           0 ┆  0.5016 ┆  0.2915 ┆  0.000071 ┆  0.2495 ┆  0.4995 ┆  0.7551 ┆  0.9998 │
│    repeat_1 ┆ 4,700 ┆           0 ┆  0.5032 ┆  0.2869 ┆ 0.0001191 ┆  0.2578 ┆  0.5029 ┆  0.7526 ┆     1.0 │
│    var_gbm1 ┆ 4,700 ┆           0 ┆     0.0 ┆     0.0 ┆       0.0 ┆     0.0 ┆     0.0 ┆     0.0 ┆     0.0 │
│    var_gbm2 ┆ 4,700 ┆           0 ┆     0.0 ┆     0.0 ┆       0.0 ┆     0.0 ┆     0.0 ┆     0.0 ┆     0.0 │
│    var_gbm3 ┆ 4,700 ┆           0 ┆     0.0 ┆     0.0 ┆       0.0 ┆     0.0 ┆     0.0 ┆     0.0 ┆     0.0 │
│   var_gbm12 ┆ 4,700 ┆           0 ┆     0.0 ┆     0.0 ┆       0.0 ┆     0.0 ┆     0.0 ┆     0.0 ┆     0.0 │
│ var_gbm2_sq ┆ 4,700 ┆           0 ┆     0.0 ┆     0.0 ┆       0.0 ┆     0.0 ┆     0.0 ┆     0.0 ┆     0.0 │
└─────────────┴───────┴─────────────┴─────────┴─────────┴───────────┴─────────┴─────────┴─────────┴─────────┘


Look at the imputes | var_gbm1 == 1

┌─────────────┬───────┬─────────────┬─────────┬─────────┬───────────┬─────────┬─────────┬─────────┬─────────┐
│    Variable ┆     n ┆ n (missing) ┆    mean ┆     std ┆       min ┆     q25 ┆     q50 ┆     q75 ┆     max │
╞═════════════╪═══════╪═════════════╪═════════╪═════════╪═══════════╪═════════╪═════════╪═════════╪═════════╡
│       index ┆ 5,300 ┆           0 ┆ 4,945.0 ┆ 2,880.0 ┆       0.0 ┆ 2,453.0 ┆ 4,920.0 ┆ 7,400.0 ┆ 9,999.0 │
│ _row_index_ ┆ 5,300 ┆           0 ┆ 4,945.0 ┆ 2,880.0 ┆       0.0 ┆ 2,453.0 ┆ 4,920.0 ┆ 7,400.0 ┆ 9,999.0 │
│        year ┆ 5,300 ┆           0 ┆ 2,018.0 ┆   1.415 ┆   2,016.0 ┆ 2,017.0 ┆ 2,018.0 ┆ 2,019.0 ┆ 2,020.0 │
│       month ┆ 5,300 ┆           0 ┆   6.479 ┆   3.446 ┆       1.0 ┆     4.0 ┆     6.0 ┆     9.0 ┆    12.0 │
│        var2 ┆ 5,300 ┆           0 ┆   5.294 ┆   3.044 ┆       0.0 ┆     3.0 ┆     5.0 ┆     8.0 ┆    10.0 │
│        var3 ┆ 5,300 ┆           0 ┆   25.39 ┆   16.15 ┆       0.0 ┆    10.0 ┆    26.0 ┆    40.0 ┆    50.0 │
│        var4 ┆ 5,300 ┆           0 ┆   0.503 ┆  0.2873 ┆ 0.0001044 ┆  0.2569 ┆  0.5071 ┆  0.7521 ┆  0.9999 │
│        var5 ┆ 5,300 ┆           0 ┆  0.1413 ┆  0.3484 ┆       0.0 ┆     0.0 ┆     0.0 ┆     0.0 ┆     1.0 │
│ unrelated_1 ┆ 5,300 ┆           0 ┆  0.5017 ┆  0.2898 ┆ 0.0002485 ┆  0.2506 ┆  0.5043 ┆  0.7562 ┆  0.9993 │
│ unrelated_2 ┆ 5,300 ┆           0 ┆  0.4984 ┆  0.2886 ┆  0.000049 ┆  0.2462 ┆  0.4976 ┆  0.7463 ┆  0.9995 │
│ unrelated_3 ┆ 5,300 ┆           0 ┆   0.497 ┆  0.2881 ┆  0.000129 ┆  0.2508 ┆  0.4932 ┆  0.7473 ┆  0.9999 │
│ unrelated_4 ┆ 5,300 ┆           0 ┆     0.5 ┆  0.2877 ┆ 0.0001329 ┆    0.25 ┆  0.4995 ┆  0.7475 ┆     1.0 │
│ unrelated_5 ┆ 5,300 ┆           0 ┆   0.496 ┆  0.2865 ┆ 0.0001807 ┆  0.2496 ┆  0.4938 ┆  0.7454 ┆  0.9999 │
│    repeat_1 ┆ 5,300 ┆           0 ┆  0.5017 ┆  0.2898 ┆ 0.0002485 ┆  0.2506 ┆  0.5043 ┆  0.7562 ┆  0.9993 │
│    var_gbm1 ┆ 5,300 ┆           0 ┆     1.0 ┆     0.0 ┆       1.0 ┆     1.0 ┆     1.0 ┆     1.0 ┆     1.0 │
│    var_gbm2 ┆ 5,300 ┆           0 ┆  -4.564 ┆   13.69 ┆    -55.35 ┆  -12.55 ┆  -1.855 ┆   5.651 ┆   26.21 │
│    var_gbm3 ┆ 5,300 ┆           0 ┆  -4.481 ┆   13.63 ┆    -55.35 ┆  -12.44 ┆  -1.832 ┆   5.733 ┆   26.21 │
│   var_gbm12 ┆ 5,300 ┆           0 ┆  -4.564 ┆   13.69 ┆    -55.35 ┆  -12.55 ┆  -1.855 ┆   5.651 ┆   26.21 │
│ var_gbm2_sq ┆ 5,300 ┆           0 ┆   208.3 ┆   344.9 ┆       0.0 ┆   14.31 ┆   72.18 ┆   220.9 ┆ 3,064.0 │
└─────────────┴───────┴─────────────┴─────────┴─────────┴───────────┴─────────┴─────────┴─────────┴─────────┘