In [1]:
import sys
import os
from pathlib import Path

import narwhals as nw
import polars as pl
import polars.selectors as cs

from survey_kit.utilities.random import RandomData
from survey_kit.utilities.dataframe import summary

from survey_kit.imputation.variable import Variable
from survey_kit.imputation.parameters import Parameters
from survey_kit.imputation.srmi import SRMI
from survey_kit.imputation.selection import Selection

from survey_kit import logger, config
from survey_kit.utilities.dataframe import summary, columns_from_list
from survey_kit.utilities.formula_builder import FormulaBuilder


path = Path(config.code_root)
sys.path.append(os.path.normpath(path.parent.parent / "tests"))
from scratch import path_scratch


config.data_root = path_scratch(temp_file_suffix=False)
In [2]:
# Draw some random data

n_rows = 10_000
impute_share = 0.25


df = (
    RandomData(n_rows=n_rows, seed=32565437)
    .index("index")
    .integer("year", 2016, 2020)
    .integer("month", 1, 12)
    .integer("var2", 0, 10)
    .integer("var3", 0, 50)
    .float("var4", 0, 1)
    .integer("var5", 0, 1)
    .float("unrelated_1", 0, 1)
    .float("unrelated_2", 0, 1)
    .float("unrelated_3", 0, 1)
    .float("unrelated_4", 0, 1)
    .float("unrelated_5", 0, 1)
    .np_distribution("epsilon_reg1", "normal", scale=5)
    .np_distribution("epsilon_reg2", "normal", scale=5)
    .float("missing_reg1", 0, 1)
    .float("missing_reg2", 0, 1)
    .to_df()
)


#   Convenience references to them for creating dependent variables
c_var2 = pl.col("var2")
c_var3 = pl.col("var3")
c_var4 = pl.col("var4")
c_var5 = pl.col("var5")

c_e_reg1 = pl.col("epsilon_reg1")
c_e_reg2 = pl.col("epsilon_reg2")


#   Convenience references to them for creating dependent variables
c_var2 = pl.col("var2")
c_var3 = pl.col("var3")
c_var4 = pl.col("var4")
c_var5 = pl.col("var5")


logger.info("var_reg1 is binary and conditional on other variables")
c_reg1 = ((c_var2 * 2 - c_var3 * 3 * c_var5 + c_e_reg1)  > 0).alias("var_reg1")

logger.info("var_reg2 is != 0 only if var_reg1 == True")
c_reg2 = (
    pl.when(pl.col("var_reg1"))
      .then(((c_var2 * 1.5 - c_var3 * 1 * c_var4 + c_e_reg2)))
      .otherwise(pl.lit(0))
      .alias("var_reg2")
)
#   Create a bunch of variables that are functions of the variables created above
df = (
    df.with_columns(c_reg1)
    .with_columns(c_reg2)
    .drop(columns_from_list(df=df, columns="epsilon*"))
    .with_row_index(name="_row_index_")
)

df_original = df

#   Set variables to missing according to the uniform random variables missing_
clear_missing = []
for prefixi in ["reg"]:
    for i in range(1, 3):
        vari = f"var_{prefixi}{i}"
        missingi = f"missing_{prefixi}{i}"

        clear_missing.append(
            pl.when(pl.col(missingi) < impute_share)
            .then(pl.lit(None))
            .otherwise(pl.col(vari))
            .alias(vari)
        )
df = df.with_columns(clear_missing).drop(cs.starts_with("missing_"))

#   Make a fully collinear var for testing
df = df.with_columns(pl.col("unrelated_1").alias("repeat_1"))


summary(df)


#   Actually do the imputation
var_reg1 is binary and conditional on other variables
var_reg2 is != 0 only if var_reg1 == True
┌─────────────┬────────┬─────────────┬────────────┬─────────────┬────────────┬───────────┐
│    Variable ┆      n ┆ n (missing) ┆       mean ┆         std ┆        min ┆       max │
╞═════════════╪════════╪═════════════╪════════════╪═════════════╪════════════╪═══════════╡
│ _row_index_ ┆ 10,000 ┆           0 ┆    4,999.5 ┆ 2,886.89568 ┆        0.0 ┆   9,999.0 │
│       index ┆ 10,000 ┆           0 ┆    4,999.5 ┆ 2,886.89568 ┆        0.0 ┆   9,999.0 │
│        year ┆ 10,000 ┆           0 ┆ 2,017.9851 ┆    1.415937 ┆    2,016.0 ┆   2,020.0 │
│       month ┆ 10,000 ┆           0 ┆     6.5137 ┆    3.432141 ┆        1.0 ┆      12.0 │
│        var2 ┆ 10,000 ┆           0 ┆     4.9782 ┆    3.154508 ┆        0.0 ┆      10.0 │
│        var3 ┆ 10,000 ┆           0 ┆    25.1084 ┆   14.752302 ┆        0.0 ┆      50.0 │
│        var4 ┆ 10,000 ┆           0 ┆   0.505666 ┆    0.287861 ┆   0.000027 ┆  0.999997 │
│        var5 ┆ 10,000 ┆           0 ┆     0.4999 ┆    0.500025 ┆        0.0 ┆       1.0 │
│ unrelated_1 ┆ 10,000 ┆           0 ┆   0.502449 ┆    0.288359 ┆   0.000119 ┆  0.999997 │
│ unrelated_2 ┆ 10,000 ┆           0 ┆   0.500105 ┆    0.287638 ┆   0.000049 ┆  0.999539 │
│ unrelated_3 ┆ 10,000 ┆           0 ┆   0.499175 ┆     0.28876 ┆   0.000129 ┆   0.99994 │
│ unrelated_4 ┆ 10,000 ┆           0 ┆   0.500655 ┆    0.288698 ┆   0.000133 ┆  0.999972 │
│ unrelated_5 ┆ 10,000 ┆           0 ┆    0.49876 ┆    0.288979 ┆   0.000071 ┆  0.999867 │
│    var_reg1 ┆ 10,000 ┆       2,569 ┆   0.523079 ┆    0.499501 ┆        0.0 ┆       1.0 │
│    var_reg2 ┆ 10,000 ┆       2,476 ┆  -2.353019 ┆   10.232008 ┆ -55.354108 ┆ 26.213084 │
│    repeat_1 ┆ 10,000 ┆           0 ┆   0.502449 ┆    0.288359 ┆   0.000119 ┆  0.999997 │
└─────────────┴────────┴─────────────┴────────────┴─────────────┴────────────┴───────────┘
Out[2]:
naive plan: (run LazyFrame.explain(optimized=True) to see the optimized plan)

SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")]

UNION

PLAN 0:

WITH_COLUMNS:

[col("n (missing)").cast(Int16), col("min").strict_cast(Float64), col("max").strict_cast(Float64)]

SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")]

WITH_COLUMNS:

["_row_index_".alias("Variable")]

SELECT [col("max"), col("n"), col("std"), col("min"), col("mean"), col("n (missing)")]

SELECT [col("___index___"), col("_row_index__max").alias("max"), col("_row_index__rawn").alias("n"), col("_row_index__std").alias("std"), col("_row_index__min").alias("min"), col("_row_index__mean").alias("mean"), col("_row_index__rawn_missing").alias("n (missing)")]

SELECT [col("___index___"), col("_row_index__max"), col("_row_index__rawn"), col("_row_index__std"), col("_row_index__min"), col("_row_index__mean"), col("_row_index__rawn_missing")]

DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */97 COLUMNS

PLAN 1:

WITH_COLUMNS:

[col("n (missing)").cast(Int16), col("min").strict_cast(Float64), col("max").strict_cast(Float64)]

SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")]

WITH_COLUMNS:

["index".alias("Variable")]

SELECT [col("max"), col("n"), col("std"), col("min"), col("mean"), col("n (missing)")]

SELECT [col("___index___"), col("index_max").alias("max"), col("index_rawn").alias("n"), col("index_std").alias("std"), col("index_min").alias("min"), col("index_mean").alias("mean"), col("index_rawn_missing").alias("n (missing)")]

SELECT [col("___index___"), col("index_max"), col("index_rawn"), col("index_std"), col("index_min"), col("index_mean"), col("index_rawn_missing")]

DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */97 COLUMNS

PLAN 2:

WITH_COLUMNS:

[col("n (missing)").cast(Int16), col("min").strict_cast(Float64), col("max").strict_cast(Float64)]

SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")]

WITH_COLUMNS:

["year".alias("Variable")]

SELECT [col("max"), col("n"), col("std"), col("min"), col("mean"), col("n (missing)")]

SELECT [col("___index___"), col("year_max").alias("max"), col("year_rawn").alias("n"), col("year_std").alias("std"), col("year_min").alias("min"), col("year_mean").alias("mean"), col("year_rawn_missing").alias("n (missing)")]

SELECT [col("___index___"), col("year_max"), col("year_rawn"), col("year_std"), col("year_min"), col("year_mean"), col("year_rawn_missing")]

DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */97 COLUMNS

PLAN 3:

WITH_COLUMNS:

[col("n (missing)").cast(Int16), col("min").strict_cast(Float64), col("max").strict_cast(Float64)]

SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")]

WITH_COLUMNS:

["month".alias("Variable")]

SELECT [col("max"), col("n"), col("std"), col("min"), col("mean"), col("n (missing)")]

SELECT [col("___index___"), col("month_max").alias("max"), col("month_rawn").alias("n"), col("month_std").alias("std"), col("month_min").alias("min"), col("month_mean").alias("mean"), col("month_rawn_missing").alias("n (missing)")]

SELECT [col("___index___"), col("month_max"), col("month_rawn"), col("month_std"), col("month_min"), col("month_mean"), col("month_rawn_missing")]

DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */97 COLUMNS

PLAN 4:

WITH_COLUMNS:

[col("n (missing)").cast(Int16), col("min").strict_cast(Float64), col("max").strict_cast(Float64)]

SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")]

WITH_COLUMNS:

["var2".alias("Variable")]

SELECT [col("max"), col("n"), col("std"), col("min"), col("mean"), col("n (missing)")]

SELECT [col("___index___"), col("var2_max").alias("max"), col("var2_rawn").alias("n"), col("var2_std").alias("std"), col("var2_min").alias("min"), col("var2_mean").alias("mean"), col("var2_rawn_missing").alias("n (missing)")]

SELECT [col("___index___"), col("var2_max"), col("var2_rawn"), col("var2_std"), col("var2_min"), col("var2_mean"), col("var2_rawn_missing")]

DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */97 COLUMNS

PLAN 5:

WITH_COLUMNS:

[col("n (missing)").cast(Int16), col("min").strict_cast(Float64), col("max").strict_cast(Float64)]

SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")]

WITH_COLUMNS:

["var3".alias("Variable")]

SELECT [col("max"), col("n"), col("std"), col("min"), col("mean"), col("n (missing)")]

SELECT [col("___index___"), col("var3_max").alias("max"), col("var3_rawn").alias("n"), col("var3_std").alias("std"), col("var3_min").alias("min"), col("var3_mean").alias("mean"), col("var3_rawn_missing").alias("n (missing)")]

SELECT [col("___index___"), col("var3_max"), col("var3_rawn"), col("var3_std"), col("var3_min"), col("var3_mean"), col("var3_rawn_missing")]

DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */97 COLUMNS

PLAN 6:

WITH_COLUMNS:

[col("n (missing)").cast(Int16)]

SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")]

WITH_COLUMNS:

["var4".alias("Variable")]

SELECT [col("max"), col("n"), col("std"), col("min"), col("mean"), col("n (missing)")]

SELECT [col("___index___"), col("var4_max").alias("max"), col("var4_rawn").alias("n"), col("var4_std").alias("std"), col("var4_min").alias("min"), col("var4_mean").alias("mean"), col("var4_rawn_missing").alias("n (missing)")]

SELECT [col("___index___"), col("var4_max"), col("var4_rawn"), col("var4_std"), col("var4_min"), col("var4_mean"), col("var4_rawn_missing")]

DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */97 COLUMNS

PLAN 7:

WITH_COLUMNS:

[col("n (missing)").cast(Int16), col("min").strict_cast(Float64), col("max").strict_cast(Float64)]

SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")]

WITH_COLUMNS:

["var5".alias("Variable")]

SELECT [col("max"), col("n"), col("std"), col("min"), col("mean"), col("n (missing)")]

SELECT [col("___index___"), col("var5_max").alias("max"), col("var5_rawn").alias("n"), col("var5_std").alias("std"), col("var5_min").alias("min"), col("var5_mean").alias("mean"), col("var5_rawn_missing").alias("n (missing)")]

SELECT [col("___index___"), col("var5_max"), col("var5_rawn"), col("var5_std"), col("var5_min"), col("var5_mean"), col("var5_rawn_missing")]

DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */97 COLUMNS

PLAN 8:

WITH_COLUMNS:

[col("n (missing)").cast(Int16)]

SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")]

WITH_COLUMNS:

["unrelated_1".alias("Variable")]

SELECT [col("max"), col("n"), col("std"), col("min"), col("mean"), col("n (missing)")]

SELECT [col("___index___"), col("unrelated_1_max").alias("max"), col("unrelated_1_rawn").alias("n"), col("unrelated_1_std").alias("std"), col("unrelated_1_min").alias("min"), col("unrelated_1_mean").alias("mean"), col("unrelated_1_rawn_missing").alias("n (missing)")]

SELECT [col("___index___"), col("unrelated_1_max"), col("unrelated_1_rawn"), col("unrelated_1_std"), col("unrelated_1_min"), col("unrelated_1_mean"), col("unrelated_1_rawn_missing")]

DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */97 COLUMNS

PLAN 9:

WITH_COLUMNS:

[col("n (missing)").cast(Int16)]

SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")]

WITH_COLUMNS:

["unrelated_2".alias("Variable")]

SELECT [col("max"), col("n"), col("std"), col("min"), col("mean"), col("n (missing)")]

SELECT [col("___index___"), col("unrelated_2_max").alias("max"), col("unrelated_2_rawn").alias("n"), col("unrelated_2_std").alias("std"), col("unrelated_2_min").alias("min"), col("unrelated_2_mean").alias("mean"), col("unrelated_2_rawn_missing").alias("n (missing)")]

SELECT [col("___index___"), col("unrelated_2_max"), col("unrelated_2_rawn"), col("unrelated_2_std"), col("unrelated_2_min"), col("unrelated_2_mean"), col("unrelated_2_rawn_missing")]

DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */97 COLUMNS

PLAN 10:

WITH_COLUMNS:

[col("n (missing)").cast(Int16)]

SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")]

WITH_COLUMNS:

["unrelated_3".alias("Variable")]

SELECT [col("max"), col("n"), col("std"), col("min"), col("mean"), col("n (missing)")]

SELECT [col("___index___"), col("unrelated_3_max").alias("max"), col("unrelated_3_rawn").alias("n"), col("unrelated_3_std").alias("std"), col("unrelated_3_min").alias("min"), col("unrelated_3_mean").alias("mean"), col("unrelated_3_rawn_missing").alias("n (missing)")]

SELECT [col("___index___"), col("unrelated_3_max"), col("unrelated_3_rawn"), col("unrelated_3_std"), col("unrelated_3_min"), col("unrelated_3_mean"), col("unrelated_3_rawn_missing")]

DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */97 COLUMNS

PLAN 11:

WITH_COLUMNS:

[col("n (missing)").cast(Int16)]

SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")]

WITH_COLUMNS:

["unrelated_4".alias("Variable")]

SELECT [col("max"), col("n"), col("std"), col("min"), col("mean"), col("n (missing)")]

SELECT [col("___index___"), col("unrelated_4_max").alias("max"), col("unrelated_4_rawn").alias("n"), col("unrelated_4_std").alias("std"), col("unrelated_4_min").alias("min"), col("unrelated_4_mean").alias("mean"), col("unrelated_4_rawn_missing").alias("n (missing)")]

SELECT [col("___index___"), col("unrelated_4_max"), col("unrelated_4_rawn"), col("unrelated_4_std"), col("unrelated_4_min"), col("unrelated_4_mean"), col("unrelated_4_rawn_missing")]

DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */97 COLUMNS

PLAN 12:

WITH_COLUMNS:

[col("n (missing)").cast(Int16)]

SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")]

WITH_COLUMNS:

["unrelated_5".alias("Variable")]

SELECT [col("max"), col("n"), col("std"), col("min"), col("mean"), col("n (missing)")]

SELECT [col("___index___"), col("unrelated_5_max").alias("max"), col("unrelated_5_rawn").alias("n"), col("unrelated_5_std").alias("std"), col("unrelated_5_min").alias("min"), col("unrelated_5_mean").alias("mean"), col("unrelated_5_rawn_missing").alias("n (missing)")]

SELECT [col("___index___"), col("unrelated_5_max"), col("unrelated_5_rawn"), col("unrelated_5_std"), col("unrelated_5_min"), col("unrelated_5_mean"), col("unrelated_5_rawn_missing")]

DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */97 COLUMNS

PLAN 13:

WITH_COLUMNS:

[col("min").strict_cast(Float64), col("max").strict_cast(Float64)]

SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")]

WITH_COLUMNS:

["var_reg1".alias("Variable")]

SELECT [col("max"), col("n"), col("std"), col("min"), col("mean"), col("n (missing)")]

SELECT [col("___index___"), col("var_reg1_max").alias("max"), col("var_reg1_rawn").alias("n"), col("var_reg1_std").alias("std"), col("var_reg1_min").alias("min"), col("var_reg1_mean").alias("mean"), col("var_reg1_rawn_missing").alias("n (missing)")]

SELECT [col("___index___"), col("var_reg1_max"), col("var_reg1_rawn"), col("var_reg1_std"), col("var_reg1_min"), col("var_reg1_mean"), col("var_reg1_rawn_missing")]

DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */97 COLUMNS

PLAN 14:

SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")]

WITH_COLUMNS:

["var_reg2".alias("Variable")]

SELECT [col("max"), col("n"), col("std"), col("min"), col("mean"), col("n (missing)")]

SELECT [col("___index___"), col("var_reg2_max").alias("max"), col("var_reg2_rawn").alias("n"), col("var_reg2_std").alias("std"), col("var_reg2_min").alias("min"), col("var_reg2_mean").alias("mean"), col("var_reg2_rawn_missing").alias("n (missing)")]

SELECT [col("___index___"), col("var_reg2_max"), col("var_reg2_rawn"), col("var_reg2_std"), col("var_reg2_min"), col("var_reg2_mean"), col("var_reg2_rawn_missing")]

DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */97 COLUMNS

PLAN 15:

WITH_COLUMNS:

[col("n (missing)").cast(Int16)]

SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")]

WITH_COLUMNS:

["repeat_1".alias("Variable")]

SELECT [col("max"), col("n"), col("std"), col("min"), col("mean"), col("n (missing)")]

SELECT [col("___index___"), col("repeat_1_max").alias("max"), col("repeat_1_rawn").alias("n"), col("repeat_1_std").alias("std"), col("repeat_1_min").alias("min"), col("repeat_1_mean").alias("mean"), col("repeat_1_rawn_missing").alias("n (missing)")]

SELECT [col("___index___"), col("repeat_1_max"), col("repeat_1_rawn"), col("repeat_1_std"), col("repeat_1_min"), col("repeat_1_mean"), col("repeat_1_rawn_missing")]

DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */97 COLUMNS

END UNION
In [3]:
logger.info("Define the regression model (intentionally include some extraneous variables")

f_model = FormulaBuilder(df=df)
f_model.formula_with_varnames_in_brackets(
    "~1+{var_*}+var2+var4+var4*var3*C(var5)+{unrelated_*}+{repeat_*}"
)
logger.info(f_model.formula)
Define the regression model (intentionally include some extraneous variables
~1+var_reg1+var_reg2+var2+var4+var4*var3*C(var5)+unrelated_1+unrelated_2+unrelated_3+unrelated_4+unrelated_5+repeat_1
In [4]:
# Set up the variable to be imputed
vars_impute = []

logger.info("Impute the boolean variable (var_reg1)")
logger.info("   to the default setup for predicted mean matching")
logger.info("   using logit regression")
v_reg1 = Variable(
    impute_var="var_reg1",
    modeltype=Variable.ModelType.pmm,
    model=f_model.formula,
    parameters=Parameters.Regression(model=Parameters.RegressionModel.Logit)
)
logger.info("Add the variable to the list to be imputed")
vars_impute.append(v_reg1)

logger.info("Impute the continuous variable (var_reg2) ")
logger.info("   conditional on var_reg1, using narwhals (nw.col('var_reg1'))")
logger.info("   by setting the model type")
logger.info("   and the formula")
logger.info("   as well as a post-processing edit to set var_reg2=0 when var_reg1==0")
v_reg2 = Variable(
    impute_var="var_reg2",
    Where=nw.col("var_reg1"),
    modeltype=Variable.ModelType.pmm,
    model=f_model.formula,
    #   Default parameters
    parameters=Parameters.Regression(),
    postFunctions=(
        nw.when(nw.col("var_reg1"))
          .then(nw.col("var_reg2"))
          .otherwise(nw.lit(0))
          .alias("var_reg2")
    )
)

vars_impute.append(v_reg2)
Impute the boolean variable (var_reg1)
   to the default setup for predicted mean matching
   using logit regression
Add the variable to the list to be imputed
Impute the continuous variable (var_reg2) 
   conditional on var_reg1, using narwhals (nw.col('var_reg1'))
   by setting the model type
   and the formula
   as well as a post-processing edit to set var_reg2=0 when var_reg1==0
In [5]:
logger.info("Set up the imputation")
logger.info("Add LASSO selection before each imputation")
srmi = SRMI(
    df=df,
    variables=vars_impute,
    n_implicates=2,
    n_iterations=2,
    parallel=False,
    selection=Selection(method=Selection.Method.LASSO),
    modeltype=Variable.ModelType.pmm,
    model=f_model.formula,
    bayesian_bootstrap=True,
    path_model=f"{path_scratch()}/py_srmi_test_regression",
    force_start=True,
)
Set up the imputation
Add LASSO selection before each imputation
Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/py_srmi_test_regression.srmi
Dropping var_reg1 from formula
Dropping var_reg2 from formula
In [6]:
logger.info("Run it")
srmi.run()
Run it
Variable selection before SRMI run, if necessary
     var_reg1: Method.No
     var_reg2: Method.No
Hyperparameter tuning before SRMI run, if necessary
Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/py_srmi_test_regression.srmi/1.srmi.implicate
Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/py_srmi_test_regression.srmi/2.srmi.implicate
Dropping var_reg2 from formula
     Running variable selection: Method.LASSO
         Selected model: ~0+var2+C(var5)+repeat_1+unrelated_5+unrelated_3+unrelated_1+var3:C(var5)+var4:C(var5)+var4:var3:C(var5)+var3+var4+var4+var3
     Imputation using Logit regression with PMM matching
C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.venv\Lib\site-packages\sklearn\utils\validation.py:1406: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.venv\Lib\site-packages\sklearn\linear_model\_logistic.py:473: ConvergenceWarning: lbfgs failed to converge after 100 iteration(s) (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
R2 = 0.5353
┌──────────────────────────┬───────────┐
│                 Variable ┆      Beta │
╞══════════════════════════╪═══════════╡
│                     var2 ┆    0.2183 │
│           C(var5)[False] ┆      1.85 │
│            C(var5)[True] ┆    -2.315 │
│                 repeat_1 ┆   -0.1076 │
│              unrelated_5 ┆   -0.1383 │
│              unrelated_3 ┆   0.07106 │
│              unrelated_1 ┆   -0.1076 │
│                     var3 ┆  0.000017 │
│                     var4 ┆   0.08597 │
│     var3:C(var5)[T.True] ┆ -0.005871 │
│     var4:C(var5)[T.True] ┆   -0.6536 │
│ var4:var3:C(var5)[False] ┆ -0.009138 │
│  var4:var3:C(var5)[True] ┆   0.03426 │
│              _Intercept_ ┆   -0.4963 │
└──────────────────────────┴───────────┘
     Finding 10 nearest neighbors on ['___prediction']
     Randomly picking one and donating ['var_reg1']
     Most common matches: 
shape: (5, 2)
┌──────────────┬─────────┐
│ ___rownumber ┆ nDonors │
│ ---          ┆ ---     │
│ i16          ┆ i8      │
╞══════════════╪═════════╡
│ 1165         ┆ 4       │
│ 4847         ┆ 4       │
│ 6428         ┆ 4       │
│ 192          ┆ 3       │
│ 224          ┆ 3       │
└──────────────┴─────────┘


Post-imputation statistics for ['var_reg1']
    Where:          None
    Where (impute): col(___imp_missing_var_reg1_1)
┌──────────┬─────────┬───────┬──────────────┬────────┬──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐
│ Variable ┆ Imputed ┆     n ┆ n (not null) ┆   mean ┆ mean (not 0) ┆ std (not 0) ┆ q10 (not 0) ┆ q25 (not 0) ┆ q50 (not 0) ┆ q75 (not 0) ┆ q90 (not 0) ┆ min (not 0) ┆ max (not 0) │
╞══════════╪═════════╪═══════╪══════════════╪════════╪══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡
│ var_reg1 ┆         ┆ 10000 ┆        10000 ┆ 0.5283 ┆            1 ┆           0 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 │
│ var_reg1 ┆       0 ┆  7431 ┆         7431 ┆ 0.5231 ┆            1 ┆           0 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 │
│ var_reg1 ┆       1 ┆  2569 ┆         2569 ┆ 0.5434 ┆            1 ┆           0 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 │
└──────────┴─────────┴───────┴──────────────┴────────┴──────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘




     Running variable selection: Method.LASSO
         Selected model: ~0+var2+C(var5)+repeat_1+var_reg1+var4:var3+unrelated_4+unrelated_2+unrelated_1+var3:C(var5)+var4:C(var5)+var4:var3:C(var5)+var4+var3+var3+var4+var4+var3
     Imputation using OLS regression with PMM matching
R2 = 0.8111
┌───────────────────────────┬───────────┐
│                  Variable ┆      Beta │
╞═══════════════════════════╪═══════════╡
│                      var2 ┆     1.351 │
│            C(var5)[False] ┆    0.5782 │
│             C(var5)[True] ┆   -0.5782 │
│                  repeat_1 ┆   0.09655 │
│               unrelated_4 ┆ -0.001992 │
│               unrelated_2 ┆    0.4366 │
│               unrelated_1 ┆   0.09655 │
│                      var4 ┆    0.8138 │
│                      var3 ┆   0.01687 │
│                 var4:var3 ┆   -0.9972 │
│      var3:C(var5)[T.True] ┆  -0.04554 │
│      var4:C(var5)[T.True] ┆     4.446 │
│ var4:var3:C(var5)[T.True] ┆   0.04509 │
│               _Intercept_ ┆   -0.6182 │
└───────────────────────────┴───────────┘
     Finding 10 nearest neighbors on ['___prediction']
     Randomly picking one and donating ['var_reg2']
     Most common matches: 
shape: (5, 2)
┌──────────────┬─────────┐
│ ___rownumber ┆ nDonors │
│ ---          ┆ ---     │
│ i16          ┆ i8      │
╞══════════════╪═════════╡
│ 731          ┆ 4       │
│ 1282         ┆ 4       │
│ 4331         ┆ 4       │
│ 51           ┆ 3       │
│ 474          ┆ 3       │
└──────────────┴─────────┘


Post-imputation statistics for ['var_reg2']
    Where:          col(var_reg1)
    Where (impute): col(___imp_missing_var_reg2_2)
┌──────────┬─────────┬──────┬──────────────┬────────┬──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐
│ Variable ┆ Imputed ┆    n ┆ n (not null) ┆   mean ┆ mean (not 0) ┆ std (not 0) ┆ q10 (not 0) ┆ q25 (not 0) ┆ q50 (not 0) ┆ q75 (not 0) ┆ q90 (not 0) ┆ min (not 0) ┆ max (not 0) │
╞══════════╪═════════╪══════╪══════════════╪════════╪══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡
│ var_reg2 ┆         ┆ 5283 ┆         5283 ┆ -4.101 ┆       -4.367 ┆       13.72 ┆      -24.14 ┆      -12.96 ┆      -1.938 ┆       6.015 ┆       11.24 ┆      -55.35 ┆       26.21 │
│ var_reg2 ┆       0 ┆ 3965 ┆         3965 ┆ -4.199 ┆       -4.462 ┆       13.65 ┆      -24.07 ┆      -12.92 ┆      -2.104 ┆       5.767 ┆       11.04 ┆      -55.35 ┆       26.21 │
│ var_reg2 ┆       1 ┆ 1318 ┆         1318 ┆ -3.805 ┆       -4.081 ┆       13.94 ┆      -24.53 ┆      -13.14 ┆      -1.446 ┆       6.807 ┆       11.51 ┆      -55.35 ┆       24.89 │
└──────────┴─────────┴──────┴──────────────┴────────┴──────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘




Updating data according to narwhals expression: when_then(all_horizontal(col(var_reg1), ignore_nulls=False), col(var_reg2), lit(value=0, dtype=None)).alias(name=var_reg2)
[ 6.73066716e-02 -0.00000000e+00  0.00000000e+00  3.71367698e-01
 -0.00000000e+00 -1.07062599e-03 -0.00000000e+00 -3.72161168e-04
 -0.00000000e+00 -5.38546132e-03 -4.70160709e-07  0.00000000e+00
 -5.77949554e-03  1.71858786e-03  1.74925402e-02]
[ 2.55784315e+00 -2.60566968e-14  2.55667576e+00  0.00000000e+00
 -0.00000000e+00 -4.58348548e+00  5.69456439e-02  3.94256009e-02
  0.00000000e+00 -1.51974623e-02  0.00000000e+00  1.40407334e-04
 -9.66030779e+00  2.09934930e+00 -3.24711385e-01  4.41965182e+00]
Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/py_srmi_test_regression.srmi/1.srmi.implicate
     Running variable selection: Method.LASSO
         Selected model: ~0+var3+var4+var2+C(var5)+repeat_1+var_reg2+var4:var3+unrelated_5+unrelated_4+unrelated_3+unrelated_2+unrelated_1+var3:C(var5)+var4:C(var5)+var4:var3:C(var5)
     Imputation using Logit regression with PMM matching
C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.venv\Lib\site-packages\sklearn\utils\validation.py:1406: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.venv\Lib\site-packages\sklearn\linear_model\_logistic.py:473: ConvergenceWarning: lbfgs failed to converge after 100 iteration(s) (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
R2 = 0.6020
┌───────────────────────────┬──────────┐
│                  Variable ┆     Beta │
╞═══════════════════════════╪══════════╡
│                      var3 ┆ 0.004236 │
│                      var4 ┆   0.5434 │
│                      var2 ┆   0.2035 │
│            C(var5)[False] ┆     2.19 │
│             C(var5)[True] ┆   -2.078 │
│                  repeat_1 ┆  -0.0735 │
│                  var_reg2 ┆  -0.1184 │
│               unrelated_5 ┆  -0.3512 │
│               unrelated_4 ┆  -0.1332 │
│               unrelated_3 ┆  0.02857 │
│               unrelated_2 ┆  -0.2581 │
│               unrelated_1 ┆  -0.0735 │
│                 var4:var3 ┆ -0.09465 │
│      var3:C(var5)[T.True] ┆ -0.01237 │
│      var4:C(var5)[T.True] ┆  -0.5246 │
│ var4:var3:C(var5)[T.True] ┆  0.06802 │
│               _Intercept_ ┆   0.1145 │
└───────────────────────────┴──────────┘
     Finding 10 nearest neighbors on ['___prediction']
     Randomly picking one and donating ['var_reg1']
     Most common matches: 
shape: (5, 2)
┌──────────────┬─────────┐
│ ___rownumber ┆ nDonors │
│ ---          ┆ ---     │
│ i16          ┆ i8      │
╞══════════════╪═════════╡
│ 4154         ┆ 4       │
│ 275          ┆ 3       │
│ 489          ┆ 3       │
│ 916          ┆ 3       │
│ 1428         ┆ 3       │
└──────────────┴─────────┘


Post-imputation statistics for ['var_reg1']
    Where:          None
    Where (impute): col(___imp_missing_var_reg1_1)
┌──────────┬─────────┬───────┬──────────────┬────────┬──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐
│ Variable ┆ Imputed ┆     n ┆ n (not null) ┆   mean ┆ mean (not 0) ┆ std (not 0) ┆ q10 (not 0) ┆ q25 (not 0) ┆ q50 (not 0) ┆ q75 (not 0) ┆ q90 (not 0) ┆ min (not 0) ┆ max (not 0) │
╞══════════╪═════════╪═══════╪══════════════╪════════╪══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡
│ var_reg1 ┆         ┆ 12569 ┆        12569 ┆ 0.5261 ┆            1 ┆           0 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 │
│ var_reg1 ┆       0 ┆ 10000 ┆        10000 ┆ 0.5283 ┆            1 ┆           0 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 │
│ var_reg1 ┆       1 ┆  2569 ┆         2569 ┆ 0.5173 ┆            1 ┆           0 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 │
└──────────┴─────────┴───────┴──────────────┴────────┴──────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘




     Running variable selection: Method.LASSO
         Selected model: ~0+var3+var2+C(var5)+repeat_1+var_reg1+var4:var3+unrelated_5+unrelated_4+unrelated_3+unrelated_1+var3:C(var5)+var4:C(var5)+var4:var3:C(var5)+var4+var4+var4
     Imputation using OLS regression with PMM matching
R2 = 0.8037
┌───────────────────────────┬──────────┐
│                  Variable ┆     Beta │
╞═══════════════════════════╪══════════╡
│                      var3 ┆  0.03414 │
│                      var2 ┆    1.378 │
│            C(var5)[False] ┆   0.3067 │
│             C(var5)[True] ┆  -0.3067 │
│                  repeat_1 ┆   0.1819 │
│               unrelated_5 ┆ 0.003208 │
│               unrelated_4 ┆  -0.0218 │
│               unrelated_3 ┆   0.2923 │
│               unrelated_1 ┆   0.1819 │
│                      var4 ┆    1.279 │
│                 var4:var3 ┆   -1.014 │
│      var3:C(var5)[T.True] ┆  -0.0864 │
│      var4:C(var5)[T.True] ┆    3.087 │
│ var4:var3:C(var5)[T.True] ┆   0.1758 │
│               _Intercept_ ┆  -0.6994 │
└───────────────────────────┴──────────┘
     Finding 10 nearest neighbors on ['___prediction']
     Randomly picking one and donating ['var_reg2']
     Most common matches: 
shape: (5, 2)
┌──────────────┬─────────┐
│ ___rownumber ┆ nDonors │
│ ---          ┆ ---     │
│ i16          ┆ i8      │
╞══════════════╪═════════╡
│ 903          ┆ 3       │
│ 1374         ┆ 3       │
│ 1581         ┆ 3       │
│ 2288         ┆ 3       │
│ 3173         ┆ 3       │
└──────────────┴─────────┘


Post-imputation statistics for ['var_reg2']
    Where:          col(var_reg1)
    Where (impute): col(___imp_missing_var_reg2_2)
[-1.45298426e-01  7.94160626e-02  7.65124658e-03  5.31405328e-03
  4.05410599e-01 -3.55928686e-16 -3.28522843e-03 -3.08145353e-04
 -2.25157225e-03 -1.13798554e-03 -4.41409458e-03 -3.73075092e-06
 -1.55053474e-01  3.06222340e-03 -2.17534484e-02  1.17422292e-01]
[ 2.67119698e+00 -3.39673395e-14  2.48291904e+00  0.00000000e+00
 -4.94651969e-02 -4.89330719e+00  2.34282673e-02 -0.00000000e+00
 -1.82999899e-02 -3.46458337e-02 -5.33177612e-02  4.90561207e-04
 -9.50912160e+00  1.72184415e+00 -2.67490420e-01  5.26295423e+00]
┌──────────┬─────────┬──────┬──────────────┬────────┬──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐
│ Variable ┆ Imputed ┆    n ┆ n (not null) ┆   mean ┆ mean (not 0) ┆ std (not 0) ┆ q10 (not 0) ┆ q25 (not 0) ┆ q50 (not 0) ┆ q75 (not 0) ┆ q90 (not 0) ┆ min (not 0) ┆ max (not 0) │
╞══════════╪═════════╪══════╪══════════════╪════════╪══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡
│ var_reg2 ┆         ┆ 6522 ┆         6522 ┆  -4.08 ┆       -4.428 ┆       13.82 ┆      -24.55 ┆      -13.14 ┆      -1.997 ┆       6.061 ┆       11.33 ┆      -55.35 ┆       26.21 │
│ var_reg2 ┆       0 ┆ 5216 ┆         5216 ┆ -4.046 ┆       -4.382 ┆       13.73 ┆      -24.31 ┆      -12.96 ┆      -1.955 ┆       5.999 ┆       11.27 ┆      -55.35 ┆       26.21 │
│ var_reg2 ┆       1 ┆ 1306 ┆         1306 ┆ -4.216 ┆       -4.611 ┆       14.16 ┆      -25.21 ┆      -13.72 ┆      -2.414 ┆       6.428 ┆       11.75 ┆      -55.35 ┆       26.21 │
└──────────┴─────────┴──────┴──────────────┴────────┴──────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘




Updating data according to narwhals expression: when_then(all_horizontal(col(var_reg1), ignore_nulls=False), col(var_reg2), lit(value=0, dtype=None)).alias(name=var_reg2)

var_reg1

var_reg2

Final Estimates by Iteration
Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/py_srmi_test_regression.srmi/1.srmi.implicate
Dropping var_reg2 from formula
     Running variable selection: Method.LASSO
         Selected model: ~0+var2+C(var5)+repeat_1+unrelated_5+unrelated_3+unrelated_1+var3:C(var5)+var4:C(var5)+var4:var3:C(var5)+var3+var4+var4+var3
     Imputation using Logit regression with PMM matching
C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.venv\Lib\site-packages\sklearn\utils\validation.py:1406: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.venv\Lib\site-packages\sklearn\linear_model\_logistic.py:473: ConvergenceWarning: lbfgs failed to converge after 100 iteration(s) (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
R2 = 0.5358
┌──────────────────────────┬───────────┐
│                 Variable ┆      Beta │
╞══════════════════════════╪═══════════╡
│                     var2 ┆    0.1956 │
│           C(var5)[False] ┆     2.013 │
│            C(var5)[True] ┆    -2.175 │
│                 repeat_1 ┆   -0.1157 │
│              unrelated_5 ┆   -0.4133 │
│              unrelated_3 ┆   -0.1533 │
│              unrelated_1 ┆   -0.1157 │
│                     var3 ┆ -0.003659 │
│                     var4 ┆   0.08833 │
│     var3:C(var5)[T.True] ┆ -0.002562 │
│     var4:C(var5)[T.True] ┆   -0.9745 │
│ var4:var3:C(var5)[False] ┆ -0.000146 │
│  var4:var3:C(var5)[True] ┆   0.03792 │
│              _Intercept_ ┆    -0.176 │
└──────────────────────────┴───────────┘
     Finding 10 nearest neighbors on ['___prediction']
     Randomly picking one and donating ['var_reg1']
     Most common matches: 
shape: (5, 2)
┌──────────────┬─────────┐
│ ___rownumber ┆ nDonors │
│ ---          ┆ ---     │
│ i16          ┆ i8      │
╞══════════════╪═════════╡
│ 1951         ┆ 4       │
│ 1969         ┆ 4       │
│ 3986         ┆ 4       │
│ 5434         ┆ 4       │
│ 8815         ┆ 4       │
└──────────────┴─────────┘


Post-imputation statistics for ['var_reg1']
    Where:          None
    Where (impute): col(___imp_missing_var_reg1_1)
┌──────────┬─────────┬───────┬──────────────┬────────┬──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐
│ Variable ┆ Imputed ┆     n ┆ n (not null) ┆   mean ┆ mean (not 0) ┆ std (not 0) ┆ q10 (not 0) ┆ q25 (not 0) ┆ q50 (not 0) ┆ q75 (not 0) ┆ q90 (not 0) ┆ min (not 0) ┆ max (not 0) │
╞══════════╪═════════╪═══════╪══════════════╪════════╪══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡
│ var_reg1 ┆         ┆ 10000 ┆        10000 ┆ 0.5243 ┆            1 ┆           0 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 │
│ var_reg1 ┆       0 ┆  7431 ┆         7431 ┆ 0.5231 ┆            1 ┆           0 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 │
│ var_reg1 ┆       1 ┆  2569 ┆         2569 ┆ 0.5278 ┆            1 ┆           0 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 │
└──────────┴─────────┴───────┴──────────────┴────────┴──────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘




     Running variable selection: Method.LASSO
         Selected model: ~0+var2+C(var5)+repeat_1+var_reg1+var4:var3+unrelated_4+unrelated_3+unrelated_2+unrelated_1+var3:C(var5)+var4:C(var5)+var4:var3:C(var5)+var4+var3+var3+var4+var4+var3
     Imputation using OLS regression with PMM matching
R2 = 0.8163
┌───────────────────────────┬──────────┐
│                  Variable ┆     Beta │
╞═══════════════════════════╪══════════╡
│                      var2 ┆    1.372 │
│            C(var5)[False] ┆    1.077 │
│             C(var5)[True] ┆   -1.077 │
│                  repeat_1 ┆   0.1153 │
│               unrelated_4 ┆   -1.118 │
│               unrelated_3 ┆   0.1726 │
│               unrelated_2 ┆  -0.1515 │
│               unrelated_1 ┆   0.1153 │
│                      var4 ┆   0.6333 │
│                      var3 ┆ 0.004142 │
│                 var4:var3 ┆   -1.001 │
│      var3:C(var5)[T.True] ┆  0.01229 │
│      var4:C(var5)[T.True] ┆    5.903 │
│ var4:var3:C(var5)[T.True] ┆ -0.02945 │
│               _Intercept_ ┆   -0.171 │
└───────────────────────────┴──────────┘
     Finding 10 nearest neighbors on ['___prediction']
     Randomly picking one and donating ['var_reg2']
     Most common matches: 
shape: (5, 2)
┌──────────────┬─────────┐
│ ___rownumber ┆ nDonors │
│ ---          ┆ ---     │
│ i16          ┆ i8      │
╞══════════════╪═════════╡
│ 3986         ┆ 4       │
│ 5072         ┆ 4       │
│ 1485         ┆ 3       │
│ 2571         ┆ 3       │
│ 4036         ┆ 3       │
└──────────────┴─────────┘


Post-imputation statistics for ['var_reg2']
    Where:          col(var_reg1)
    Where (impute): col(___imp_missing_var_reg2_2)
┌──────────┬─────────┬──────┬──────────────┬────────┬──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐
│ Variable ┆ Imputed ┆    n ┆ n (not null) ┆   mean ┆ mean (not 0) ┆ std (not 0) ┆ q10 (not 0) ┆ q25 (not 0) ┆ q50 (not 0) ┆ q75 (not 0) ┆ q90 (not 0) ┆ min (not 0) ┆ max (not 0) │
╞══════════╪═════════╪══════╪══════════════╪════════╪══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡
│ var_reg2 ┆         ┆ 5243 ┆         5243 ┆ -4.393 ┆       -4.639 ┆       13.88 ┆      -24.66 ┆       -13.3 ┆      -2.131 ┆       5.791 ┆       11.32 ┆      -55.35 ┆       26.21 │
│ var_reg2 ┆       0 ┆ 3934 ┆         3934 ┆ -4.275 ┆       -4.515 ┆       13.73 ┆      -24.31 ┆       -13.0 ┆      -2.144 ┆       5.784 ┆       11.19 ┆      -55.35 ┆       26.21 │
│ var_reg2 ┆       1 ┆ 1309 ┆         1309 ┆ -4.748 ┆       -5.012 ┆       14.33 ┆       -26.2 ┆      -13.97 ┆      -2.089 ┆       5.791 ┆       11.67 ┆      -55.35 ┆       24.89 │
└──────────┴─────────┴──────┴──────────────┴────────┴──────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘




Updating data according to narwhals expression: when_then(all_horizontal(col(var_reg1), ignore_nulls=False), col(var_reg2), lit(value=0, dtype=None)).alias(name=var_reg2)
[ 6.73066716e-02 -0.00000000e+00  0.00000000e+00  3.71367698e-01
 -0.00000000e+00 -1.07062599e-03 -0.00000000e+00 -3.72161168e-04
 -0.00000000e+00 -5.38546132e-03 -4.70160709e-07  0.00000000e+00
 -5.77949554e-03  1.71858786e-03  1.74925402e-02]
[ 2.61113153e+00 -4.23284371e-14  2.60300215e+00  0.00000000e+00
 -0.00000000e+00 -4.62318054e+00  7.24762103e-02  4.17414062e-02
  6.79245937e-04 -2.26481611e-02 -0.00000000e+00  1.85148780e-04
 -9.63486912e+00  2.06696684e+00 -2.86280726e-01  4.41981803e+00]
Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/py_srmi_test_regression.srmi/2.srmi.implicate
     Running variable selection: Method.LASSO
         Selected model: ~0+var3+var4+var2+C(var5)+repeat_1+var_reg2+var4:var3+unrelated_5+unrelated_4+unrelated_3+unrelated_2+unrelated_1+var3:C(var5)+var4:var3:C(var5)
     Imputation using Logit regression with PMM matching
C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.venv\Lib\site-packages\sklearn\utils\validation.py:1406: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.venv\Lib\site-packages\sklearn\linear_model\_logistic.py:473: ConvergenceWarning: lbfgs failed to converge after 100 iteration(s) (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
R2 = 0.5999
┌───────────────────────────┬──────────┐
│                  Variable ┆     Beta │
╞═══════════════════════════╪══════════╡
│                      var3 ┆  0.01428 │
│                      var4 ┆   0.5788 │
│                      var2 ┆   0.2365 │
│            C(var5)[False] ┆     2.23 │
│             C(var5)[True] ┆   -2.367 │
│                  repeat_1 ┆ -0.08821 │
│                  var_reg2 ┆  -0.1217 │
│               unrelated_5 ┆  -0.4341 │
│               unrelated_4 ┆  -0.3981 │
│               unrelated_3 ┆ -0.07955 │
│               unrelated_2 ┆ -0.06151 │
│               unrelated_1 ┆ -0.08821 │
│                 var4:var3 ┆  -0.1067 │
│      var3:C(var5)[T.True] ┆ -0.01085 │
│ var4:var3:C(var5)[T.True] ┆  0.06463 │
│               _Intercept_ ┆  -0.1442 │
└───────────────────────────┴──────────┘
     Finding 10 nearest neighbors on ['___prediction']
     Randomly picking one and donating ['var_reg1']
     Most common matches: 
shape: (5, 2)
┌──────────────┬─────────┐
│ ___rownumber ┆ nDonors │
│ ---          ┆ ---     │
│ i16          ┆ i8      │
╞══════════════╪═════════╡
│ 17           ┆ 4       │
│ 4095         ┆ 4       │
│ 5072         ┆ 4       │
│ 221          ┆ 3       │
│ 956          ┆ 3       │
└──────────────┴─────────┘


Post-imputation statistics for ['var_reg1']
    Where:          None
    Where (impute): col(___imp_missing_var_reg1_1)
┌──────────┬─────────┬───────┬──────────────┬────────┬──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐
│ Variable ┆ Imputed ┆     n ┆ n (not null) ┆   mean ┆ mean (not 0) ┆ std (not 0) ┆ q10 (not 0) ┆ q25 (not 0) ┆ q50 (not 0) ┆ q75 (not 0) ┆ q90 (not 0) ┆ min (not 0) ┆ max (not 0) │
╞══════════╪═════════╪═══════╪══════════════╪════════╪══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡
│ var_reg1 ┆         ┆ 12569 ┆        12569 ┆ 0.5226 ┆            1 ┆           0 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 │
│ var_reg1 ┆       0 ┆ 10000 ┆        10000 ┆ 0.5243 ┆            1 ┆           0 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 │
│ var_reg1 ┆       1 ┆  2569 ┆         2569 ┆ 0.5162 ┆            1 ┆           0 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 ┆           1 │
└──────────┴─────────┴───────┴──────────────┴────────┴──────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘




     Running variable selection: Method.LASSO
         Selected model: ~0+var3+var4+var2+C(var5)+var_reg1+var4:var3+unrelated_5+unrelated_4+unrelated_3+unrelated_2+var3:C(var5)+var4:C(var5)+var4:var3:C(var5)
     Imputation using OLS regression with PMM matching
R2 = 0.8185
┌───────────────────────────┬──────────┐
│                  Variable ┆     Beta │
╞═══════════════════════════╪══════════╡
│                      var3 ┆  0.02064 │
│                      var4 ┆    1.082 │
│                      var2 ┆     1.33 │
│            C(var5)[False] ┆   0.5624 │
│             C(var5)[True] ┆  -0.5624 │
│               unrelated_5 ┆  -0.4497 │
│               unrelated_4 ┆  -0.1819 │
│               unrelated_3 ┆  -0.1995 │
│               unrelated_2 ┆   0.1406 │
│                 var4:var3 ┆    -1.02 │
│      var3:C(var5)[T.True] ┆ -0.01145 │
│      var4:C(var5)[T.True] ┆    4.584 │
│ var4:var3:C(var5)[T.True] ┆  0.00607 │
│               _Intercept_ ┆  0.01821 │
└───────────────────────────┴──────────┘
     Finding 10 nearest neighbors on ['___prediction']
[-1.48978929e-01  8.56240169e-02  1.06448209e-02  8.71445031e-03
  4.02266676e-01 -5.68046904e-17 -2.17848333e-03  6.38086984e-04
  1.57094965e-04 -2.79957055e-03 -5.98274298e-03 -4.20249720e-06
 -1.57639625e-01 -0.00000000e+00 -1.64056537e-02  1.14564804e-01]
[ 2.88964501e+00 -8.84771027e-14  2.44263969e+00  6.39779260e-02
 -5.01120737e-03 -4.96586606e+00  0.00000000e+00 -6.27382024e-02
  1.09089409e-02 -1.12658865e-01 -6.71704330e-02  0.00000000e+00
 -9.78013377e+00  1.61462238e+00 -2.20516178e-01  5.35854096e+00]
     Randomly picking one and donating ['var_reg2']
     Most common matches: 
shape: (5, 2)
┌──────────────┬─────────┐
│ ___rownumber ┆ nDonors │
│ ---          ┆ ---     │
│ i16          ┆ i8      │
╞══════════════╪═════════╡
│ 1160         ┆ 3       │
│ 2049         ┆ 3       │
│ 3438         ┆ 3       │
│ 4080         ┆ 3       │
│ 4243         ┆ 3       │
└──────────────┴─────────┘


Post-imputation statistics for ['var_reg2']
    Where:          col(var_reg1)
    Where (impute): col(___imp_missing_var_reg2_2)
┌──────────┬─────────┬──────┬──────────────┬────────┬──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐
│ Variable ┆ Imputed ┆    n ┆ n (not null) ┆   mean ┆ mean (not 0) ┆ std (not 0) ┆ q10 (not 0) ┆ q25 (not 0) ┆ q50 (not 0) ┆ q75 (not 0) ┆ q90 (not 0) ┆ min (not 0) ┆ max (not 0) │
╞══════════╪═════════╪══════╪══════════════╪════════╪══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡
│ var_reg2 ┆         ┆ 6516 ┆         6516 ┆  -4.36 ┆       -4.734 ┆       14.02 ┆      -25.03 ┆      -13.61 ┆      -2.154 ┆       5.784 ┆       11.37 ┆      -55.35 ┆       26.21 │
│ var_reg2 ┆       0 ┆ 5213 ┆         5213 ┆ -4.351 ┆       -4.715 ┆       13.94 ┆      -24.85 ┆      -13.48 ┆      -2.209 ┆       5.767 ┆       11.32 ┆      -55.35 ┆       26.21 │
│ var_reg2 ┆       1 ┆ 1303 ┆         1303 ┆ -4.395 ┆       -4.809 ┆       14.33 ┆      -25.35 ┆      -14.22 ┆       -2.07 ┆       5.912 ┆       11.75 ┆      -55.35 ┆       24.89 │
└──────────┴─────────┴──────┴──────────────┴────────┴──────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘




Updating data according to narwhals expression: when_then(all_horizontal(col(var_reg1), ignore_nulls=False), col(var_reg2), lit(value=0, dtype=None)).alias(name=var_reg2)

var_reg1

var_reg2

Final Estimates by Iteration
Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/py_srmi_test_regression.srmi/2.srmi.implicate
In [7]:
logger.info("Get the results")
_ = df_list = srmi.df_implicates
Get the results
In [8]:
logger.info("\n\nLook at the original")
_ = summary(df_original)

logger.info("\n\nLook at the imputes")
_ = df_list.pipe(summary)

logger.info("\n\nLook at the imputes | var_reg1 == 0")
_ = df_list.filter(~nw.col("var_reg1")).pipe(summary)

logger.info("\n\nLook at the imputes | var_reg1 == 1")
_ = df_list.filter(nw.col("var_reg1")).pipe(summary)

Look at the original
┌──────────────┬────────┬─────────────┬────────────┬─────────────┬────────────┬───────────┐
│     Variable ┆      n ┆ n (missing) ┆       mean ┆         std ┆        min ┆       max │
╞══════════════╪════════╪═════════════╪════════════╪═════════════╪════════════╪═══════════╡
│  _row_index_ ┆ 10,000 ┆           0 ┆    4,999.5 ┆ 2,886.89568 ┆        0.0 ┆   9,999.0 │
│        index ┆ 10,000 ┆           0 ┆    4,999.5 ┆ 2,886.89568 ┆        0.0 ┆   9,999.0 │
│         year ┆ 10,000 ┆           0 ┆ 2,017.9851 ┆    1.415937 ┆    2,016.0 ┆   2,020.0 │
│        month ┆ 10,000 ┆           0 ┆     6.5137 ┆    3.432141 ┆        1.0 ┆      12.0 │
│         var2 ┆ 10,000 ┆           0 ┆     4.9782 ┆    3.154508 ┆        0.0 ┆      10.0 │
│         var3 ┆ 10,000 ┆           0 ┆    25.1084 ┆   14.752302 ┆        0.0 ┆      50.0 │
│         var4 ┆ 10,000 ┆           0 ┆   0.505666 ┆    0.287861 ┆   0.000027 ┆  0.999997 │
│         var5 ┆ 10,000 ┆           0 ┆     0.4999 ┆    0.500025 ┆        0.0 ┆       1.0 │
│  unrelated_1 ┆ 10,000 ┆           0 ┆   0.502449 ┆    0.288359 ┆   0.000119 ┆  0.999997 │
│  unrelated_2 ┆ 10,000 ┆           0 ┆   0.500105 ┆    0.287638 ┆   0.000049 ┆  0.999539 │
│  unrelated_3 ┆ 10,000 ┆           0 ┆   0.499175 ┆     0.28876 ┆   0.000129 ┆   0.99994 │
│  unrelated_4 ┆ 10,000 ┆           0 ┆   0.500655 ┆    0.288698 ┆   0.000133 ┆  0.999972 │
│  unrelated_5 ┆ 10,000 ┆           0 ┆    0.49876 ┆    0.288979 ┆   0.000071 ┆  0.999867 │
│ missing_reg1 ┆ 10,000 ┆           0 ┆   0.495784 ┆    0.289051 ┆   0.000161 ┆   0.99991 │
│ missing_reg2 ┆ 10,000 ┆           0 ┆   0.502597 ┆    0.288468 ┆   0.000006 ┆  0.999963 │
│     var_reg1 ┆ 10,000 ┆           0 ┆     0.5229 ┆      0.4995 ┆        0.0 ┆       1.0 │
│     var_reg2 ┆ 10,000 ┆           0 ┆  -2.402477 ┆   10.331745 ┆ -55.354108 ┆ 26.213084 │
└──────────────┴────────┴─────────────┴────────────┴─────────────┴────────────┴───────────┘

Look at the imputes
┌──────────────┬────────┬─────────────┬────────────┬─────────────┬────────────┬───────────┐
│     Variable ┆      n ┆ n (missing) ┆       mean ┆         std ┆        min ┆       max │
╞══════════════╪════════╪═════════════╪════════════╪═════════════╪════════════╪═══════════╡
│ ___rownumber ┆ 10,000 ┆           0 ┆    4,999.5 ┆ 2,886.89568 ┆        0.0 ┆   9,999.0 │
│  _row_index_ ┆ 10,000 ┆           0 ┆    4,999.5 ┆ 2,886.89568 ┆        0.0 ┆   9,999.0 │
│        index ┆ 10,000 ┆           0 ┆    4,999.5 ┆ 2,886.89568 ┆        0.0 ┆   9,999.0 │
│         year ┆ 10,000 ┆           0 ┆ 2,017.9851 ┆    1.415937 ┆    2,016.0 ┆   2,020.0 │
│        month ┆ 10,000 ┆           0 ┆     6.5137 ┆    3.432141 ┆        1.0 ┆      12.0 │
│         var2 ┆ 10,000 ┆           0 ┆     4.9782 ┆    3.154508 ┆        0.0 ┆      10.0 │
│         var3 ┆ 10,000 ┆           0 ┆    25.1084 ┆   14.752302 ┆        0.0 ┆      50.0 │
│         var4 ┆ 10,000 ┆           0 ┆   0.505666 ┆    0.287861 ┆   0.000027 ┆  0.999997 │
│         var5 ┆ 10,000 ┆           0 ┆     0.4999 ┆    0.500025 ┆        0.0 ┆       1.0 │
│  unrelated_1 ┆ 10,000 ┆           0 ┆   0.502449 ┆    0.288359 ┆   0.000119 ┆  0.999997 │
│  unrelated_2 ┆ 10,000 ┆           0 ┆   0.500105 ┆    0.287638 ┆   0.000049 ┆  0.999539 │
│  unrelated_3 ┆ 10,000 ┆           0 ┆   0.499175 ┆     0.28876 ┆   0.000129 ┆   0.99994 │
│  unrelated_4 ┆ 10,000 ┆           0 ┆   0.500655 ┆    0.288698 ┆   0.000133 ┆  0.999972 │
│  unrelated_5 ┆ 10,000 ┆           0 ┆    0.49876 ┆    0.288979 ┆   0.000071 ┆  0.999867 │
│     repeat_1 ┆ 10,000 ┆           0 ┆   0.502449 ┆    0.288359 ┆   0.000119 ┆  0.999997 │
│     var_reg1 ┆ 10,000 ┆           0 ┆     0.5216 ┆    0.499558 ┆        0.0 ┆       1.0 │
│     var_reg2 ┆ 10,000 ┆           0 ┆  -2.180802 ┆    9.844918 ┆ -55.354108 ┆ 26.213084 │
└──────────────┴────────┴─────────────┴────────────┴─────────────┴────────────┴───────────┘
┌──────────────┬────────┬─────────────┬────────────┬─────────────┬────────────┬───────────┐
│     Variable ┆      n ┆ n (missing) ┆       mean ┆         std ┆        min ┆       max │
╞══════════════╪════════╪═════════════╪════════════╪═════════════╪════════════╪═══════════╡
│ ___rownumber ┆ 10,000 ┆           0 ┆    4,999.5 ┆ 2,886.89568 ┆        0.0 ┆   9,999.0 │
│  _row_index_ ┆ 10,000 ┆           0 ┆    4,999.5 ┆ 2,886.89568 ┆        0.0 ┆   9,999.0 │
│        index ┆ 10,000 ┆           0 ┆    4,999.5 ┆ 2,886.89568 ┆        0.0 ┆   9,999.0 │
│         year ┆ 10,000 ┆           0 ┆ 2,017.9851 ┆    1.415937 ┆    2,016.0 ┆   2,020.0 │
│        month ┆ 10,000 ┆           0 ┆     6.5137 ┆    3.432141 ┆        1.0 ┆      12.0 │
│         var2 ┆ 10,000 ┆           0 ┆     4.9782 ┆    3.154508 ┆        0.0 ┆      10.0 │
│         var3 ┆ 10,000 ┆           0 ┆    25.1084 ┆   14.752302 ┆        0.0 ┆      50.0 │
│         var4 ┆ 10,000 ┆           0 ┆   0.505666 ┆    0.287861 ┆   0.000027 ┆  0.999997 │
│         var5 ┆ 10,000 ┆           0 ┆     0.4999 ┆    0.500025 ┆        0.0 ┆       1.0 │
│  unrelated_1 ┆ 10,000 ┆           0 ┆   0.502449 ┆    0.288359 ┆   0.000119 ┆  0.999997 │
│  unrelated_2 ┆ 10,000 ┆           0 ┆   0.500105 ┆    0.287638 ┆   0.000049 ┆  0.999539 │
│  unrelated_3 ┆ 10,000 ┆           0 ┆   0.499175 ┆     0.28876 ┆   0.000129 ┆   0.99994 │
│  unrelated_4 ┆ 10,000 ┆           0 ┆   0.500655 ┆    0.288698 ┆   0.000133 ┆  0.999972 │
│  unrelated_5 ┆ 10,000 ┆           0 ┆    0.49876 ┆    0.288979 ┆   0.000071 ┆  0.999867 │
│     repeat_1 ┆ 10,000 ┆           0 ┆   0.502449 ┆    0.288359 ┆   0.000119 ┆  0.999997 │
│     var_reg1 ┆ 10,000 ┆           0 ┆     0.5213 ┆    0.499571 ┆        0.0 ┆       1.0 │
│     var_reg2 ┆ 10,000 ┆           0 ┆   -2.23131 ┆    9.939851 ┆ -55.354108 ┆ 26.213084 │
└──────────────┴────────┴─────────────┴────────────┴─────────────┴────────────┴───────────┘

Look at the imputes | var_reg1 == 0
┌──────────────┬───────┬─────────────┬──────────────┬──────────────┬──────────┬──────────┐
│     Variable ┆     n ┆ n (missing) ┆         mean ┆          std ┆      min ┆      max │
╞══════════════╪═══════╪═════════════╪══════════════╪══════════════╪══════════╪══════════╡
│ ___rownumber ┆ 4,784 ┆           0 ┆ 5,033.632734 ┆ 2,891.610847 ┆      1.0 ┆  9,997.0 │
│  _row_index_ ┆ 4,784 ┆           0 ┆ 5,033.632734 ┆ 2,891.610847 ┆      1.0 ┆  9,997.0 │
│        index ┆ 4,784 ┆           0 ┆ 5,033.632734 ┆ 2,891.610847 ┆      1.0 ┆  9,997.0 │
│         year ┆ 4,784 ┆           0 ┆  2,017.99561 ┆     1.420329 ┆  2,016.0 ┆  2,020.0 │
│        month ┆ 4,784 ┆           0 ┆     6.572742 ┆     3.407715 ┆      1.0 ┆     12.0 │
│         var2 ┆ 4,784 ┆           0 ┆     4.628135 ┆     3.233716 ┆      0.0 ┆     10.0 │
│         var3 ┆ 4,784 ┆           0 ┆    24.966137 ┆    13.395857 ┆      0.0 ┆     50.0 │
│         var4 ┆ 4,784 ┆           0 ┆     0.507489 ┆     0.289287 ┆ 0.000027 ┆ 0.999997 │
│         var5 ┆ 4,784 ┆           0 ┆     0.886288 ┆     0.317495 ┆      0.0 ┆      1.0 │
│  unrelated_1 ┆ 4,784 ┆           0 ┆      0.50384 ┆     0.286077 ┆ 0.000119 ┆ 0.999921 │
│  unrelated_2 ┆ 4,784 ┆           0 ┆     0.499727 ┆     0.285433 ┆ 0.000079 ┆ 0.999528 │
│  unrelated_3 ┆ 4,784 ┆           0 ┆     0.501752 ┆     0.289139 ┆ 0.000139 ┆  0.99994 │
│  unrelated_4 ┆ 4,784 ┆           0 ┆      0.50239 ┆     0.289704 ┆ 0.000141 ┆ 0.999961 │
│  unrelated_5 ┆ 4,784 ┆           0 ┆     0.501731 ┆     0.290909 ┆ 0.000071 ┆  0.99984 │
│     repeat_1 ┆ 4,784 ┆           0 ┆      0.50384 ┆     0.286077 ┆ 0.000119 ┆ 0.999921 │
│     var_reg1 ┆ 4,784 ┆           0 ┆          0.0 ┆          0.0 ┆      0.0 ┆      0.0 │
│     var_reg2 ┆ 4,784 ┆           0 ┆          0.0 ┆          0.0 ┆      0.0 ┆      0.0 │
└──────────────┴───────┴─────────────┴──────────────┴──────────────┴──────────┴──────────┘
┌──────────────┬───────┬─────────────┬──────────────┬──────────────┬──────────┬──────────┐
│     Variable ┆     n ┆ n (missing) ┆         mean ┆          std ┆      min ┆      max │
╞══════════════╪═══════╪═════════════╪══════════════╪══════════════╪══════════╪══════════╡
│ ___rownumber ┆ 4,787 ┆           0 ┆ 5,029.025695 ┆ 2,905.488834 ┆      1.0 ┆  9,997.0 │
│  _row_index_ ┆ 4,787 ┆           0 ┆ 5,029.025695 ┆ 2,905.488834 ┆      1.0 ┆  9,997.0 │
│        index ┆ 4,787 ┆           0 ┆ 5,029.025695 ┆ 2,905.488834 ┆      1.0 ┆  9,997.0 │
│         year ┆ 4,787 ┆           0 ┆  2,017.99248 ┆     1.421562 ┆  2,016.0 ┆  2,020.0 │
│        month ┆ 4,787 ┆           0 ┆     6.585126 ┆     3.414898 ┆      1.0 ┆     12.0 │
│         var2 ┆ 4,787 ┆           0 ┆     4.609359 ┆     3.225133 ┆      0.0 ┆     10.0 │
│         var3 ┆ 4,787 ┆           0 ┆    24.976603 ┆    13.404733 ┆      0.0 ┆     50.0 │
│         var4 ┆ 4,787 ┆           0 ┆     0.508052 ┆     0.288739 ┆ 0.000027 ┆ 0.999997 │
│         var5 ┆ 4,787 ┆           0 ┆     0.885941 ┆     0.317916 ┆      0.0 ┆      1.0 │
│  unrelated_1 ┆ 4,787 ┆           0 ┆     0.503982 ┆     0.285823 ┆ 0.000119 ┆ 0.999921 │
│  unrelated_2 ┆ 4,787 ┆           0 ┆     0.499459 ┆     0.285463 ┆ 0.000079 ┆ 0.999528 │
│  unrelated_3 ┆ 4,787 ┆           0 ┆     0.501814 ┆     0.289014 ┆ 0.000139 ┆  0.99994 │
│  unrelated_4 ┆ 4,787 ┆           0 ┆     0.500316 ┆      0.28914 ┆ 0.000141 ┆ 0.999961 │
│  unrelated_5 ┆ 4,787 ┆           0 ┆     0.501014 ┆      0.29115 ┆ 0.000071 ┆  0.99984 │
│     repeat_1 ┆ 4,787 ┆           0 ┆     0.503982 ┆     0.285823 ┆ 0.000119 ┆ 0.999921 │
│     var_reg1 ┆ 4,787 ┆           0 ┆          0.0 ┆          0.0 ┆      0.0 ┆      0.0 │
│     var_reg2 ┆ 4,787 ┆           0 ┆          0.0 ┆          0.0 ┆      0.0 ┆      0.0 │
└──────────────┴───────┴─────────────┴──────────────┴──────────────┴──────────┴──────────┘

Look at the imputes | var_reg1 == 1
┌──────────────┬───────┬─────────────┬─────────────┬──────────────┬────────────┬───────────┐
│     Variable ┆     n ┆ n (missing) ┆        mean ┆          std ┆        min ┆       max │
╞══════════════╪═══════╪═════════════╪═════════════╪══════════════╪════════════╪═══════════╡
│ ___rownumber ┆ 5,216 ┆           0 ┆ 4,968.19421 ┆ 2,882.486117 ┆        0.0 ┆   9,999.0 │
│  _row_index_ ┆ 5,216 ┆           0 ┆ 4,968.19421 ┆ 2,882.486117 ┆        0.0 ┆   9,999.0 │
│        index ┆ 5,216 ┆           0 ┆ 4,968.19421 ┆ 2,882.486117 ┆        0.0 ┆   9,999.0 │
│         year ┆ 5,216 ┆           0 ┆ 2,017.97546 ┆     1.411965 ┆    2,016.0 ┆   2,020.0 │
│        month ┆ 5,216 ┆           0 ┆    6.459548 ┆     3.453831 ┆        1.0 ┆      12.0 │
│         var2 ┆ 5,216 ┆           0 ┆    5.299271 ┆     3.045196 ┆        0.0 ┆      10.0 │
│         var3 ┆ 5,216 ┆           0 ┆    25.23888 ┆    15.895128 ┆        0.0 ┆      50.0 │
│         var4 ┆ 5,216 ┆           0 ┆    0.503993 ┆     0.286564 ┆   0.000104 ┆  0.999885 │
│         var5 ┆ 5,216 ┆           0 ┆    0.145514 ┆     0.352652 ┆        0.0 ┆       1.0 │
│  unrelated_1 ┆ 5,216 ┆           0 ┆    0.501172 ┆     0.290459 ┆   0.000248 ┆  0.999997 │
│  unrelated_2 ┆ 5,216 ┆           0 ┆    0.500452 ┆     0.289673 ┆   0.000049 ┆  0.999539 │
│  unrelated_3 ┆ 5,216 ┆           0 ┆    0.496812 ┆     0.288419 ┆   0.000129 ┆  0.999622 │
│  unrelated_4 ┆ 5,216 ┆           0 ┆    0.499064 ┆     0.287791 ┆   0.000133 ┆  0.999972 │
│  unrelated_5 ┆ 5,216 ┆           0 ┆    0.496036 ┆     0.287198 ┆   0.000181 ┆  0.999867 │
│     repeat_1 ┆ 5,216 ┆           0 ┆    0.501172 ┆     0.290459 ┆   0.000248 ┆  0.999997 │
│     var_reg1 ┆ 5,216 ┆           0 ┆         1.0 ┆          0.0 ┆        1.0 ┆       1.0 │
│     var_reg2 ┆ 5,216 ┆           0 ┆   -4.180985 ┆    13.321794 ┆ -55.354108 ┆ 26.213084 │
└──────────────┴───────┴─────────────┴─────────────┴──────────────┴────────────┴───────────┘
┌──────────────┬───────┬─────────────┬──────────────┬──────────────┬────────────┬───────────┐
│     Variable ┆     n ┆ n (missing) ┆         mean ┆          std ┆        min ┆       max │
╞══════════════╪═══════╪═════════════╪══════════════╪══════════════╪════════════╪═══════════╡
│ ___rownumber ┆ 5,213 ┆           0 ┆ 4,972.387109 ┆ 2,869.727124 ┆        0.0 ┆   9,999.0 │
│  _row_index_ ┆ 5,213 ┆           0 ┆ 4,972.387109 ┆ 2,869.727124 ┆        0.0 ┆   9,999.0 │
│        index ┆ 5,213 ┆           0 ┆ 4,972.387109 ┆ 2,869.727124 ┆        0.0 ┆   9,999.0 │
│         year ┆ 5,213 ┆           0 ┆ 2,017.978323 ┆     1.410855 ┆    2,016.0 ┆   2,020.0 │
│        month ┆ 5,213 ┆           0 ┆      6.44811 ┆     3.446923 ┆        1.0 ┆      12.0 │
│         var2 ┆ 5,213 ┆           0 ┆       5.3169 ┆     3.049492 ┆        0.0 ┆      10.0 │
│         var3 ┆ 5,213 ┆           0 ┆    25.229426 ┆    15.889739 ┆        0.0 ┆      50.0 │
│         var4 ┆ 5,213 ┆           0 ┆     0.503474 ┆     0.287062 ┆   0.000104 ┆  0.999885 │
│         var5 ┆ 5,213 ┆           0 ┆     0.145406 ┆     0.352543 ┆        0.0 ┆       1.0 │
│  unrelated_1 ┆ 5,213 ┆           0 ┆      0.50104 ┆     0.290689 ┆   0.000248 ┆  0.999997 │
│  unrelated_2 ┆ 5,213 ┆           0 ┆     0.500698 ┆     0.289648 ┆   0.000049 ┆  0.999539 │
│  unrelated_3 ┆ 5,213 ┆           0 ┆     0.496752 ┆     0.288533 ┆   0.000129 ┆  0.999622 │
│  unrelated_4 ┆ 5,213 ┆           0 ┆     0.500967 ┆     0.288319 ┆   0.000133 ┆  0.999972 │
│  unrelated_5 ┆ 5,213 ┆           0 ┆     0.496691 ┆     0.286984 ┆   0.000181 ┆  0.999867 │
│     repeat_1 ┆ 5,213 ┆           0 ┆      0.50104 ┆     0.290689 ┆   0.000248 ┆  0.999997 │
│     var_reg1 ┆ 5,213 ┆           0 ┆          1.0 ┆          0.0 ┆        1.0 ┆       1.0 │
│     var_reg2 ┆ 5,213 ┆           0 ┆     -4.28028 ┆    13.445184 ┆ -55.354108 ┆ 26.213084 │
└──────────────┴───────┴─────────────┴──────────────┴──────────────┴────────────┴───────────┘