In [1]:
import sys
import os
from pathlib import Path
import narwhals as nw
import polars as pl
import polars.selectors as cs
from survey_kit.utilities.random import RandomData
from survey_kit.utilities.dataframe import summary
from survey_kit.imputation.variable import Variable
from survey_kit.imputation.parameters import Parameters
from survey_kit.imputation.srmi import SRMI
from survey_kit.imputation.selection import Selection
from survey_kit import logger, config
from survey_kit.utilities.dataframe import summary, columns_from_list
from survey_kit.utilities.formula_builder import FormulaBuilder
path = Path(config.code_root)
sys.path.append(os.path.normpath(path.parent.parent / "tests"))
from scratch import path_scratch
config.data_root = path_scratch(temp_file_suffix=False)
In [2]:
# Draw some random data
n_rows = 10_000
impute_share = 0.25
df = (
RandomData(n_rows=n_rows, seed=32565437)
.index("index")
.integer("year", 2016, 2020)
.integer("month", 1, 12)
.integer("var2", 0, 10)
.integer("var3", 0, 50)
.float("var4", 0, 1)
.integer("var5", 0, 1)
.float("unrelated_1", 0, 1)
.float("unrelated_2", 0, 1)
.float("unrelated_3", 0, 1)
.float("unrelated_4", 0, 1)
.float("unrelated_5", 0, 1)
.np_distribution("epsilon_reg1", "normal", scale=5)
.np_distribution("epsilon_reg2", "normal", scale=5)
.float("missing_reg1", 0, 1)
.float("missing_reg2", 0, 1)
.to_df()
)
# Convenience references to them for creating dependent variables
c_var2 = pl.col("var2")
c_var3 = pl.col("var3")
c_var4 = pl.col("var4")
c_var5 = pl.col("var5")
c_e_reg1 = pl.col("epsilon_reg1")
c_e_reg2 = pl.col("epsilon_reg2")
# Convenience references to them for creating dependent variables
c_var2 = pl.col("var2")
c_var3 = pl.col("var3")
c_var4 = pl.col("var4")
c_var5 = pl.col("var5")
logger.info("var_reg1 is binary and conditional on other variables")
c_reg1 = ((c_var2 * 2 - c_var3 * 3 * c_var5 + c_e_reg1) > 0).alias("var_reg1")
logger.info("var_reg2 is != 0 only if var_reg1 == True")
c_reg2 = (
pl.when(pl.col("var_reg1"))
.then(((c_var2 * 1.5 - c_var3 * 1 * c_var4 + c_e_reg2)))
.otherwise(pl.lit(0))
.alias("var_reg2")
)
# Create a bunch of variables that are functions of the variables created above
df = (
df.with_columns(c_reg1)
.with_columns(c_reg2)
.drop(columns_from_list(df=df, columns="epsilon*"))
.with_row_index(name="_row_index_")
)
df_original = df
# Set variables to missing according to the uniform random variables missing_
clear_missing = []
for prefixi in ["reg"]:
for i in range(1, 3):
vari = f"var_{prefixi}{i}"
missingi = f"missing_{prefixi}{i}"
clear_missing.append(
pl.when(pl.col(missingi) < impute_share)
.then(pl.lit(None))
.otherwise(pl.col(vari))
.alias(vari)
)
df = df.with_columns(clear_missing).drop(cs.starts_with("missing_"))
# Make a fully collinear var for testing
df = df.with_columns(pl.col("unrelated_1").alias("repeat_1"))
summary(df)
# Actually do the imputation
var_reg1 is binary and conditional on other variables
var_reg2 is != 0 only if var_reg1 == True
┌─────────────┬────────┬─────────────┬────────────┬─────────────┬────────────┬───────────┐ │ Variable ┆ n ┆ n (missing) ┆ mean ┆ std ┆ min ┆ max │ ╞═════════════╪════════╪═════════════╪════════════╪═════════════╪════════════╪═══════════╡ │ _row_index_ ┆ 10,000 ┆ 0 ┆ 4,999.5 ┆ 2,886.89568 ┆ 0.0 ┆ 9,999.0 │ │ index ┆ 10,000 ┆ 0 ┆ 4,999.5 ┆ 2,886.89568 ┆ 0.0 ┆ 9,999.0 │ │ year ┆ 10,000 ┆ 0 ┆ 2,017.9851 ┆ 1.415937 ┆ 2,016.0 ┆ 2,020.0 │ │ month ┆ 10,000 ┆ 0 ┆ 6.5137 ┆ 3.432141 ┆ 1.0 ┆ 12.0 │ │ var2 ┆ 10,000 ┆ 0 ┆ 4.9782 ┆ 3.154508 ┆ 0.0 ┆ 10.0 │ │ var3 ┆ 10,000 ┆ 0 ┆ 25.1084 ┆ 14.752302 ┆ 0.0 ┆ 50.0 │ │ var4 ┆ 10,000 ┆ 0 ┆ 0.505666 ┆ 0.287861 ┆ 0.000027 ┆ 0.999997 │ │ var5 ┆ 10,000 ┆ 0 ┆ 0.4999 ┆ 0.500025 ┆ 0.0 ┆ 1.0 │ │ unrelated_1 ┆ 10,000 ┆ 0 ┆ 0.502449 ┆ 0.288359 ┆ 0.000119 ┆ 0.999997 │ │ unrelated_2 ┆ 10,000 ┆ 0 ┆ 0.500105 ┆ 0.287638 ┆ 0.000049 ┆ 0.999539 │ │ unrelated_3 ┆ 10,000 ┆ 0 ┆ 0.499175 ┆ 0.28876 ┆ 0.000129 ┆ 0.99994 │ │ unrelated_4 ┆ 10,000 ┆ 0 ┆ 0.500655 ┆ 0.288698 ┆ 0.000133 ┆ 0.999972 │ │ unrelated_5 ┆ 10,000 ┆ 0 ┆ 0.49876 ┆ 0.288979 ┆ 0.000071 ┆ 0.999867 │ │ var_reg1 ┆ 10,000 ┆ 2,569 ┆ 0.523079 ┆ 0.499501 ┆ 0.0 ┆ 1.0 │ │ var_reg2 ┆ 10,000 ┆ 2,476 ┆ -2.353019 ┆ 10.232008 ┆ -55.354108 ┆ 26.213084 │ │ repeat_1 ┆ 10,000 ┆ 0 ┆ 0.502449 ┆ 0.288359 ┆ 0.000119 ┆ 0.999997 │ └─────────────┴────────┴─────────────┴────────────┴─────────────┴────────────┴───────────┘
Out[2]:
naive plan: (run LazyFrame.explain(optimized=True) to see the optimized plan)
SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] UNION PLAN 0: WITH_COLUMNS: [col("n (missing)").cast(Int16), col("min").strict_cast(Float64), col("max").strict_cast(Float64)] SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] WITH_COLUMNS: ["_row_index_".alias("Variable")] SELECT [col("max"), col("n"), col("std"), col("min"), col("mean"), col("n (missing)")] SELECT [col("___index___"), col("_row_index__max").alias("max"), col("_row_index__rawn").alias("n"), col("_row_index__std").alias("std"), col("_row_index__min").alias("min"), col("_row_index__mean").alias("mean"), col("_row_index__rawn_missing").alias("n (missing)")] SELECT [col("___index___"), col("_row_index__max"), col("_row_index__rawn"), col("_row_index__std"), col("_row_index__min"), col("_row_index__mean"), col("_row_index__rawn_missing")] DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */97 COLUMNS PLAN 1: WITH_COLUMNS: [col("n (missing)").cast(Int16), col("min").strict_cast(Float64), col("max").strict_cast(Float64)] SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] WITH_COLUMNS: ["index".alias("Variable")] SELECT [col("max"), col("n"), col("std"), col("min"), col("mean"), col("n (missing)")] SELECT [col("___index___"), col("index_max").alias("max"), col("index_rawn").alias("n"), col("index_std").alias("std"), col("index_min").alias("min"), col("index_mean").alias("mean"), col("index_rawn_missing").alias("n (missing)")] SELECT [col("___index___"), col("index_max"), col("index_rawn"), col("index_std"), col("index_min"), col("index_mean"), col("index_rawn_missing")] DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */97 COLUMNS PLAN 2: WITH_COLUMNS: [col("n (missing)").cast(Int16), col("min").strict_cast(Float64), col("max").strict_cast(Float64)] SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] WITH_COLUMNS: ["year".alias("Variable")] SELECT [col("max"), col("n"), col("std"), col("min"), col("mean"), col("n (missing)")] SELECT [col("___index___"), col("year_max").alias("max"), col("year_rawn").alias("n"), col("year_std").alias("std"), col("year_min").alias("min"), col("year_mean").alias("mean"), col("year_rawn_missing").alias("n (missing)")] SELECT [col("___index___"), col("year_max"), col("year_rawn"), col("year_std"), col("year_min"), col("year_mean"), col("year_rawn_missing")] DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */97 COLUMNS PLAN 3: WITH_COLUMNS: [col("n (missing)").cast(Int16), col("min").strict_cast(Float64), col("max").strict_cast(Float64)] SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] WITH_COLUMNS: ["month".alias("Variable")] SELECT [col("max"), col("n"), col("std"), col("min"), col("mean"), col("n (missing)")] SELECT [col("___index___"), col("month_max").alias("max"), col("month_rawn").alias("n"), col("month_std").alias("std"), col("month_min").alias("min"), col("month_mean").alias("mean"), col("month_rawn_missing").alias("n (missing)")] SELECT [col("___index___"), col("month_max"), col("month_rawn"), col("month_std"), col("month_min"), col("month_mean"), col("month_rawn_missing")] DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */97 COLUMNS PLAN 4: WITH_COLUMNS: [col("n (missing)").cast(Int16), col("min").strict_cast(Float64), col("max").strict_cast(Float64)] SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] WITH_COLUMNS: ["var2".alias("Variable")] SELECT [col("max"), col("n"), col("std"), col("min"), col("mean"), col("n (missing)")] SELECT [col("___index___"), col("var2_max").alias("max"), col("var2_rawn").alias("n"), col("var2_std").alias("std"), col("var2_min").alias("min"), col("var2_mean").alias("mean"), col("var2_rawn_missing").alias("n (missing)")] SELECT [col("___index___"), col("var2_max"), col("var2_rawn"), col("var2_std"), col("var2_min"), col("var2_mean"), col("var2_rawn_missing")] DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */97 COLUMNS PLAN 5: WITH_COLUMNS: [col("n (missing)").cast(Int16), col("min").strict_cast(Float64), col("max").strict_cast(Float64)] SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] WITH_COLUMNS: ["var3".alias("Variable")] SELECT [col("max"), col("n"), col("std"), col("min"), col("mean"), col("n (missing)")] SELECT [col("___index___"), col("var3_max").alias("max"), col("var3_rawn").alias("n"), col("var3_std").alias("std"), col("var3_min").alias("min"), col("var3_mean").alias("mean"), col("var3_rawn_missing").alias("n (missing)")] SELECT [col("___index___"), col("var3_max"), col("var3_rawn"), col("var3_std"), col("var3_min"), col("var3_mean"), col("var3_rawn_missing")] DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */97 COLUMNS PLAN 6: WITH_COLUMNS: [col("n (missing)").cast(Int16)] SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] WITH_COLUMNS: ["var4".alias("Variable")] SELECT [col("max"), col("n"), col("std"), col("min"), col("mean"), col("n (missing)")] SELECT [col("___index___"), col("var4_max").alias("max"), col("var4_rawn").alias("n"), col("var4_std").alias("std"), col("var4_min").alias("min"), col("var4_mean").alias("mean"), col("var4_rawn_missing").alias("n (missing)")] SELECT [col("___index___"), col("var4_max"), col("var4_rawn"), col("var4_std"), col("var4_min"), col("var4_mean"), col("var4_rawn_missing")] DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */97 COLUMNS PLAN 7: WITH_COLUMNS: [col("n (missing)").cast(Int16), col("min").strict_cast(Float64), col("max").strict_cast(Float64)] SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] WITH_COLUMNS: ["var5".alias("Variable")] SELECT [col("max"), col("n"), col("std"), col("min"), col("mean"), col("n (missing)")] SELECT [col("___index___"), col("var5_max").alias("max"), col("var5_rawn").alias("n"), col("var5_std").alias("std"), col("var5_min").alias("min"), col("var5_mean").alias("mean"), col("var5_rawn_missing").alias("n (missing)")] SELECT [col("___index___"), col("var5_max"), col("var5_rawn"), col("var5_std"), col("var5_min"), col("var5_mean"), col("var5_rawn_missing")] DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */97 COLUMNS PLAN 8: WITH_COLUMNS: [col("n (missing)").cast(Int16)] SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] WITH_COLUMNS: ["unrelated_1".alias("Variable")] SELECT [col("max"), col("n"), col("std"), col("min"), col("mean"), col("n (missing)")] SELECT [col("___index___"), col("unrelated_1_max").alias("max"), col("unrelated_1_rawn").alias("n"), col("unrelated_1_std").alias("std"), col("unrelated_1_min").alias("min"), col("unrelated_1_mean").alias("mean"), col("unrelated_1_rawn_missing").alias("n (missing)")] SELECT [col("___index___"), col("unrelated_1_max"), col("unrelated_1_rawn"), col("unrelated_1_std"), col("unrelated_1_min"), col("unrelated_1_mean"), col("unrelated_1_rawn_missing")] DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */97 COLUMNS PLAN 9: WITH_COLUMNS: [col("n (missing)").cast(Int16)] SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] WITH_COLUMNS: ["unrelated_2".alias("Variable")] SELECT [col("max"), col("n"), col("std"), col("min"), col("mean"), col("n (missing)")] SELECT [col("___index___"), col("unrelated_2_max").alias("max"), col("unrelated_2_rawn").alias("n"), col("unrelated_2_std").alias("std"), col("unrelated_2_min").alias("min"), col("unrelated_2_mean").alias("mean"), col("unrelated_2_rawn_missing").alias("n (missing)")] SELECT [col("___index___"), col("unrelated_2_max"), col("unrelated_2_rawn"), col("unrelated_2_std"), col("unrelated_2_min"), col("unrelated_2_mean"), col("unrelated_2_rawn_missing")] DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */97 COLUMNS PLAN 10: WITH_COLUMNS: [col("n (missing)").cast(Int16)] SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] WITH_COLUMNS: ["unrelated_3".alias("Variable")] SELECT [col("max"), col("n"), col("std"), col("min"), col("mean"), col("n (missing)")] SELECT [col("___index___"), col("unrelated_3_max").alias("max"), col("unrelated_3_rawn").alias("n"), col("unrelated_3_std").alias("std"), col("unrelated_3_min").alias("min"), col("unrelated_3_mean").alias("mean"), col("unrelated_3_rawn_missing").alias("n (missing)")] SELECT [col("___index___"), col("unrelated_3_max"), col("unrelated_3_rawn"), col("unrelated_3_std"), col("unrelated_3_min"), col("unrelated_3_mean"), col("unrelated_3_rawn_missing")] DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */97 COLUMNS PLAN 11: WITH_COLUMNS: [col("n (missing)").cast(Int16)] SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] WITH_COLUMNS: ["unrelated_4".alias("Variable")] SELECT [col("max"), col("n"), col("std"), col("min"), col("mean"), col("n (missing)")] SELECT [col("___index___"), col("unrelated_4_max").alias("max"), col("unrelated_4_rawn").alias("n"), col("unrelated_4_std").alias("std"), col("unrelated_4_min").alias("min"), col("unrelated_4_mean").alias("mean"), col("unrelated_4_rawn_missing").alias("n (missing)")] SELECT [col("___index___"), col("unrelated_4_max"), col("unrelated_4_rawn"), col("unrelated_4_std"), col("unrelated_4_min"), col("unrelated_4_mean"), col("unrelated_4_rawn_missing")] DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */97 COLUMNS PLAN 12: WITH_COLUMNS: [col("n (missing)").cast(Int16)] SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] WITH_COLUMNS: ["unrelated_5".alias("Variable")] SELECT [col("max"), col("n"), col("std"), col("min"), col("mean"), col("n (missing)")] SELECT [col("___index___"), col("unrelated_5_max").alias("max"), col("unrelated_5_rawn").alias("n"), col("unrelated_5_std").alias("std"), col("unrelated_5_min").alias("min"), col("unrelated_5_mean").alias("mean"), col("unrelated_5_rawn_missing").alias("n (missing)")] SELECT [col("___index___"), col("unrelated_5_max"), col("unrelated_5_rawn"), col("unrelated_5_std"), col("unrelated_5_min"), col("unrelated_5_mean"), col("unrelated_5_rawn_missing")] DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */97 COLUMNS PLAN 13: WITH_COLUMNS: [col("min").strict_cast(Float64), col("max").strict_cast(Float64)] SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] WITH_COLUMNS: ["var_reg1".alias("Variable")] SELECT [col("max"), col("n"), col("std"), col("min"), col("mean"), col("n (missing)")] SELECT [col("___index___"), col("var_reg1_max").alias("max"), col("var_reg1_rawn").alias("n"), col("var_reg1_std").alias("std"), col("var_reg1_min").alias("min"), col("var_reg1_mean").alias("mean"), col("var_reg1_rawn_missing").alias("n (missing)")] SELECT [col("___index___"), col("var_reg1_max"), col("var_reg1_rawn"), col("var_reg1_std"), col("var_reg1_min"), col("var_reg1_mean"), col("var_reg1_rawn_missing")] DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */97 COLUMNS PLAN 14: SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] WITH_COLUMNS: ["var_reg2".alias("Variable")] SELECT [col("max"), col("n"), col("std"), col("min"), col("mean"), col("n (missing)")] SELECT [col("___index___"), col("var_reg2_max").alias("max"), col("var_reg2_rawn").alias("n"), col("var_reg2_std").alias("std"), col("var_reg2_min").alias("min"), col("var_reg2_mean").alias("mean"), col("var_reg2_rawn_missing").alias("n (missing)")] SELECT [col("___index___"), col("var_reg2_max"), col("var_reg2_rawn"), col("var_reg2_std"), col("var_reg2_min"), col("var_reg2_mean"), col("var_reg2_rawn_missing")] DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */97 COLUMNS PLAN 15: WITH_COLUMNS: [col("n (missing)").cast(Int16)] SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] WITH_COLUMNS: ["repeat_1".alias("Variable")] SELECT [col("max"), col("n"), col("std"), col("min"), col("mean"), col("n (missing)")] SELECT [col("___index___"), col("repeat_1_max").alias("max"), col("repeat_1_rawn").alias("n"), col("repeat_1_std").alias("std"), col("repeat_1_min").alias("min"), col("repeat_1_mean").alias("mean"), col("repeat_1_rawn_missing").alias("n (missing)")] SELECT [col("___index___"), col("repeat_1_max"), col("repeat_1_rawn"), col("repeat_1_std"), col("repeat_1_min"), col("repeat_1_mean"), col("repeat_1_rawn_missing")] DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */97 COLUMNS END UNION
In [3]:
logger.info("Define the regression model (intentionally include some extraneous variables")
f_model = FormulaBuilder(df=df)
f_model.formula_with_varnames_in_brackets(
"~1+{var_*}+var2+var4+var4*var3*C(var5)+{unrelated_*}+{repeat_*}"
)
logger.info(f_model.formula)
Define the regression model (intentionally include some extraneous variables
~1+var_reg1+var_reg2+var2+var4+var4*var3*C(var5)+unrelated_1+unrelated_2+unrelated_3+unrelated_4+unrelated_5+repeat_1
In [4]:
# Set up the variable to be imputed
vars_impute = []
logger.info("Impute the boolean variable (var_reg1)")
logger.info(" to the default setup for predicted mean matching")
logger.info(" using logit regression")
v_reg1 = Variable(
impute_var="var_reg1",
modeltype=Variable.ModelType.pmm,
model=f_model.formula,
parameters=Parameters.Regression(model=Parameters.RegressionModel.Logit)
)
logger.info("Add the variable to the list to be imputed")
vars_impute.append(v_reg1)
logger.info("Impute the continuous variable (var_reg2) ")
logger.info(" conditional on var_reg1, using narwhals (nw.col('var_reg1'))")
logger.info(" by setting the model type")
logger.info(" and the formula")
logger.info(" as well as a post-processing edit to set var_reg2=0 when var_reg1==0")
v_reg2 = Variable(
impute_var="var_reg2",
Where=nw.col("var_reg1"),
modeltype=Variable.ModelType.pmm,
model=f_model.formula,
# Default parameters
parameters=Parameters.Regression(),
postFunctions=(
nw.when(nw.col("var_reg1"))
.then(nw.col("var_reg2"))
.otherwise(nw.lit(0))
.alias("var_reg2")
)
)
vars_impute.append(v_reg2)
Impute the boolean variable (var_reg1)
to the default setup for predicted mean matching
using logit regression
Add the variable to the list to be imputed
Impute the continuous variable (var_reg2)
conditional on var_reg1, using narwhals (nw.col('var_reg1'))
by setting the model type
and the formula
as well as a post-processing edit to set var_reg2=0 when var_reg1==0
In [5]:
logger.info("Set up the imputation")
logger.info("Add LASSO selection before each imputation")
srmi = SRMI(
df=df,
variables=vars_impute,
n_implicates=2,
n_iterations=2,
parallel=False,
selection=Selection(method=Selection.Method.LASSO),
modeltype=Variable.ModelType.pmm,
model=f_model.formula,
bayesian_bootstrap=True,
path_model=f"{path_scratch()}/py_srmi_test_regression",
force_start=True,
)
Set up the imputation
Add LASSO selection before each imputation
Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/py_srmi_test_regression.srmi
Dropping var_reg1 from formula
Dropping var_reg2 from formula
In [6]:
logger.info("Run it")
srmi.run()
Run it
Variable selection before SRMI run, if necessary
var_reg1: Method.No
var_reg2: Method.No
Hyperparameter tuning before SRMI run, if necessary
Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/py_srmi_test_regression.srmi/1.srmi.implicate
Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/py_srmi_test_regression.srmi/2.srmi.implicate
Dropping var_reg2 from formula
Running variable selection: Method.LASSO
Selected model: ~0+var2+C(var5)+repeat_1+unrelated_5+unrelated_3+unrelated_1+var3:C(var5)+var4:C(var5)+var4:var3:C(var5)+var3+var4+var4+var3
Imputation using Logit regression with PMM matching
C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.venv\Lib\site-packages\sklearn\utils\validation.py:1406: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)
C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.venv\Lib\site-packages\sklearn\linear_model\_logistic.py:473: ConvergenceWarning: lbfgs failed to converge after 100 iteration(s) (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT
Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
R2 = 0.5353
┌──────────────────────────┬───────────┐ │ Variable ┆ Beta │ ╞══════════════════════════╪═══════════╡ │ var2 ┆ 0.2183 │ │ C(var5)[False] ┆ 1.85 │ │ C(var5)[True] ┆ -2.315 │ │ repeat_1 ┆ -0.1076 │ │ unrelated_5 ┆ -0.1383 │ │ unrelated_3 ┆ 0.07106 │ │ unrelated_1 ┆ -0.1076 │ │ var3 ┆ 0.000017 │ │ var4 ┆ 0.08597 │ │ var3:C(var5)[T.True] ┆ -0.005871 │ │ var4:C(var5)[T.True] ┆ -0.6536 │ │ var4:var3:C(var5)[False] ┆ -0.009138 │ │ var4:var3:C(var5)[True] ┆ 0.03426 │ │ _Intercept_ ┆ -0.4963 │ └──────────────────────────┴───────────┘
Finding 10 nearest neighbors on ['___prediction']
Randomly picking one and donating ['var_reg1']
Most common matches:
shape: (5, 2) ┌──────────────┬─────────┐ │ ___rownumber ┆ nDonors │ │ --- ┆ --- │ │ i16 ┆ i8 │ ╞══════════════╪═════════╡ │ 1165 ┆ 4 │ │ 4847 ┆ 4 │ │ 6428 ┆ 4 │ │ 192 ┆ 3 │ │ 224 ┆ 3 │ └──────────────┴─────────┘
Post-imputation statistics for ['var_reg1']
Where: None
Where (impute): col(___imp_missing_var_reg1_1)
┌──────────┬─────────┬───────┬──────────────┬────────┬──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐ │ Variable ┆ Imputed ┆ n ┆ n (not null) ┆ mean ┆ mean (not 0) ┆ std (not 0) ┆ q10 (not 0) ┆ q25 (not 0) ┆ q50 (not 0) ┆ q75 (not 0) ┆ q90 (not 0) ┆ min (not 0) ┆ max (not 0) │ ╞══════════╪═════════╪═══════╪══════════════╪════════╪══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡ │ var_reg1 ┆ ┆ 10000 ┆ 10000 ┆ 0.5283 ┆ 1 ┆ 0 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ │ var_reg1 ┆ 0 ┆ 7431 ┆ 7431 ┆ 0.5231 ┆ 1 ┆ 0 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ │ var_reg1 ┆ 1 ┆ 2569 ┆ 2569 ┆ 0.5434 ┆ 1 ┆ 0 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ └──────────┴─────────┴───────┴──────────────┴────────┴──────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘
Running variable selection: Method.LASSO
Selected model: ~0+var2+C(var5)+repeat_1+var_reg1+var4:var3+unrelated_4+unrelated_2+unrelated_1+var3:C(var5)+var4:C(var5)+var4:var3:C(var5)+var4+var3+var3+var4+var4+var3
Imputation using OLS regression with PMM matching
R2 = 0.8111
┌───────────────────────────┬───────────┐ │ Variable ┆ Beta │ ╞═══════════════════════════╪═══════════╡ │ var2 ┆ 1.351 │ │ C(var5)[False] ┆ 0.5782 │ │ C(var5)[True] ┆ -0.5782 │ │ repeat_1 ┆ 0.09655 │ │ unrelated_4 ┆ -0.001992 │ │ unrelated_2 ┆ 0.4366 │ │ unrelated_1 ┆ 0.09655 │ │ var4 ┆ 0.8138 │ │ var3 ┆ 0.01687 │ │ var4:var3 ┆ -0.9972 │ │ var3:C(var5)[T.True] ┆ -0.04554 │ │ var4:C(var5)[T.True] ┆ 4.446 │ │ var4:var3:C(var5)[T.True] ┆ 0.04509 │ │ _Intercept_ ┆ -0.6182 │ └───────────────────────────┴───────────┘
Finding 10 nearest neighbors on ['___prediction']
Randomly picking one and donating ['var_reg2']
Most common matches:
shape: (5, 2) ┌──────────────┬─────────┐ │ ___rownumber ┆ nDonors │ │ --- ┆ --- │ │ i16 ┆ i8 │ ╞══════════════╪═════════╡ │ 731 ┆ 4 │ │ 1282 ┆ 4 │ │ 4331 ┆ 4 │ │ 51 ┆ 3 │ │ 474 ┆ 3 │ └──────────────┴─────────┘
Post-imputation statistics for ['var_reg2']
Where: col(var_reg1)
Where (impute): col(___imp_missing_var_reg2_2)
┌──────────┬─────────┬──────┬──────────────┬────────┬──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐ │ Variable ┆ Imputed ┆ n ┆ n (not null) ┆ mean ┆ mean (not 0) ┆ std (not 0) ┆ q10 (not 0) ┆ q25 (not 0) ┆ q50 (not 0) ┆ q75 (not 0) ┆ q90 (not 0) ┆ min (not 0) ┆ max (not 0) │ ╞══════════╪═════════╪══════╪══════════════╪════════╪══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡ │ var_reg2 ┆ ┆ 5283 ┆ 5283 ┆ -4.101 ┆ -4.367 ┆ 13.72 ┆ -24.14 ┆ -12.96 ┆ -1.938 ┆ 6.015 ┆ 11.24 ┆ -55.35 ┆ 26.21 │ │ var_reg2 ┆ 0 ┆ 3965 ┆ 3965 ┆ -4.199 ┆ -4.462 ┆ 13.65 ┆ -24.07 ┆ -12.92 ┆ -2.104 ┆ 5.767 ┆ 11.04 ┆ -55.35 ┆ 26.21 │ │ var_reg2 ┆ 1 ┆ 1318 ┆ 1318 ┆ -3.805 ┆ -4.081 ┆ 13.94 ┆ -24.53 ┆ -13.14 ┆ -1.446 ┆ 6.807 ┆ 11.51 ┆ -55.35 ┆ 24.89 │ └──────────┴─────────┴──────┴──────────────┴────────┴──────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘
Updating data according to narwhals expression: when_then(all_horizontal(col(var_reg1), ignore_nulls=False), col(var_reg2), lit(value=0, dtype=None)).alias(name=var_reg2)
[ 6.73066716e-02 -0.00000000e+00 0.00000000e+00 3.71367698e-01 -0.00000000e+00 -1.07062599e-03 -0.00000000e+00 -3.72161168e-04 -0.00000000e+00 -5.38546132e-03 -4.70160709e-07 0.00000000e+00 -5.77949554e-03 1.71858786e-03 1.74925402e-02] [ 2.55784315e+00 -2.60566968e-14 2.55667576e+00 0.00000000e+00 -0.00000000e+00 -4.58348548e+00 5.69456439e-02 3.94256009e-02 0.00000000e+00 -1.51974623e-02 0.00000000e+00 1.40407334e-04 -9.66030779e+00 2.09934930e+00 -3.24711385e-01 4.41965182e+00]
Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/py_srmi_test_regression.srmi/1.srmi.implicate
Running variable selection: Method.LASSO
Selected model: ~0+var3+var4+var2+C(var5)+repeat_1+var_reg2+var4:var3+unrelated_5+unrelated_4+unrelated_3+unrelated_2+unrelated_1+var3:C(var5)+var4:C(var5)+var4:var3:C(var5)
Imputation using Logit regression with PMM matching
C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.venv\Lib\site-packages\sklearn\utils\validation.py:1406: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)
C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.venv\Lib\site-packages\sklearn\linear_model\_logistic.py:473: ConvergenceWarning: lbfgs failed to converge after 100 iteration(s) (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT
Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
R2 = 0.6020
┌───────────────────────────┬──────────┐ │ Variable ┆ Beta │ ╞═══════════════════════════╪══════════╡ │ var3 ┆ 0.004236 │ │ var4 ┆ 0.5434 │ │ var2 ┆ 0.2035 │ │ C(var5)[False] ┆ 2.19 │ │ C(var5)[True] ┆ -2.078 │ │ repeat_1 ┆ -0.0735 │ │ var_reg2 ┆ -0.1184 │ │ unrelated_5 ┆ -0.3512 │ │ unrelated_4 ┆ -0.1332 │ │ unrelated_3 ┆ 0.02857 │ │ unrelated_2 ┆ -0.2581 │ │ unrelated_1 ┆ -0.0735 │ │ var4:var3 ┆ -0.09465 │ │ var3:C(var5)[T.True] ┆ -0.01237 │ │ var4:C(var5)[T.True] ┆ -0.5246 │ │ var4:var3:C(var5)[T.True] ┆ 0.06802 │ │ _Intercept_ ┆ 0.1145 │ └───────────────────────────┴──────────┘
Finding 10 nearest neighbors on ['___prediction']
Randomly picking one and donating ['var_reg1']
Most common matches:
shape: (5, 2) ┌──────────────┬─────────┐ │ ___rownumber ┆ nDonors │ │ --- ┆ --- │ │ i16 ┆ i8 │ ╞══════════════╪═════════╡ │ 4154 ┆ 4 │ │ 275 ┆ 3 │ │ 489 ┆ 3 │ │ 916 ┆ 3 │ │ 1428 ┆ 3 │ └──────────────┴─────────┘
Post-imputation statistics for ['var_reg1']
Where: None
Where (impute): col(___imp_missing_var_reg1_1)
┌──────────┬─────────┬───────┬──────────────┬────────┬──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐ │ Variable ┆ Imputed ┆ n ┆ n (not null) ┆ mean ┆ mean (not 0) ┆ std (not 0) ┆ q10 (not 0) ┆ q25 (not 0) ┆ q50 (not 0) ┆ q75 (not 0) ┆ q90 (not 0) ┆ min (not 0) ┆ max (not 0) │ ╞══════════╪═════════╪═══════╪══════════════╪════════╪══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡ │ var_reg1 ┆ ┆ 12569 ┆ 12569 ┆ 0.5261 ┆ 1 ┆ 0 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ │ var_reg1 ┆ 0 ┆ 10000 ┆ 10000 ┆ 0.5283 ┆ 1 ┆ 0 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ │ var_reg1 ┆ 1 ┆ 2569 ┆ 2569 ┆ 0.5173 ┆ 1 ┆ 0 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ └──────────┴─────────┴───────┴──────────────┴────────┴──────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘
Running variable selection: Method.LASSO
Selected model: ~0+var3+var2+C(var5)+repeat_1+var_reg1+var4:var3+unrelated_5+unrelated_4+unrelated_3+unrelated_1+var3:C(var5)+var4:C(var5)+var4:var3:C(var5)+var4+var4+var4
Imputation using OLS regression with PMM matching
R2 = 0.8037
┌───────────────────────────┬──────────┐ │ Variable ┆ Beta │ ╞═══════════════════════════╪══════════╡ │ var3 ┆ 0.03414 │ │ var2 ┆ 1.378 │ │ C(var5)[False] ┆ 0.3067 │ │ C(var5)[True] ┆ -0.3067 │ │ repeat_1 ┆ 0.1819 │ │ unrelated_5 ┆ 0.003208 │ │ unrelated_4 ┆ -0.0218 │ │ unrelated_3 ┆ 0.2923 │ │ unrelated_1 ┆ 0.1819 │ │ var4 ┆ 1.279 │ │ var4:var3 ┆ -1.014 │ │ var3:C(var5)[T.True] ┆ -0.0864 │ │ var4:C(var5)[T.True] ┆ 3.087 │ │ var4:var3:C(var5)[T.True] ┆ 0.1758 │ │ _Intercept_ ┆ -0.6994 │ └───────────────────────────┴──────────┘
Finding 10 nearest neighbors on ['___prediction']
Randomly picking one and donating ['var_reg2']
Most common matches:
shape: (5, 2) ┌──────────────┬─────────┐ │ ___rownumber ┆ nDonors │ │ --- ┆ --- │ │ i16 ┆ i8 │ ╞══════════════╪═════════╡ │ 903 ┆ 3 │ │ 1374 ┆ 3 │ │ 1581 ┆ 3 │ │ 2288 ┆ 3 │ │ 3173 ┆ 3 │ └──────────────┴─────────┘
Post-imputation statistics for ['var_reg2']
Where: col(var_reg1)
Where (impute): col(___imp_missing_var_reg2_2)
[-1.45298426e-01 7.94160626e-02 7.65124658e-03 5.31405328e-03 4.05410599e-01 -3.55928686e-16 -3.28522843e-03 -3.08145353e-04 -2.25157225e-03 -1.13798554e-03 -4.41409458e-03 -3.73075092e-06 -1.55053474e-01 3.06222340e-03 -2.17534484e-02 1.17422292e-01] [ 2.67119698e+00 -3.39673395e-14 2.48291904e+00 0.00000000e+00 -4.94651969e-02 -4.89330719e+00 2.34282673e-02 -0.00000000e+00 -1.82999899e-02 -3.46458337e-02 -5.33177612e-02 4.90561207e-04 -9.50912160e+00 1.72184415e+00 -2.67490420e-01 5.26295423e+00]
┌──────────┬─────────┬──────┬──────────────┬────────┬──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐ │ Variable ┆ Imputed ┆ n ┆ n (not null) ┆ mean ┆ mean (not 0) ┆ std (not 0) ┆ q10 (not 0) ┆ q25 (not 0) ┆ q50 (not 0) ┆ q75 (not 0) ┆ q90 (not 0) ┆ min (not 0) ┆ max (not 0) │ ╞══════════╪═════════╪══════╪══════════════╪════════╪══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡ │ var_reg2 ┆ ┆ 6522 ┆ 6522 ┆ -4.08 ┆ -4.428 ┆ 13.82 ┆ -24.55 ┆ -13.14 ┆ -1.997 ┆ 6.061 ┆ 11.33 ┆ -55.35 ┆ 26.21 │ │ var_reg2 ┆ 0 ┆ 5216 ┆ 5216 ┆ -4.046 ┆ -4.382 ┆ 13.73 ┆ -24.31 ┆ -12.96 ┆ -1.955 ┆ 5.999 ┆ 11.27 ┆ -55.35 ┆ 26.21 │ │ var_reg2 ┆ 1 ┆ 1306 ┆ 1306 ┆ -4.216 ┆ -4.611 ┆ 14.16 ┆ -25.21 ┆ -13.72 ┆ -2.414 ┆ 6.428 ┆ 11.75 ┆ -55.35 ┆ 26.21 │ └──────────┴─────────┴──────┴──────────────┴────────┴──────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘
Updating data according to narwhals expression: when_then(all_horizontal(col(var_reg1), ignore_nulls=False), col(var_reg2), lit(value=0, dtype=None)).alias(name=var_reg2)
var_reg1
var_reg2
Final Estimates by Iteration
Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/py_srmi_test_regression.srmi/1.srmi.implicate
Dropping var_reg2 from formula
Running variable selection: Method.LASSO
Selected model: ~0+var2+C(var5)+repeat_1+unrelated_5+unrelated_3+unrelated_1+var3:C(var5)+var4:C(var5)+var4:var3:C(var5)+var3+var4+var4+var3
Imputation using Logit regression with PMM matching
C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.venv\Lib\site-packages\sklearn\utils\validation.py:1406: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)
C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.venv\Lib\site-packages\sklearn\linear_model\_logistic.py:473: ConvergenceWarning: lbfgs failed to converge after 100 iteration(s) (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT
Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
R2 = 0.5358
┌──────────────────────────┬───────────┐ │ Variable ┆ Beta │ ╞══════════════════════════╪═══════════╡ │ var2 ┆ 0.1956 │ │ C(var5)[False] ┆ 2.013 │ │ C(var5)[True] ┆ -2.175 │ │ repeat_1 ┆ -0.1157 │ │ unrelated_5 ┆ -0.4133 │ │ unrelated_3 ┆ -0.1533 │ │ unrelated_1 ┆ -0.1157 │ │ var3 ┆ -0.003659 │ │ var4 ┆ 0.08833 │ │ var3:C(var5)[T.True] ┆ -0.002562 │ │ var4:C(var5)[T.True] ┆ -0.9745 │ │ var4:var3:C(var5)[False] ┆ -0.000146 │ │ var4:var3:C(var5)[True] ┆ 0.03792 │ │ _Intercept_ ┆ -0.176 │ └──────────────────────────┴───────────┘
Finding 10 nearest neighbors on ['___prediction']
Randomly picking one and donating ['var_reg1']
Most common matches:
shape: (5, 2) ┌──────────────┬─────────┐ │ ___rownumber ┆ nDonors │ │ --- ┆ --- │ │ i16 ┆ i8 │ ╞══════════════╪═════════╡ │ 1951 ┆ 4 │ │ 1969 ┆ 4 │ │ 3986 ┆ 4 │ │ 5434 ┆ 4 │ │ 8815 ┆ 4 │ └──────────────┴─────────┘
Post-imputation statistics for ['var_reg1']
Where: None
Where (impute): col(___imp_missing_var_reg1_1)
┌──────────┬─────────┬───────┬──────────────┬────────┬──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐ │ Variable ┆ Imputed ┆ n ┆ n (not null) ┆ mean ┆ mean (not 0) ┆ std (not 0) ┆ q10 (not 0) ┆ q25 (not 0) ┆ q50 (not 0) ┆ q75 (not 0) ┆ q90 (not 0) ┆ min (not 0) ┆ max (not 0) │ ╞══════════╪═════════╪═══════╪══════════════╪════════╪══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡ │ var_reg1 ┆ ┆ 10000 ┆ 10000 ┆ 0.5243 ┆ 1 ┆ 0 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ │ var_reg1 ┆ 0 ┆ 7431 ┆ 7431 ┆ 0.5231 ┆ 1 ┆ 0 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ │ var_reg1 ┆ 1 ┆ 2569 ┆ 2569 ┆ 0.5278 ┆ 1 ┆ 0 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ └──────────┴─────────┴───────┴──────────────┴────────┴──────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘
Running variable selection: Method.LASSO
Selected model: ~0+var2+C(var5)+repeat_1+var_reg1+var4:var3+unrelated_4+unrelated_3+unrelated_2+unrelated_1+var3:C(var5)+var4:C(var5)+var4:var3:C(var5)+var4+var3+var3+var4+var4+var3
Imputation using OLS regression with PMM matching
R2 = 0.8163
┌───────────────────────────┬──────────┐ │ Variable ┆ Beta │ ╞═══════════════════════════╪══════════╡ │ var2 ┆ 1.372 │ │ C(var5)[False] ┆ 1.077 │ │ C(var5)[True] ┆ -1.077 │ │ repeat_1 ┆ 0.1153 │ │ unrelated_4 ┆ -1.118 │ │ unrelated_3 ┆ 0.1726 │ │ unrelated_2 ┆ -0.1515 │ │ unrelated_1 ┆ 0.1153 │ │ var4 ┆ 0.6333 │ │ var3 ┆ 0.004142 │ │ var4:var3 ┆ -1.001 │ │ var3:C(var5)[T.True] ┆ 0.01229 │ │ var4:C(var5)[T.True] ┆ 5.903 │ │ var4:var3:C(var5)[T.True] ┆ -0.02945 │ │ _Intercept_ ┆ -0.171 │ └───────────────────────────┴──────────┘
Finding 10 nearest neighbors on ['___prediction']
Randomly picking one and donating ['var_reg2']
Most common matches:
shape: (5, 2) ┌──────────────┬─────────┐ │ ___rownumber ┆ nDonors │ │ --- ┆ --- │ │ i16 ┆ i8 │ ╞══════════════╪═════════╡ │ 3986 ┆ 4 │ │ 5072 ┆ 4 │ │ 1485 ┆ 3 │ │ 2571 ┆ 3 │ │ 4036 ┆ 3 │ └──────────────┴─────────┘
Post-imputation statistics for ['var_reg2']
Where: col(var_reg1)
Where (impute): col(___imp_missing_var_reg2_2)
┌──────────┬─────────┬──────┬──────────────┬────────┬──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐ │ Variable ┆ Imputed ┆ n ┆ n (not null) ┆ mean ┆ mean (not 0) ┆ std (not 0) ┆ q10 (not 0) ┆ q25 (not 0) ┆ q50 (not 0) ┆ q75 (not 0) ┆ q90 (not 0) ┆ min (not 0) ┆ max (not 0) │ ╞══════════╪═════════╪══════╪══════════════╪════════╪══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡ │ var_reg2 ┆ ┆ 5243 ┆ 5243 ┆ -4.393 ┆ -4.639 ┆ 13.88 ┆ -24.66 ┆ -13.3 ┆ -2.131 ┆ 5.791 ┆ 11.32 ┆ -55.35 ┆ 26.21 │ │ var_reg2 ┆ 0 ┆ 3934 ┆ 3934 ┆ -4.275 ┆ -4.515 ┆ 13.73 ┆ -24.31 ┆ -13.0 ┆ -2.144 ┆ 5.784 ┆ 11.19 ┆ -55.35 ┆ 26.21 │ │ var_reg2 ┆ 1 ┆ 1309 ┆ 1309 ┆ -4.748 ┆ -5.012 ┆ 14.33 ┆ -26.2 ┆ -13.97 ┆ -2.089 ┆ 5.791 ┆ 11.67 ┆ -55.35 ┆ 24.89 │ └──────────┴─────────┴──────┴──────────────┴────────┴──────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘
Updating data according to narwhals expression: when_then(all_horizontal(col(var_reg1), ignore_nulls=False), col(var_reg2), lit(value=0, dtype=None)).alias(name=var_reg2)
[ 6.73066716e-02 -0.00000000e+00 0.00000000e+00 3.71367698e-01 -0.00000000e+00 -1.07062599e-03 -0.00000000e+00 -3.72161168e-04 -0.00000000e+00 -5.38546132e-03 -4.70160709e-07 0.00000000e+00 -5.77949554e-03 1.71858786e-03 1.74925402e-02] [ 2.61113153e+00 -4.23284371e-14 2.60300215e+00 0.00000000e+00 -0.00000000e+00 -4.62318054e+00 7.24762103e-02 4.17414062e-02 6.79245937e-04 -2.26481611e-02 -0.00000000e+00 1.85148780e-04 -9.63486912e+00 2.06696684e+00 -2.86280726e-01 4.41981803e+00]
Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/py_srmi_test_regression.srmi/2.srmi.implicate
Running variable selection: Method.LASSO
Selected model: ~0+var3+var4+var2+C(var5)+repeat_1+var_reg2+var4:var3+unrelated_5+unrelated_4+unrelated_3+unrelated_2+unrelated_1+var3:C(var5)+var4:var3:C(var5)
Imputation using Logit regression with PMM matching
C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.venv\Lib\site-packages\sklearn\utils\validation.py:1406: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)
C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.venv\Lib\site-packages\sklearn\linear_model\_logistic.py:473: ConvergenceWarning: lbfgs failed to converge after 100 iteration(s) (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT
Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
R2 = 0.5999
┌───────────────────────────┬──────────┐ │ Variable ┆ Beta │ ╞═══════════════════════════╪══════════╡ │ var3 ┆ 0.01428 │ │ var4 ┆ 0.5788 │ │ var2 ┆ 0.2365 │ │ C(var5)[False] ┆ 2.23 │ │ C(var5)[True] ┆ -2.367 │ │ repeat_1 ┆ -0.08821 │ │ var_reg2 ┆ -0.1217 │ │ unrelated_5 ┆ -0.4341 │ │ unrelated_4 ┆ -0.3981 │ │ unrelated_3 ┆ -0.07955 │ │ unrelated_2 ┆ -0.06151 │ │ unrelated_1 ┆ -0.08821 │ │ var4:var3 ┆ -0.1067 │ │ var3:C(var5)[T.True] ┆ -0.01085 │ │ var4:var3:C(var5)[T.True] ┆ 0.06463 │ │ _Intercept_ ┆ -0.1442 │ └───────────────────────────┴──────────┘
Finding 10 nearest neighbors on ['___prediction']
Randomly picking one and donating ['var_reg1']
Most common matches:
shape: (5, 2) ┌──────────────┬─────────┐ │ ___rownumber ┆ nDonors │ │ --- ┆ --- │ │ i16 ┆ i8 │ ╞══════════════╪═════════╡ │ 17 ┆ 4 │ │ 4095 ┆ 4 │ │ 5072 ┆ 4 │ │ 221 ┆ 3 │ │ 956 ┆ 3 │ └──────────────┴─────────┘
Post-imputation statistics for ['var_reg1']
Where: None
Where (impute): col(___imp_missing_var_reg1_1)
┌──────────┬─────────┬───────┬──────────────┬────────┬──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐ │ Variable ┆ Imputed ┆ n ┆ n (not null) ┆ mean ┆ mean (not 0) ┆ std (not 0) ┆ q10 (not 0) ┆ q25 (not 0) ┆ q50 (not 0) ┆ q75 (not 0) ┆ q90 (not 0) ┆ min (not 0) ┆ max (not 0) │ ╞══════════╪═════════╪═══════╪══════════════╪════════╪══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡ │ var_reg1 ┆ ┆ 12569 ┆ 12569 ┆ 0.5226 ┆ 1 ┆ 0 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ │ var_reg1 ┆ 0 ┆ 10000 ┆ 10000 ┆ 0.5243 ┆ 1 ┆ 0 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ │ var_reg1 ┆ 1 ┆ 2569 ┆ 2569 ┆ 0.5162 ┆ 1 ┆ 0 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ └──────────┴─────────┴───────┴──────────────┴────────┴──────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘
Running variable selection: Method.LASSO
Selected model: ~0+var3+var4+var2+C(var5)+var_reg1+var4:var3+unrelated_5+unrelated_4+unrelated_3+unrelated_2+var3:C(var5)+var4:C(var5)+var4:var3:C(var5)
Imputation using OLS regression with PMM matching
R2 = 0.8185
┌───────────────────────────┬──────────┐ │ Variable ┆ Beta │ ╞═══════════════════════════╪══════════╡ │ var3 ┆ 0.02064 │ │ var4 ┆ 1.082 │ │ var2 ┆ 1.33 │ │ C(var5)[False] ┆ 0.5624 │ │ C(var5)[True] ┆ -0.5624 │ │ unrelated_5 ┆ -0.4497 │ │ unrelated_4 ┆ -0.1819 │ │ unrelated_3 ┆ -0.1995 │ │ unrelated_2 ┆ 0.1406 │ │ var4:var3 ┆ -1.02 │ │ var3:C(var5)[T.True] ┆ -0.01145 │ │ var4:C(var5)[T.True] ┆ 4.584 │ │ var4:var3:C(var5)[T.True] ┆ 0.00607 │ │ _Intercept_ ┆ 0.01821 │ └───────────────────────────┴──────────┘
Finding 10 nearest neighbors on ['___prediction']
[-1.48978929e-01 8.56240169e-02 1.06448209e-02 8.71445031e-03 4.02266676e-01 -5.68046904e-17 -2.17848333e-03 6.38086984e-04 1.57094965e-04 -2.79957055e-03 -5.98274298e-03 -4.20249720e-06 -1.57639625e-01 -0.00000000e+00 -1.64056537e-02 1.14564804e-01] [ 2.88964501e+00 -8.84771027e-14 2.44263969e+00 6.39779260e-02 -5.01120737e-03 -4.96586606e+00 0.00000000e+00 -6.27382024e-02 1.09089409e-02 -1.12658865e-01 -6.71704330e-02 0.00000000e+00 -9.78013377e+00 1.61462238e+00 -2.20516178e-01 5.35854096e+00]
Randomly picking one and donating ['var_reg2']
Most common matches:
shape: (5, 2) ┌──────────────┬─────────┐ │ ___rownumber ┆ nDonors │ │ --- ┆ --- │ │ i16 ┆ i8 │ ╞══════════════╪═════════╡ │ 1160 ┆ 3 │ │ 2049 ┆ 3 │ │ 3438 ┆ 3 │ │ 4080 ┆ 3 │ │ 4243 ┆ 3 │ └──────────────┴─────────┘
Post-imputation statistics for ['var_reg2']
Where: col(var_reg1)
Where (impute): col(___imp_missing_var_reg2_2)
┌──────────┬─────────┬──────┬──────────────┬────────┬──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐ │ Variable ┆ Imputed ┆ n ┆ n (not null) ┆ mean ┆ mean (not 0) ┆ std (not 0) ┆ q10 (not 0) ┆ q25 (not 0) ┆ q50 (not 0) ┆ q75 (not 0) ┆ q90 (not 0) ┆ min (not 0) ┆ max (not 0) │ ╞══════════╪═════════╪══════╪══════════════╪════════╪══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡ │ var_reg2 ┆ ┆ 6516 ┆ 6516 ┆ -4.36 ┆ -4.734 ┆ 14.02 ┆ -25.03 ┆ -13.61 ┆ -2.154 ┆ 5.784 ┆ 11.37 ┆ -55.35 ┆ 26.21 │ │ var_reg2 ┆ 0 ┆ 5213 ┆ 5213 ┆ -4.351 ┆ -4.715 ┆ 13.94 ┆ -24.85 ┆ -13.48 ┆ -2.209 ┆ 5.767 ┆ 11.32 ┆ -55.35 ┆ 26.21 │ │ var_reg2 ┆ 1 ┆ 1303 ┆ 1303 ┆ -4.395 ┆ -4.809 ┆ 14.33 ┆ -25.35 ┆ -14.22 ┆ -2.07 ┆ 5.912 ┆ 11.75 ┆ -55.35 ┆ 24.89 │ └──────────┴─────────┴──────┴──────────────┴────────┴──────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘
Updating data according to narwhals expression: when_then(all_horizontal(col(var_reg1), ignore_nulls=False), col(var_reg2), lit(value=0, dtype=None)).alias(name=var_reg2)
var_reg1
var_reg2
Final Estimates by Iteration
Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/py_srmi_test_regression.srmi/2.srmi.implicate
In [7]:
logger.info("Get the results")
_ = df_list = srmi.df_implicates
Get the results
In [8]:
logger.info("\n\nLook at the original")
_ = summary(df_original)
logger.info("\n\nLook at the imputes")
_ = df_list.pipe(summary)
logger.info("\n\nLook at the imputes | var_reg1 == 0")
_ = df_list.filter(~nw.col("var_reg1")).pipe(summary)
logger.info("\n\nLook at the imputes | var_reg1 == 1")
_ = df_list.filter(nw.col("var_reg1")).pipe(summary)
Look at the original
┌──────────────┬────────┬─────────────┬────────────┬─────────────┬────────────┬───────────┐ │ Variable ┆ n ┆ n (missing) ┆ mean ┆ std ┆ min ┆ max │ ╞══════════════╪════════╪═════════════╪════════════╪═════════════╪════════════╪═══════════╡ │ _row_index_ ┆ 10,000 ┆ 0 ┆ 4,999.5 ┆ 2,886.89568 ┆ 0.0 ┆ 9,999.0 │ │ index ┆ 10,000 ┆ 0 ┆ 4,999.5 ┆ 2,886.89568 ┆ 0.0 ┆ 9,999.0 │ │ year ┆ 10,000 ┆ 0 ┆ 2,017.9851 ┆ 1.415937 ┆ 2,016.0 ┆ 2,020.0 │ │ month ┆ 10,000 ┆ 0 ┆ 6.5137 ┆ 3.432141 ┆ 1.0 ┆ 12.0 │ │ var2 ┆ 10,000 ┆ 0 ┆ 4.9782 ┆ 3.154508 ┆ 0.0 ┆ 10.0 │ │ var3 ┆ 10,000 ┆ 0 ┆ 25.1084 ┆ 14.752302 ┆ 0.0 ┆ 50.0 │ │ var4 ┆ 10,000 ┆ 0 ┆ 0.505666 ┆ 0.287861 ┆ 0.000027 ┆ 0.999997 │ │ var5 ┆ 10,000 ┆ 0 ┆ 0.4999 ┆ 0.500025 ┆ 0.0 ┆ 1.0 │ │ unrelated_1 ┆ 10,000 ┆ 0 ┆ 0.502449 ┆ 0.288359 ┆ 0.000119 ┆ 0.999997 │ │ unrelated_2 ┆ 10,000 ┆ 0 ┆ 0.500105 ┆ 0.287638 ┆ 0.000049 ┆ 0.999539 │ │ unrelated_3 ┆ 10,000 ┆ 0 ┆ 0.499175 ┆ 0.28876 ┆ 0.000129 ┆ 0.99994 │ │ unrelated_4 ┆ 10,000 ┆ 0 ┆ 0.500655 ┆ 0.288698 ┆ 0.000133 ┆ 0.999972 │ │ unrelated_5 ┆ 10,000 ┆ 0 ┆ 0.49876 ┆ 0.288979 ┆ 0.000071 ┆ 0.999867 │ │ missing_reg1 ┆ 10,000 ┆ 0 ┆ 0.495784 ┆ 0.289051 ┆ 0.000161 ┆ 0.99991 │ │ missing_reg2 ┆ 10,000 ┆ 0 ┆ 0.502597 ┆ 0.288468 ┆ 0.000006 ┆ 0.999963 │ │ var_reg1 ┆ 10,000 ┆ 0 ┆ 0.5229 ┆ 0.4995 ┆ 0.0 ┆ 1.0 │ │ var_reg2 ┆ 10,000 ┆ 0 ┆ -2.402477 ┆ 10.331745 ┆ -55.354108 ┆ 26.213084 │ └──────────────┴────────┴─────────────┴────────────┴─────────────┴────────────┴───────────┘
Look at the imputes
┌──────────────┬────────┬─────────────┬────────────┬─────────────┬────────────┬───────────┐ │ Variable ┆ n ┆ n (missing) ┆ mean ┆ std ┆ min ┆ max │ ╞══════════════╪════════╪═════════════╪════════════╪═════════════╪════════════╪═══════════╡ │ ___rownumber ┆ 10,000 ┆ 0 ┆ 4,999.5 ┆ 2,886.89568 ┆ 0.0 ┆ 9,999.0 │ │ _row_index_ ┆ 10,000 ┆ 0 ┆ 4,999.5 ┆ 2,886.89568 ┆ 0.0 ┆ 9,999.0 │ │ index ┆ 10,000 ┆ 0 ┆ 4,999.5 ┆ 2,886.89568 ┆ 0.0 ┆ 9,999.0 │ │ year ┆ 10,000 ┆ 0 ┆ 2,017.9851 ┆ 1.415937 ┆ 2,016.0 ┆ 2,020.0 │ │ month ┆ 10,000 ┆ 0 ┆ 6.5137 ┆ 3.432141 ┆ 1.0 ┆ 12.0 │ │ var2 ┆ 10,000 ┆ 0 ┆ 4.9782 ┆ 3.154508 ┆ 0.0 ┆ 10.0 │ │ var3 ┆ 10,000 ┆ 0 ┆ 25.1084 ┆ 14.752302 ┆ 0.0 ┆ 50.0 │ │ var4 ┆ 10,000 ┆ 0 ┆ 0.505666 ┆ 0.287861 ┆ 0.000027 ┆ 0.999997 │ │ var5 ┆ 10,000 ┆ 0 ┆ 0.4999 ┆ 0.500025 ┆ 0.0 ┆ 1.0 │ │ unrelated_1 ┆ 10,000 ┆ 0 ┆ 0.502449 ┆ 0.288359 ┆ 0.000119 ┆ 0.999997 │ │ unrelated_2 ┆ 10,000 ┆ 0 ┆ 0.500105 ┆ 0.287638 ┆ 0.000049 ┆ 0.999539 │ │ unrelated_3 ┆ 10,000 ┆ 0 ┆ 0.499175 ┆ 0.28876 ┆ 0.000129 ┆ 0.99994 │ │ unrelated_4 ┆ 10,000 ┆ 0 ┆ 0.500655 ┆ 0.288698 ┆ 0.000133 ┆ 0.999972 │ │ unrelated_5 ┆ 10,000 ┆ 0 ┆ 0.49876 ┆ 0.288979 ┆ 0.000071 ┆ 0.999867 │ │ repeat_1 ┆ 10,000 ┆ 0 ┆ 0.502449 ┆ 0.288359 ┆ 0.000119 ┆ 0.999997 │ │ var_reg1 ┆ 10,000 ┆ 0 ┆ 0.5216 ┆ 0.499558 ┆ 0.0 ┆ 1.0 │ │ var_reg2 ┆ 10,000 ┆ 0 ┆ -2.180802 ┆ 9.844918 ┆ -55.354108 ┆ 26.213084 │ └──────────────┴────────┴─────────────┴────────────┴─────────────┴────────────┴───────────┘
┌──────────────┬────────┬─────────────┬────────────┬─────────────┬────────────┬───────────┐ │ Variable ┆ n ┆ n (missing) ┆ mean ┆ std ┆ min ┆ max │ ╞══════════════╪════════╪═════════════╪════════════╪═════════════╪════════════╪═══════════╡ │ ___rownumber ┆ 10,000 ┆ 0 ┆ 4,999.5 ┆ 2,886.89568 ┆ 0.0 ┆ 9,999.0 │ │ _row_index_ ┆ 10,000 ┆ 0 ┆ 4,999.5 ┆ 2,886.89568 ┆ 0.0 ┆ 9,999.0 │ │ index ┆ 10,000 ┆ 0 ┆ 4,999.5 ┆ 2,886.89568 ┆ 0.0 ┆ 9,999.0 │ │ year ┆ 10,000 ┆ 0 ┆ 2,017.9851 ┆ 1.415937 ┆ 2,016.0 ┆ 2,020.0 │ │ month ┆ 10,000 ┆ 0 ┆ 6.5137 ┆ 3.432141 ┆ 1.0 ┆ 12.0 │ │ var2 ┆ 10,000 ┆ 0 ┆ 4.9782 ┆ 3.154508 ┆ 0.0 ┆ 10.0 │ │ var3 ┆ 10,000 ┆ 0 ┆ 25.1084 ┆ 14.752302 ┆ 0.0 ┆ 50.0 │ │ var4 ┆ 10,000 ┆ 0 ┆ 0.505666 ┆ 0.287861 ┆ 0.000027 ┆ 0.999997 │ │ var5 ┆ 10,000 ┆ 0 ┆ 0.4999 ┆ 0.500025 ┆ 0.0 ┆ 1.0 │ │ unrelated_1 ┆ 10,000 ┆ 0 ┆ 0.502449 ┆ 0.288359 ┆ 0.000119 ┆ 0.999997 │ │ unrelated_2 ┆ 10,000 ┆ 0 ┆ 0.500105 ┆ 0.287638 ┆ 0.000049 ┆ 0.999539 │ │ unrelated_3 ┆ 10,000 ┆ 0 ┆ 0.499175 ┆ 0.28876 ┆ 0.000129 ┆ 0.99994 │ │ unrelated_4 ┆ 10,000 ┆ 0 ┆ 0.500655 ┆ 0.288698 ┆ 0.000133 ┆ 0.999972 │ │ unrelated_5 ┆ 10,000 ┆ 0 ┆ 0.49876 ┆ 0.288979 ┆ 0.000071 ┆ 0.999867 │ │ repeat_1 ┆ 10,000 ┆ 0 ┆ 0.502449 ┆ 0.288359 ┆ 0.000119 ┆ 0.999997 │ │ var_reg1 ┆ 10,000 ┆ 0 ┆ 0.5213 ┆ 0.499571 ┆ 0.0 ┆ 1.0 │ │ var_reg2 ┆ 10,000 ┆ 0 ┆ -2.23131 ┆ 9.939851 ┆ -55.354108 ┆ 26.213084 │ └──────────────┴────────┴─────────────┴────────────┴─────────────┴────────────┴───────────┘
Look at the imputes | var_reg1 == 0
┌──────────────┬───────┬─────────────┬──────────────┬──────────────┬──────────┬──────────┐ │ Variable ┆ n ┆ n (missing) ┆ mean ┆ std ┆ min ┆ max │ ╞══════════════╪═══════╪═════════════╪══════════════╪══════════════╪══════════╪══════════╡ │ ___rownumber ┆ 4,784 ┆ 0 ┆ 5,033.632734 ┆ 2,891.610847 ┆ 1.0 ┆ 9,997.0 │ │ _row_index_ ┆ 4,784 ┆ 0 ┆ 5,033.632734 ┆ 2,891.610847 ┆ 1.0 ┆ 9,997.0 │ │ index ┆ 4,784 ┆ 0 ┆ 5,033.632734 ┆ 2,891.610847 ┆ 1.0 ┆ 9,997.0 │ │ year ┆ 4,784 ┆ 0 ┆ 2,017.99561 ┆ 1.420329 ┆ 2,016.0 ┆ 2,020.0 │ │ month ┆ 4,784 ┆ 0 ┆ 6.572742 ┆ 3.407715 ┆ 1.0 ┆ 12.0 │ │ var2 ┆ 4,784 ┆ 0 ┆ 4.628135 ┆ 3.233716 ┆ 0.0 ┆ 10.0 │ │ var3 ┆ 4,784 ┆ 0 ┆ 24.966137 ┆ 13.395857 ┆ 0.0 ┆ 50.0 │ │ var4 ┆ 4,784 ┆ 0 ┆ 0.507489 ┆ 0.289287 ┆ 0.000027 ┆ 0.999997 │ │ var5 ┆ 4,784 ┆ 0 ┆ 0.886288 ┆ 0.317495 ┆ 0.0 ┆ 1.0 │ │ unrelated_1 ┆ 4,784 ┆ 0 ┆ 0.50384 ┆ 0.286077 ┆ 0.000119 ┆ 0.999921 │ │ unrelated_2 ┆ 4,784 ┆ 0 ┆ 0.499727 ┆ 0.285433 ┆ 0.000079 ┆ 0.999528 │ │ unrelated_3 ┆ 4,784 ┆ 0 ┆ 0.501752 ┆ 0.289139 ┆ 0.000139 ┆ 0.99994 │ │ unrelated_4 ┆ 4,784 ┆ 0 ┆ 0.50239 ┆ 0.289704 ┆ 0.000141 ┆ 0.999961 │ │ unrelated_5 ┆ 4,784 ┆ 0 ┆ 0.501731 ┆ 0.290909 ┆ 0.000071 ┆ 0.99984 │ │ repeat_1 ┆ 4,784 ┆ 0 ┆ 0.50384 ┆ 0.286077 ┆ 0.000119 ┆ 0.999921 │ │ var_reg1 ┆ 4,784 ┆ 0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 │ │ var_reg2 ┆ 4,784 ┆ 0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 │ └──────────────┴───────┴─────────────┴──────────────┴──────────────┴──────────┴──────────┘
┌──────────────┬───────┬─────────────┬──────────────┬──────────────┬──────────┬──────────┐ │ Variable ┆ n ┆ n (missing) ┆ mean ┆ std ┆ min ┆ max │ ╞══════════════╪═══════╪═════════════╪══════════════╪══════════════╪══════════╪══════════╡ │ ___rownumber ┆ 4,787 ┆ 0 ┆ 5,029.025695 ┆ 2,905.488834 ┆ 1.0 ┆ 9,997.0 │ │ _row_index_ ┆ 4,787 ┆ 0 ┆ 5,029.025695 ┆ 2,905.488834 ┆ 1.0 ┆ 9,997.0 │ │ index ┆ 4,787 ┆ 0 ┆ 5,029.025695 ┆ 2,905.488834 ┆ 1.0 ┆ 9,997.0 │ │ year ┆ 4,787 ┆ 0 ┆ 2,017.99248 ┆ 1.421562 ┆ 2,016.0 ┆ 2,020.0 │ │ month ┆ 4,787 ┆ 0 ┆ 6.585126 ┆ 3.414898 ┆ 1.0 ┆ 12.0 │ │ var2 ┆ 4,787 ┆ 0 ┆ 4.609359 ┆ 3.225133 ┆ 0.0 ┆ 10.0 │ │ var3 ┆ 4,787 ┆ 0 ┆ 24.976603 ┆ 13.404733 ┆ 0.0 ┆ 50.0 │ │ var4 ┆ 4,787 ┆ 0 ┆ 0.508052 ┆ 0.288739 ┆ 0.000027 ┆ 0.999997 │ │ var5 ┆ 4,787 ┆ 0 ┆ 0.885941 ┆ 0.317916 ┆ 0.0 ┆ 1.0 │ │ unrelated_1 ┆ 4,787 ┆ 0 ┆ 0.503982 ┆ 0.285823 ┆ 0.000119 ┆ 0.999921 │ │ unrelated_2 ┆ 4,787 ┆ 0 ┆ 0.499459 ┆ 0.285463 ┆ 0.000079 ┆ 0.999528 │ │ unrelated_3 ┆ 4,787 ┆ 0 ┆ 0.501814 ┆ 0.289014 ┆ 0.000139 ┆ 0.99994 │ │ unrelated_4 ┆ 4,787 ┆ 0 ┆ 0.500316 ┆ 0.28914 ┆ 0.000141 ┆ 0.999961 │ │ unrelated_5 ┆ 4,787 ┆ 0 ┆ 0.501014 ┆ 0.29115 ┆ 0.000071 ┆ 0.99984 │ │ repeat_1 ┆ 4,787 ┆ 0 ┆ 0.503982 ┆ 0.285823 ┆ 0.000119 ┆ 0.999921 │ │ var_reg1 ┆ 4,787 ┆ 0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 │ │ var_reg2 ┆ 4,787 ┆ 0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 │ └──────────────┴───────┴─────────────┴──────────────┴──────────────┴──────────┴──────────┘
Look at the imputes | var_reg1 == 1
┌──────────────┬───────┬─────────────┬─────────────┬──────────────┬────────────┬───────────┐ │ Variable ┆ n ┆ n (missing) ┆ mean ┆ std ┆ min ┆ max │ ╞══════════════╪═══════╪═════════════╪═════════════╪══════════════╪════════════╪═══════════╡ │ ___rownumber ┆ 5,216 ┆ 0 ┆ 4,968.19421 ┆ 2,882.486117 ┆ 0.0 ┆ 9,999.0 │ │ _row_index_ ┆ 5,216 ┆ 0 ┆ 4,968.19421 ┆ 2,882.486117 ┆ 0.0 ┆ 9,999.0 │ │ index ┆ 5,216 ┆ 0 ┆ 4,968.19421 ┆ 2,882.486117 ┆ 0.0 ┆ 9,999.0 │ │ year ┆ 5,216 ┆ 0 ┆ 2,017.97546 ┆ 1.411965 ┆ 2,016.0 ┆ 2,020.0 │ │ month ┆ 5,216 ┆ 0 ┆ 6.459548 ┆ 3.453831 ┆ 1.0 ┆ 12.0 │ │ var2 ┆ 5,216 ┆ 0 ┆ 5.299271 ┆ 3.045196 ┆ 0.0 ┆ 10.0 │ │ var3 ┆ 5,216 ┆ 0 ┆ 25.23888 ┆ 15.895128 ┆ 0.0 ┆ 50.0 │ │ var4 ┆ 5,216 ┆ 0 ┆ 0.503993 ┆ 0.286564 ┆ 0.000104 ┆ 0.999885 │ │ var5 ┆ 5,216 ┆ 0 ┆ 0.145514 ┆ 0.352652 ┆ 0.0 ┆ 1.0 │ │ unrelated_1 ┆ 5,216 ┆ 0 ┆ 0.501172 ┆ 0.290459 ┆ 0.000248 ┆ 0.999997 │ │ unrelated_2 ┆ 5,216 ┆ 0 ┆ 0.500452 ┆ 0.289673 ┆ 0.000049 ┆ 0.999539 │ │ unrelated_3 ┆ 5,216 ┆ 0 ┆ 0.496812 ┆ 0.288419 ┆ 0.000129 ┆ 0.999622 │ │ unrelated_4 ┆ 5,216 ┆ 0 ┆ 0.499064 ┆ 0.287791 ┆ 0.000133 ┆ 0.999972 │ │ unrelated_5 ┆ 5,216 ┆ 0 ┆ 0.496036 ┆ 0.287198 ┆ 0.000181 ┆ 0.999867 │ │ repeat_1 ┆ 5,216 ┆ 0 ┆ 0.501172 ┆ 0.290459 ┆ 0.000248 ┆ 0.999997 │ │ var_reg1 ┆ 5,216 ┆ 0 ┆ 1.0 ┆ 0.0 ┆ 1.0 ┆ 1.0 │ │ var_reg2 ┆ 5,216 ┆ 0 ┆ -4.180985 ┆ 13.321794 ┆ -55.354108 ┆ 26.213084 │ └──────────────┴───────┴─────────────┴─────────────┴──────────────┴────────────┴───────────┘
┌──────────────┬───────┬─────────────┬──────────────┬──────────────┬────────────┬───────────┐ │ Variable ┆ n ┆ n (missing) ┆ mean ┆ std ┆ min ┆ max │ ╞══════════════╪═══════╪═════════════╪══════════════╪══════════════╪════════════╪═══════════╡ │ ___rownumber ┆ 5,213 ┆ 0 ┆ 4,972.387109 ┆ 2,869.727124 ┆ 0.0 ┆ 9,999.0 │ │ _row_index_ ┆ 5,213 ┆ 0 ┆ 4,972.387109 ┆ 2,869.727124 ┆ 0.0 ┆ 9,999.0 │ │ index ┆ 5,213 ┆ 0 ┆ 4,972.387109 ┆ 2,869.727124 ┆ 0.0 ┆ 9,999.0 │ │ year ┆ 5,213 ┆ 0 ┆ 2,017.978323 ┆ 1.410855 ┆ 2,016.0 ┆ 2,020.0 │ │ month ┆ 5,213 ┆ 0 ┆ 6.44811 ┆ 3.446923 ┆ 1.0 ┆ 12.0 │ │ var2 ┆ 5,213 ┆ 0 ┆ 5.3169 ┆ 3.049492 ┆ 0.0 ┆ 10.0 │ │ var3 ┆ 5,213 ┆ 0 ┆ 25.229426 ┆ 15.889739 ┆ 0.0 ┆ 50.0 │ │ var4 ┆ 5,213 ┆ 0 ┆ 0.503474 ┆ 0.287062 ┆ 0.000104 ┆ 0.999885 │ │ var5 ┆ 5,213 ┆ 0 ┆ 0.145406 ┆ 0.352543 ┆ 0.0 ┆ 1.0 │ │ unrelated_1 ┆ 5,213 ┆ 0 ┆ 0.50104 ┆ 0.290689 ┆ 0.000248 ┆ 0.999997 │ │ unrelated_2 ┆ 5,213 ┆ 0 ┆ 0.500698 ┆ 0.289648 ┆ 0.000049 ┆ 0.999539 │ │ unrelated_3 ┆ 5,213 ┆ 0 ┆ 0.496752 ┆ 0.288533 ┆ 0.000129 ┆ 0.999622 │ │ unrelated_4 ┆ 5,213 ┆ 0 ┆ 0.500967 ┆ 0.288319 ┆ 0.000133 ┆ 0.999972 │ │ unrelated_5 ┆ 5,213 ┆ 0 ┆ 0.496691 ┆ 0.286984 ┆ 0.000181 ┆ 0.999867 │ │ repeat_1 ┆ 5,213 ┆ 0 ┆ 0.50104 ┆ 0.290689 ┆ 0.000248 ┆ 0.999997 │ │ var_reg1 ┆ 5,213 ┆ 0 ┆ 1.0 ┆ 0.0 ┆ 1.0 ┆ 1.0 │ │ var_reg2 ┆ 5,213 ┆ 0 ┆ -4.28028 ┆ 13.445184 ┆ -55.354108 ┆ 26.213084 │ └──────────────┴───────┴─────────────┴──────────────┴──────────────┴────────────┴───────────┘