In [1]:
import sys
import os
from pathlib import Path
import narwhals as nw
import polars as pl
import polars.selectors as cs
from survey_kit.utilities.random import RandomData
from survey_kit.utilities.dataframe import summary
from survey_kit.imputation.variable import Variable
from survey_kit.imputation.parameters import Parameters
from survey_kit.imputation.srmi import SRMI
from survey_kit.imputation.selection import Selection
import survey_kit.imputation.utilities.lightgbm_wrapper as rep_lgbm
from survey_kit.imputation.utilities.lightgbm_wrapper import Tuner_optuna
from survey_kit import logger, config
from survey_kit.utilities.dataframe import summary, columns_from_list
In [2]:
# Draw some random data
n_rows = 10_000
impute_share = 0.25
df = (
RandomData(n_rows=n_rows, seed=32565437)
.index("index")
.integer("year", 2016, 2020)
.integer("month", 1, 12)
.integer("var2", 0, 10)
.integer("var3", 0, 50)
.float("var4", 0, 1)
.integer("var5", 0, 1)
.float("unrelated_1", 0, 1)
.float("unrelated_2", 0, 1)
.float("unrelated_3", 0, 1)
.float("unrelated_4", 0, 1)
.float("unrelated_5", 0, 1)
.np_distribution("epsilon_gbm1", "normal", scale=5)
.np_distribution("epsilon_gbm2", "normal", scale=5)
.np_distribution("epsilon_gbm3", "normal", scale=5)
.float("missing_gbm1", 0, 1)
.float("missing_gbm2", 0, 1)
.float("missing_gbm3", 0, 1)
.to_df()
)
# Convenience references to them for creating dependent variables
c_var2 = pl.col("var2")
c_var3 = pl.col("var3")
c_var4 = pl.col("var4")
c_var5 = pl.col("var5")
c_e_gbm1 = pl.col("epsilon_gbm1")
c_e_gbm2 = pl.col("epsilon_gbm2")
# Convenience references to them for creating dependent variables
c_var2 = pl.col("var2")
c_var3 = pl.col("var3")
c_var4 = pl.col("var4")
c_var5 = pl.col("var5")
logger.info("var_gbm1 is binary and conditional on other variables")
c_gbm1 = ((c_var2 * 2 - c_var3 * 3 * c_var5 + c_e_gbm1) > 0).alias("var_gbm1")
logger.info("var_gbm2 is != 0 only if var_gbm1 == True")
c_gbm2 = (
pl.when(pl.col("var_gbm1"))
.then(((c_var2 * 1.5 - c_var3 * 1 * c_var4 + c_e_gbm2)))
.otherwise(pl.lit(0))
.alias("var_gbm2")
)
c_gbm3 = (
pl.when(pl.col("var_gbm1"))
.then(((c_var2 * 1.5 - c_var3 * 1 * c_var4 + c_e_gbm2)))
.otherwise(pl.lit(0))
.alias("var_gbm3")
)
# Create a bunch of variables that are functions of the variables created above
df = (
df.with_columns(c_gbm1)
.with_columns(c_gbm2, c_gbm3)
.drop(columns_from_list(df=df, columns="epsilon*"))
.with_row_index(name="_row_index_")
)
df_original = df
# Set variables to missing according to the uniform random variables missing_
clear_missing = []
for prefixi in ["gbm"]:
for i in range(1, 4):
vari = f"var_{prefixi}{i}"
missingi = f"missing_{prefixi}{i}"
clear_missing.append(
pl.when(pl.col(missingi) < impute_share)
.then(pl.lit(None))
.otherwise(pl.col(vari))
.alias(vari)
)
df = df.with_columns(clear_missing).drop(cs.starts_with("missing_"))
# Make a fully collinear var for testing
df = df.with_columns(pl.col("unrelated_1").alias("repeat_1"))
summary(df)
var_gbm1 is binary and conditional on other variables
var_gbm2 is != 0 only if var_gbm1 == True
┌─────────────┬────────┬─────────────┬────────────┬─────────────┬────────────┬───────────┐ │ Variable ┆ n ┆ n (missing) ┆ mean ┆ std ┆ min ┆ max │ ╞═════════════╪════════╪═════════════╪════════════╪═════════════╪════════════╪═══════════╡ │ _row_index_ ┆ 10,000 ┆ 0 ┆ 4,999.5 ┆ 2,886.89568 ┆ 0.0 ┆ 9,999.0 │ │ index ┆ 10,000 ┆ 0 ┆ 4,999.5 ┆ 2,886.89568 ┆ 0.0 ┆ 9,999.0 │ │ year ┆ 10,000 ┆ 0 ┆ 2,017.9851 ┆ 1.415937 ┆ 2,016.0 ┆ 2,020.0 │ │ month ┆ 10,000 ┆ 0 ┆ 6.5137 ┆ 3.432141 ┆ 1.0 ┆ 12.0 │ │ var2 ┆ 10,000 ┆ 0 ┆ 4.9782 ┆ 3.154508 ┆ 0.0 ┆ 10.0 │ │ var3 ┆ 10,000 ┆ 0 ┆ 25.1084 ┆ 14.752302 ┆ 0.0 ┆ 50.0 │ │ var4 ┆ 10,000 ┆ 0 ┆ 0.505666 ┆ 0.287861 ┆ 0.000027 ┆ 0.999997 │ │ var5 ┆ 10,000 ┆ 0 ┆ 0.4999 ┆ 0.500025 ┆ 0.0 ┆ 1.0 │ │ unrelated_1 ┆ 10,000 ┆ 0 ┆ 0.502449 ┆ 0.288359 ┆ 0.000119 ┆ 0.999997 │ │ unrelated_2 ┆ 10,000 ┆ 0 ┆ 0.500105 ┆ 0.287638 ┆ 0.000049 ┆ 0.999539 │ │ unrelated_3 ┆ 10,000 ┆ 0 ┆ 0.499175 ┆ 0.28876 ┆ 0.000129 ┆ 0.99994 │ │ unrelated_4 ┆ 10,000 ┆ 0 ┆ 0.500655 ┆ 0.288698 ┆ 0.000133 ┆ 0.999972 │ │ unrelated_5 ┆ 10,000 ┆ 0 ┆ 0.49876 ┆ 0.288979 ┆ 0.000071 ┆ 0.999867 │ │ var_gbm1 ┆ 10,000 ┆ 2,483 ┆ 0.526008 ┆ 0.499356 ┆ 0.0 ┆ 1.0 │ │ var_gbm2 ┆ 10,000 ┆ 2,464 ┆ -2.393041 ┆ 10.384672 ┆ -55.354108 ┆ 26.213084 │ │ var_gbm3 ┆ 10,000 ┆ 2,596 ┆ -2.500425 ┆ 10.365353 ┆ -55.354108 ┆ 26.213084 │ │ repeat_1 ┆ 10,000 ┆ 0 ┆ 0.502449 ┆ 0.288359 ┆ 0.000119 ┆ 0.999997 │ └─────────────┴────────┴─────────────┴────────────┴─────────────┴────────────┴───────────┘
Out[2]:
naive plan: (run LazyFrame.explain(optimized=True) to see the optimized plan)
SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] UNION PLAN 0: WITH_COLUMNS: [col("n (missing)").cast(Int16), col("min").strict_cast(Float64), col("max").strict_cast(Float64)] SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] WITH_COLUMNS: ["_row_index_".alias("Variable")] SELECT [col("min"), col("n (missing)"), col("max"), col("std"), col("n"), col("mean")] SELECT [col("___index___"), col("_row_index__min").alias("min"), col("_row_index__rawn_missing").alias("n (missing)"), col("_row_index__max").alias("max"), col("_row_index__std").alias("std"), col("_row_index__rawn").alias("n"), col("_row_index__mean").alias("mean")] SELECT [col("___index___"), col("_row_index__min"), col("_row_index__rawn_missing"), col("_row_index__max"), col("_row_index__std"), col("_row_index__rawn"), col("_row_index__mean")] DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */103 COLUMNS PLAN 1: WITH_COLUMNS: [col("n (missing)").cast(Int16), col("min").strict_cast(Float64), col("max").strict_cast(Float64)] SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] WITH_COLUMNS: ["index".alias("Variable")] SELECT [col("min"), col("n (missing)"), col("max"), col("std"), col("n"), col("mean")] SELECT [col("___index___"), col("index_min").alias("min"), col("index_rawn_missing").alias("n (missing)"), col("index_max").alias("max"), col("index_std").alias("std"), col("index_rawn").alias("n"), col("index_mean").alias("mean")] SELECT [col("___index___"), col("index_min"), col("index_rawn_missing"), col("index_max"), col("index_std"), col("index_rawn"), col("index_mean")] DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */103 COLUMNS PLAN 2: WITH_COLUMNS: [col("n (missing)").cast(Int16), col("min").strict_cast(Float64), col("max").strict_cast(Float64)] SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] WITH_COLUMNS: ["year".alias("Variable")] SELECT [col("min"), col("n (missing)"), col("max"), col("std"), col("n"), col("mean")] SELECT [col("___index___"), col("year_min").alias("min"), col("year_rawn_missing").alias("n (missing)"), col("year_max").alias("max"), col("year_std").alias("std"), col("year_rawn").alias("n"), col("year_mean").alias("mean")] SELECT [col("___index___"), col("year_min"), col("year_rawn_missing"), col("year_max"), col("year_std"), col("year_rawn"), col("year_mean")] DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */103 COLUMNS PLAN 3: WITH_COLUMNS: [col("n (missing)").cast(Int16), col("min").strict_cast(Float64), col("max").strict_cast(Float64)] SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] WITH_COLUMNS: ["month".alias("Variable")] SELECT [col("min"), col("n (missing)"), col("max"), col("std"), col("n"), col("mean")] SELECT [col("___index___"), col("month_min").alias("min"), col("month_rawn_missing").alias("n (missing)"), col("month_max").alias("max"), col("month_std").alias("std"), col("month_rawn").alias("n"), col("month_mean").alias("mean")] SELECT [col("___index___"), col("month_min"), col("month_rawn_missing"), col("month_max"), col("month_std"), col("month_rawn"), col("month_mean")] DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */103 COLUMNS PLAN 4: WITH_COLUMNS: [col("n (missing)").cast(Int16), col("min").strict_cast(Float64), col("max").strict_cast(Float64)] SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] WITH_COLUMNS: ["var2".alias("Variable")] SELECT [col("min"), col("n (missing)"), col("max"), col("std"), col("n"), col("mean")] SELECT [col("___index___"), col("var2_min").alias("min"), col("var2_rawn_missing").alias("n (missing)"), col("var2_max").alias("max"), col("var2_std").alias("std"), col("var2_rawn").alias("n"), col("var2_mean").alias("mean")] SELECT [col("___index___"), col("var2_min"), col("var2_rawn_missing"), col("var2_max"), col("var2_std"), col("var2_rawn"), col("var2_mean")] DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */103 COLUMNS PLAN 5: WITH_COLUMNS: [col("n (missing)").cast(Int16), col("min").strict_cast(Float64), col("max").strict_cast(Float64)] SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] WITH_COLUMNS: ["var3".alias("Variable")] SELECT [col("min"), col("n (missing)"), col("max"), col("std"), col("n"), col("mean")] SELECT [col("___index___"), col("var3_min").alias("min"), col("var3_rawn_missing").alias("n (missing)"), col("var3_max").alias("max"), col("var3_std").alias("std"), col("var3_rawn").alias("n"), col("var3_mean").alias("mean")] SELECT [col("___index___"), col("var3_min"), col("var3_rawn_missing"), col("var3_max"), col("var3_std"), col("var3_rawn"), col("var3_mean")] DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */103 COLUMNS PLAN 6: WITH_COLUMNS: [col("n (missing)").cast(Int16)] SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] WITH_COLUMNS: ["var4".alias("Variable")] SELECT [col("min"), col("n (missing)"), col("max"), col("std"), col("n"), col("mean")] SELECT [col("___index___"), col("var4_min").alias("min"), col("var4_rawn_missing").alias("n (missing)"), col("var4_max").alias("max"), col("var4_std").alias("std"), col("var4_rawn").alias("n"), col("var4_mean").alias("mean")] SELECT [col("___index___"), col("var4_min"), col("var4_rawn_missing"), col("var4_max"), col("var4_std"), col("var4_rawn"), col("var4_mean")] DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */103 COLUMNS PLAN 7: WITH_COLUMNS: [col("n (missing)").cast(Int16), col("min").strict_cast(Float64), col("max").strict_cast(Float64)] SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] WITH_COLUMNS: ["var5".alias("Variable")] SELECT [col("min"), col("n (missing)"), col("max"), col("std"), col("n"), col("mean")] SELECT [col("___index___"), col("var5_min").alias("min"), col("var5_rawn_missing").alias("n (missing)"), col("var5_max").alias("max"), col("var5_std").alias("std"), col("var5_rawn").alias("n"), col("var5_mean").alias("mean")] SELECT [col("___index___"), col("var5_min"), col("var5_rawn_missing"), col("var5_max"), col("var5_std"), col("var5_rawn"), col("var5_mean")] DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */103 COLUMNS PLAN 8: WITH_COLUMNS: [col("n (missing)").cast(Int16)] SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] WITH_COLUMNS: ["unrelated_1".alias("Variable")] SELECT [col("min"), col("n (missing)"), col("max"), col("std"), col("n"), col("mean")] SELECT [col("___index___"), col("unrelated_1_min").alias("min"), col("unrelated_1_rawn_missing").alias("n (missing)"), col("unrelated_1_max").alias("max"), col("unrelated_1_std").alias("std"), col("unrelated_1_rawn").alias("n"), col("unrelated_1_mean").alias("mean")] SELECT [col("___index___"), col("unrelated_1_min"), col("unrelated_1_rawn_missing"), col("unrelated_1_max"), col("unrelated_1_std"), col("unrelated_1_rawn"), col("unrelated_1_mean")] DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */103 COLUMNS PLAN 9: WITH_COLUMNS: [col("n (missing)").cast(Int16)] SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] WITH_COLUMNS: ["unrelated_2".alias("Variable")] SELECT [col("min"), col("n (missing)"), col("max"), col("std"), col("n"), col("mean")] SELECT [col("___index___"), col("unrelated_2_min").alias("min"), col("unrelated_2_rawn_missing").alias("n (missing)"), col("unrelated_2_max").alias("max"), col("unrelated_2_std").alias("std"), col("unrelated_2_rawn").alias("n"), col("unrelated_2_mean").alias("mean")] SELECT [col("___index___"), col("unrelated_2_min"), col("unrelated_2_rawn_missing"), col("unrelated_2_max"), col("unrelated_2_std"), col("unrelated_2_rawn"), col("unrelated_2_mean")] DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */103 COLUMNS PLAN 10: WITH_COLUMNS: [col("n (missing)").cast(Int16)] SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] WITH_COLUMNS: ["unrelated_3".alias("Variable")] SELECT [col("min"), col("n (missing)"), col("max"), col("std"), col("n"), col("mean")] SELECT [col("___index___"), col("unrelated_3_min").alias("min"), col("unrelated_3_rawn_missing").alias("n (missing)"), col("unrelated_3_max").alias("max"), col("unrelated_3_std").alias("std"), col("unrelated_3_rawn").alias("n"), col("unrelated_3_mean").alias("mean")] SELECT [col("___index___"), col("unrelated_3_min"), col("unrelated_3_rawn_missing"), col("unrelated_3_max"), col("unrelated_3_std"), col("unrelated_3_rawn"), col("unrelated_3_mean")] DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */103 COLUMNS PLAN 11: WITH_COLUMNS: [col("n (missing)").cast(Int16)] SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] WITH_COLUMNS: ["unrelated_4".alias("Variable")] SELECT [col("min"), col("n (missing)"), col("max"), col("std"), col("n"), col("mean")] SELECT [col("___index___"), col("unrelated_4_min").alias("min"), col("unrelated_4_rawn_missing").alias("n (missing)"), col("unrelated_4_max").alias("max"), col("unrelated_4_std").alias("std"), col("unrelated_4_rawn").alias("n"), col("unrelated_4_mean").alias("mean")] SELECT [col("___index___"), col("unrelated_4_min"), col("unrelated_4_rawn_missing"), col("unrelated_4_max"), col("unrelated_4_std"), col("unrelated_4_rawn"), col("unrelated_4_mean")] DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */103 COLUMNS PLAN 12: WITH_COLUMNS: [col("n (missing)").cast(Int16)] SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] WITH_COLUMNS: ["unrelated_5".alias("Variable")] SELECT [col("min"), col("n (missing)"), col("max"), col("std"), col("n"), col("mean")] SELECT [col("___index___"), col("unrelated_5_min").alias("min"), col("unrelated_5_rawn_missing").alias("n (missing)"), col("unrelated_5_max").alias("max"), col("unrelated_5_std").alias("std"), col("unrelated_5_rawn").alias("n"), col("unrelated_5_mean").alias("mean")] SELECT [col("___index___"), col("unrelated_5_min"), col("unrelated_5_rawn_missing"), col("unrelated_5_max"), col("unrelated_5_std"), col("unrelated_5_rawn"), col("unrelated_5_mean")] DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */103 COLUMNS PLAN 13: WITH_COLUMNS: [col("min").strict_cast(Float64), col("max").strict_cast(Float64)] SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] WITH_COLUMNS: ["var_gbm1".alias("Variable")] SELECT [col("min"), col("n (missing)"), col("max"), col("std"), col("n"), col("mean")] SELECT [col("___index___"), col("var_gbm1_min").alias("min"), col("var_gbm1_rawn_missing").alias("n (missing)"), col("var_gbm1_max").alias("max"), col("var_gbm1_std").alias("std"), col("var_gbm1_rawn").alias("n"), col("var_gbm1_mean").alias("mean")] SELECT [col("___index___"), col("var_gbm1_min"), col("var_gbm1_rawn_missing"), col("var_gbm1_max"), col("var_gbm1_std"), col("var_gbm1_rawn"), col("var_gbm1_mean")] DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */103 COLUMNS PLAN 14: SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] WITH_COLUMNS: ["var_gbm2".alias("Variable")] SELECT [col("min"), col("n (missing)"), col("max"), col("std"), col("n"), col("mean")] SELECT [col("___index___"), col("var_gbm2_min").alias("min"), col("var_gbm2_rawn_missing").alias("n (missing)"), col("var_gbm2_max").alias("max"), col("var_gbm2_std").alias("std"), col("var_gbm2_rawn").alias("n"), col("var_gbm2_mean").alias("mean")] SELECT [col("___index___"), col("var_gbm2_min"), col("var_gbm2_rawn_missing"), col("var_gbm2_max"), col("var_gbm2_std"), col("var_gbm2_rawn"), col("var_gbm2_mean")] DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */103 COLUMNS PLAN 15: SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] WITH_COLUMNS: ["var_gbm3".alias("Variable")] SELECT [col("min"), col("n (missing)"), col("max"), col("std"), col("n"), col("mean")] SELECT [col("___index___"), col("var_gbm3_min").alias("min"), col("var_gbm3_rawn_missing").alias("n (missing)"), col("var_gbm3_max").alias("max"), col("var_gbm3_std").alias("std"), col("var_gbm3_rawn").alias("n"), col("var_gbm3_mean").alias("mean")] SELECT [col("___index___"), col("var_gbm3_min"), col("var_gbm3_rawn_missing"), col("var_gbm3_max"), col("var_gbm3_std"), col("var_gbm3_rawn"), col("var_gbm3_mean")] DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */103 COLUMNS PLAN 16: WITH_COLUMNS: [col("n (missing)").cast(Int16)] SELECT [col("Variable"), col("n"), col("n (missing)"), col("mean"), col("std"), col("min"), col("max")] WITH_COLUMNS: ["repeat_1".alias("Variable")] SELECT [col("min"), col("n (missing)"), col("max"), col("std"), col("n"), col("mean")] SELECT [col("___index___"), col("repeat_1_min").alias("min"), col("repeat_1_rawn_missing").alias("n (missing)"), col("repeat_1_max").alias("max"), col("repeat_1_std").alias("std"), col("repeat_1_rawn").alias("n"), col("repeat_1_mean").alias("mean")] SELECT [col("___index___"), col("repeat_1_min"), col("repeat_1_rawn_missing"), col("repeat_1_max"), col("repeat_1_std"), col("repeat_1_rawn"), col("repeat_1_mean")] DF ["___index___", "_row_index__rawn", "_row_index__mean", "_row_index__std", ...]; PROJECT */103 COLUMNS END UNION
In [3]:
logger.info("Define some dummy functions to run after imputation of 2")
# Test a simple pre-post function
# These would get run gets run in each iteration (in each implicate)
# before (preFunctions) or after (postFunctions) this variable is imputed
# Notes for these functions:
# 1) No type hints on imported package types (will throw an error)
# i.e. no df:pl.DataFrame or -> pl.DataFrame
# 2) Must be completely self-contained (i.e. all imports within the function)
# This has to do with how it gets saved and loaded in async calls
# 3) Effectively, you have to assume it'll be called
# in an environment with no imports before it
def square_var(df, var_to_square: str, name: str):
import narwhals as nw
return (
nw.from_native(df)
.with_columns((nw.col(var_to_square) ** 2).alias(name))
.to_native()
)
def recalculate_interaction(df, var1: str, var2: str, name: str):
import narwhals as nw
return (
nw.from_native(df)
.with_columns((nw.col(var1) * nw.col(var2)).alias(name))
.to_native()
)
Define some dummy functions to run after imputation of 2
In [4]:
logger.info("Set up hyperparameter tuning")
tuner = Tuner_optuna(
n_trials=50, objective=rep_lgbm.Tuner.Objectives.mae, test_size=0.25
)
logger.info(" Set the tuner parameters to the defaults")
tuner.parameters()
logger.info(" Then specify ranges to check between as follow")
tuner.hyperparameters["num_leaves"] = [2, 256]
tuner.hyperparameters["max_depth"] = [2, 256]
tuner.hyperparameters["min_data_in_leaf"] = [10, 250]
tuner.hyperparameters["num_iterations"] = [25, 200]
tuner.hyperparameters["bagging_fraction"] = [0.5, 1]
tuner.hyperparameters["bagging_freq"] = [1, 5]
vars_impute = []
Set up hyperparameter tuning
Set the tuner parameters to the defaults
Setting default optuna sampler: TPESampler
[I 2025-11-07 14:22:32,308] A new study created in memory with name: no-name-66ac4f39-3d4e-4306-a054-c37b1c0b6041
Then specify ranges to check between as follow
In [5]:
logger.info("Impute the boolean variable (var_gbm1)")
logger.info(" to the default setup for predicted mean matching")
logger.info(" using lightgbm")
logger.info(" (you can pass a formula, but you don't need to)")
logger.info("First, set up the lightgbm parameters")
logger.info(" This says, do hyperparameter tuning first (tune)")
logger.info(" Redo it at each run (tune_overwrite)")
logger.info(" And sets the lightgbm parameter defaults (parameters) that the tuning can overwrite")
parameters_lgbm1 = Parameters.LightGBM(
tune=True,
tune_hyperparameter_path=f"{config.data_root}/tuner_outputs",
tuner=tuner,
tune_overwrite=True,
parameters={
"objective": "binary",
"num_leaves": 32,
"min_data_in_leaf": 20,
"num_iterations": 100,
"test_size": 0.2,
"boosting": "gbdt",
"categorical_feature": ["var5"],
"verbose": -1, # ,
},
error=Parameters.ErrorDraw.pmm,
)
logger.info("Actually define the variable and the model")
v_gbm1 = Variable(
impute_var="var_gbm1",
model=["var_*", "var4", "var3", "var5", "unrelated_*", "repeat_*"],
modeltype=Variable.ModelType.LightGBM,
parameters=parameters_lgbm1
)
logger.info("Add the variable to the list to be imputed")
vars_impute.append(v_gbm1)
logger.info("Impute the continuous variable (var_gbm2) ")
logger.info(" conditional on var_gbm1, using narwhals (nw.col('var_gbm1'))")
logger.info(" as well as a post-processing edit to set var_gbm2=0 when var_gbm1==0")
logger.info(" and some other random post-processing")
logger.info("Different parameters for the continuous variable")
parameters_lgbm2 = Parameters.LightGBM(
tune=True,
tune_hyperparameter_path=f"{config.data_root}/tuner_outputs",
tuner=tuner,
tune_overwrite=True,
parameters={
"objective": "regression",
"num_leaves": 32,
"min_data_in_leaf": 20,
"num_iterations": 100,
"test_size": 0.2,
"boosting": "gbdt",
"categorical_feature": ["var5"],
"verbose": -1, # ,
},
error=Parameters.ErrorDraw.pmm,
)
v_gbm2 = Variable(
impute_var="var_gbm2",
Where=nw.col("var_gbm1"),
# Needed in case var_gbm1 changes between iterations
Where_predict=(nw.col("var_gbm2") != 0),
model=["var_*", "var4", "var3", "var5", "unrelated_*", "repeat_*"],
modeltype=Variable.ModelType.LightGBM,
parameters=parameters_lgbm2,
postFunctions=[
(
nw.when(nw.col("var_gbm1"))
.then(nw.col("var_gbm2"))
.otherwise(nw.lit(0))
.alias("var_gbm2")
),
Variable.PrePost.Function(
recalculate_interaction,
parameters=dict(
var1="var_gbm1",
var2="var_gbm2",
name="var_gbm12"
),
),
Variable.PrePost.Function(
square_var,
parameters=dict(
var_to_square="var_gbm2",
name="var_gbm2_sq"
)
),
]
)
vars_impute.append(v_gbm2)
logger.info("Now do one with the quantile-regression lightgbm")
logger.info(" To do this, pass quantiles and set objective='quantile'")
parameters_lgbm3 = Parameters.LightGBM(
tune=True,
tune_hyperparameter_path=f"{config.data_root}/tuner_outputs",
tuner=tuner,
tune_overwrite=True,
quantiles=[0.25,0.5,0.75],
parameters={
"objective": "quantile",
"num_leaves": 32,
"min_data_in_leaf": 20,
"num_iterations": 100,
"test_size": 0.2,
"boosting": "gbdt",
"categorical_feature": ["var5"],
"verbose": -1, # ,
},
error=Parameters.ErrorDraw.pmm,
)
v_gbm3 = Variable(
impute_var="var_gbm3",
Where=nw.col("var_gbm1"),
# Needed in case var_gbm1 changes between iterations
Where_predict=(nw.col("var_gbm3") != 0),
model=["var_*", "var4", "var3", "var5", "unrelated_*", "repeat_*"],
modeltype=Variable.ModelType.LightGBM,
parameters=parameters_lgbm3,
postFunctions=[
(
nw.when(nw.col("var_gbm1"))
.then(nw.col("var_gbm3"))
.otherwise(nw.lit(0))
.alias("var_gbm3")
)
]
)
vars_impute.append(v_gbm3)
Impute the boolean variable (var_gbm1)
to the default setup for predicted mean matching
using lightgbm
(you can pass a formula, but you don't need to)
First, set up the lightgbm parameters
This says, do hyperparameter tuning first (tune)
Redo it at each run (tune_overwrite)
And sets the lightgbm parameter defaults (parameters) that the tuning can overwrite
Actually define the variable and the model
Add the variable to the list to be imputed
Impute the continuous variable (var_gbm2)
conditional on var_gbm1, using narwhals (nw.col('var_gbm1'))
as well as a post-processing edit to set var_gbm2=0 when var_gbm1==0
and some other random post-processing
Different parameters for the continuous variable
Now do one with the quantile-regression lightgbm
To do this, pass quantiles and set objective='quantile'
In [6]:
logger.info("Set up the imputation")
srmi = SRMI(
df=df,
variables=vars_impute,
n_implicates=2,
n_iterations=2,
parallel=False,
index=["index"],
modeltype=Variable.ModelType.pmm,
bayesian_bootstrap=True,
path_model=f"{config.path_temp_files}/py_srmi_test_gbm",
force_start=True,
)
Set up the imputation
Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/py_srmi_test_gbm.srmi
In [7]:
logger.info("Run it")
srmi.run()
logger.info("It's automatically saved and can be loaded with (see path_model above):")
logger.info("path_model = f'{config.path_temp_files}/py_srmi_test_gbm'")
logger.info("srmi = SRMI.load(path_model)")
Run it
Variable selection before SRMI run, if necessary
var_gbm1: Method.No
var_gbm2: Method.No
var_gbm3: Method.No
Hyperparameter tuning before SRMI run, if necessary
[I 2025-11-07 14:22:32,575] Trial 0 finished with value: 0.006293613988163579 and parameters: {'num_leaves': 38, 'max_depth': 177, 'min_data_in_leaf': 81, 'num_iterations': 94, 'bagging_fraction': 0.6071214413167514, 'bagging_freq': 2}. Best is trial 0 with value: 0.006293613988163579.
[I 2025-11-07 14:22:32,626] Trial 1 finished with value: 0.02030944315858492 and parameters: {'num_leaves': 83, 'max_depth': 27, 'min_data_in_leaf': 177, 'num_iterations': 41, 'bagging_fraction': 0.9889998163277448, 'bagging_freq': 3}. Best is trial 0 with value: 0.006293613988163579.
[I 2025-11-07 14:22:32,679] Trial 2 finished with value: 0.0079340733621239 and parameters: {'num_leaves': 12, 'max_depth': 158, 'min_data_in_leaf': 93, 'num_iterations': 77, 'bagging_fraction': 0.6507690841429046, 'bagging_freq': 1}. Best is trial 0 with value: 0.006293613988163579.
[I 2025-11-07 14:22:32,705] Trial 3 finished with value: 0.05234521224018576 and parameters: {'num_leaves': 82, 'max_depth': 230, 'min_data_in_leaf': 248, 'num_iterations': 30, 'bagging_fraction': 0.6575054481666343, 'bagging_freq': 5}. Best is trial 0 with value: 0.006293613988163579.
[I 2025-11-07 14:22:33,008] Trial 4 finished with value: 0.00461504982766553 and parameters: {'num_leaves': 211, 'max_depth': 30, 'min_data_in_leaf': 22, 'num_iterations': 97, 'bagging_fraction': 0.7923256552747111, 'bagging_freq': 5}. Best is trial 4 with value: 0.00461504982766553.
[I 2025-11-07 14:22:33,147] Trial 5 finished with value: 0.004199373956016338 and parameters: {'num_leaves': 21, 'max_depth': 44, 'min_data_in_leaf': 27, 'num_iterations': 149, 'bagging_fraction': 0.6462482222199595, 'bagging_freq': 1}. Best is trial 5 with value: 0.004199373956016338.
[I 2025-11-07 14:22:33,205] Trial 6 finished with value: 0.010621019610967883 and parameters: {'num_leaves': 138, 'max_depth': 163, 'min_data_in_leaf': 204, 'num_iterations': 72, 'bagging_fraction': 0.5384431993550269, 'bagging_freq': 5}. Best is trial 5 with value: 0.004199373956016338.
[I 2025-11-07 14:22:33,271] Trial 7 finished with value: 0.010563430531137228 and parameters: {'num_leaves': 197, 'max_depth': 92, 'min_data_in_leaf': 177, 'num_iterations': 62, 'bagging_fraction': 0.7192646486876234, 'bagging_freq': 5}. Best is trial 5 with value: 0.004199373956016338.
[I 2025-11-07 14:22:33,350] Trial 8 finished with value: 0.015874280406748566 and parameters: {'num_leaves': 243, 'max_depth': 151, 'min_data_in_leaf': 64, 'num_iterations': 43, 'bagging_fraction': 0.6430830504379776, 'bagging_freq': 3}. Best is trial 5 with value: 0.004199373956016338.
[I 2025-11-07 14:22:33,441] Trial 9 finished with value: 0.009616444926467413 and parameters: {'num_leaves': 31, 'max_depth': 210, 'min_data_in_leaf': 12, 'num_iterations': 48, 'bagging_fraction': 0.764136724165049, 'bagging_freq': 2}. Best is trial 5 with value: 0.004199373956016338.
[I 2025-11-07 14:22:33,638] Trial 10 finished with value: 0.00493166101685885 and parameters: {'num_leaves': 135, 'max_depth': 86, 'min_data_in_leaf': 126, 'num_iterations': 160, 'bagging_fraction': 0.8819389978743449, 'bagging_freq': 1}. Best is trial 5 with value: 0.004199373956016338.
[I 2025-11-07 14:22:34,039] Trial 11 finished with value: 0.00373953961413663 and parameters: {'num_leaves': 190, 'max_depth': 12, 'min_data_in_leaf': 11, 'num_iterations': 138, 'bagging_fraction': 0.8078431753255333, 'bagging_freq': 4}. Best is trial 11 with value: 0.00373953961413663.
[I 2025-11-07 14:22:34,248] Trial 12 finished with value: 0.004339122356902152 and parameters: {'num_leaves': 176, 'max_depth': 7, 'min_data_in_leaf': 46, 'num_iterations': 147, 'bagging_fraction': 0.853507574513817, 'bagging_freq': 4}. Best is trial 11 with value: 0.00373953961413663.
[I 2025-11-07 14:22:34,591] Trial 13 finished with value: 0.004413928894912943 and parameters: {'num_leaves': 84, 'max_depth': 72, 'min_data_in_leaf': 45, 'num_iterations': 199, 'bagging_fraction': 0.5041106423996129, 'bagging_freq': 4}. Best is trial 11 with value: 0.00373953961413663.
[I 2025-11-07 14:22:34,792] Trial 14 finished with value: 0.00537742265437493 and parameters: {'num_leaves': 249, 'max_depth': 52, 'min_data_in_leaf': 123, 'num_iterations': 137, 'bagging_fraction': 0.840661667279654, 'bagging_freq': 3}. Best is trial 11 with value: 0.00373953961413663.
[I 2025-11-07 14:22:34,968] Trial 15 finished with value: 0.004371619670149031 and parameters: {'num_leaves': 166, 'max_depth': 6, 'min_data_in_leaf': 10, 'num_iterations': 122, 'bagging_fraction': 0.7166979219126673, 'bagging_freq': 4}. Best is trial 11 with value: 0.00373953961413663.
[I 2025-11-07 14:22:35,304] Trial 16 finished with value: 0.005054464106091356 and parameters: {'num_leaves': 107, 'max_depth': 114, 'min_data_in_leaf': 89, 'num_iterations': 175, 'bagging_fraction': 0.9492733295644904, 'bagging_freq': 2}. Best is trial 11 with value: 0.00373953961413663.
[I 2025-11-07 14:22:35,563] Trial 17 finished with value: 0.003960244298876707 and parameters: {'num_leaves': 52, 'max_depth': 44, 'min_data_in_leaf': 44, 'num_iterations': 177, 'bagging_fraction': 0.5832980750755009, 'bagging_freq': 1}. Best is trial 11 with value: 0.00373953961413663.
[I 2025-11-07 14:22:35,866] Trial 18 finished with value: 0.004353149056229679 and parameters: {'num_leaves': 52, 'max_depth': 115, 'min_data_in_leaf': 50, 'num_iterations': 187, 'bagging_fraction': 0.5781190965023928, 'bagging_freq': 4}. Best is trial 11 with value: 0.00373953961413663.
[I 2025-11-07 14:22:36,203] Trial 19 finished with value: 0.0049043893450392275 and parameters: {'num_leaves': 214, 'max_depth': 57, 'min_data_in_leaf': 108, 'num_iterations': 169, 'bagging_fraction': 0.8015641272446892, 'bagging_freq': 2}. Best is trial 11 with value: 0.00373953961413663.
[I 2025-11-07 14:22:36,379] Trial 20 finished with value: 0.006695461279110995 and parameters: {'num_leaves': 159, 'max_depth': 5, 'min_data_in_leaf': 152, 'num_iterations': 120, 'bagging_fraction': 0.9166859696377415, 'bagging_freq': 3}. Best is trial 11 with value: 0.00373953961413663.
[I 2025-11-07 14:22:36,495] Trial 21 finished with value: 0.006113946532124947 and parameters: {'num_leaves': 8, 'max_depth': 39, 'min_data_in_leaf': 32, 'num_iterations': 148, 'bagging_fraction': 0.6953246704052042, 'bagging_freq': 1}. Best is trial 11 with value: 0.00373953961413663.
[I 2025-11-07 14:22:36,720] Trial 22 finished with value: 0.005001451067591096 and parameters: {'num_leaves': 54, 'max_depth': 68, 'min_data_in_leaf': 67, 'num_iterations': 134, 'bagging_fraction': 0.577584083188826, 'bagging_freq': 1}. Best is trial 11 with value: 0.00373953961413663.
[I 2025-11-07 14:22:37,205] Trial 23 finished with value: 0.0037789187278397672 and parameters: {'num_leaves': 103, 'max_depth': 34, 'min_data_in_leaf': 29, 'num_iterations': 164, 'bagging_fraction': 0.6213175051289077, 'bagging_freq': 1}. Best is trial 11 with value: 0.00373953961413663.
[I 2025-11-07 14:22:37,551] Trial 24 finished with value: 0.00457744302530845 and parameters: {'num_leaves': 109, 'max_depth': 27, 'min_data_in_leaf': 63, 'num_iterations': 178, 'bagging_fraction': 0.5350277728945482, 'bagging_freq': 2}. Best is trial 11 with value: 0.00373953961413663.
[I 2025-11-07 14:22:38,038] Trial 25 finished with value: 0.003977273953594933 and parameters: {'num_leaves': 118, 'max_depth': 93, 'min_data_in_leaf': 38, 'num_iterations': 198, 'bagging_fraction': 0.5985625792598168, 'bagging_freq': 1}. Best is trial 11 with value: 0.00373953961413663.
[I 2025-11-07 14:22:38,467] Trial 26 finished with value: 0.003595277821078484 and parameters: {'num_leaves': 65, 'max_depth': 20, 'min_data_in_leaf': 10, 'num_iterations': 162, 'bagging_fraction': 0.7589445430378556, 'bagging_freq': 4}. Best is trial 26 with value: 0.003595277821078484.
[I 2025-11-07 14:22:39,029] Trial 27 finished with value: 0.0037965257659618246 and parameters: {'num_leaves': 147, 'max_depth': 16, 'min_data_in_leaf': 15, 'num_iterations': 158, 'bagging_fraction': 0.756635795646212, 'bagging_freq': 4}. Best is trial 26 with value: 0.003595277821078484.
[I 2025-11-07 14:22:39,354] Trial 28 finished with value: 0.004656774471421873 and parameters: {'num_leaves': 186, 'max_depth': 71, 'min_data_in_leaf': 65, 'num_iterations': 131, 'bagging_fraction': 0.8072148913420076, 'bagging_freq': 4}. Best is trial 26 with value: 0.003595277821078484.
[I 2025-11-07 14:22:39,533] Trial 29 finished with value: 0.006115649170889443 and parameters: {'num_leaves': 72, 'max_depth': 194, 'min_data_in_leaf': 77, 'num_iterations': 97, 'bagging_fraction': 0.6953081958377136, 'bagging_freq': 3}. Best is trial 26 with value: 0.003595277821078484.
[I 2025-11-07 14:22:39,737] Trial 30 finished with value: 0.005723325846994529 and parameters: {'num_leaves': 100, 'max_depth': 129, 'min_data_in_leaf': 102, 'num_iterations': 110, 'bagging_fraction': 0.8341726302673887, 'bagging_freq': 4}. Best is trial 26 with value: 0.003595277821078484.
[I 2025-11-07 14:22:40,266] Trial 31 finished with value: 0.003713537124780219 and parameters: {'num_leaves': 153, 'max_depth': 17, 'min_data_in_leaf': 10, 'num_iterations': 160, 'bagging_fraction': 0.7568989850011942, 'bagging_freq': 4}. Best is trial 26 with value: 0.003595277821078484.
[I 2025-11-07 14:22:40,720] Trial 32 finished with value: 0.003835200507887722 and parameters: {'num_leaves': 121, 'max_depth': 22, 'min_data_in_leaf': 31, 'num_iterations': 161, 'bagging_fraction': 0.7656307055803864, 'bagging_freq': 3}. Best is trial 26 with value: 0.003595277821078484.
[I 2025-11-07 14:22:41,329] Trial 33 finished with value: 0.003878830248534503 and parameters: {'num_leaves': 156, 'max_depth': 23, 'min_data_in_leaf': 10, 'num_iterations': 142, 'bagging_fraction': 0.7375449661019617, 'bagging_freq': 4}. Best is trial 26 with value: 0.003595277821078484.
[I 2025-11-07 14:22:41,415] Trial 34 finished with value: 0.010267898695211825 and parameters: {'num_leaves': 93, 'max_depth': 2, 'min_data_in_leaf': 27, 'num_iterations': 185, 'bagging_fraction': 0.682346245092663, 'bagging_freq': 5}. Best is trial 26 with value: 0.003595277821078484.
[I 2025-11-07 14:22:41,887] Trial 35 finished with value: 0.004207679104339624 and parameters: {'num_leaves': 223, 'max_depth': 33, 'min_data_in_leaf': 56, 'num_iterations': 166, 'bagging_fraction': 0.891198149863283, 'bagging_freq': 3}. Best is trial 26 with value: 0.003595277821078484.
[I 2025-11-07 14:22:42,372] Trial 36 finished with value: 0.0035836660043942837 and parameters: {'num_leaves': 187, 'max_depth': 58, 'min_data_in_leaf': 24, 'num_iterations': 155, 'bagging_fraction': 0.7790472750239636, 'bagging_freq': 4}. Best is trial 36 with value: 0.0035836660043942837.
[I 2025-11-07 14:22:42,537] Trial 37 finished with value: 0.006943833603274368 and parameters: {'num_leaves': 192, 'max_depth': 55, 'min_data_in_leaf': 243, 'num_iterations': 153, 'bagging_fraction': 0.7864824490040053, 'bagging_freq': 5}. Best is trial 36 with value: 0.0035836660043942837.
[I 2025-11-07 14:22:42,781] Trial 38 finished with value: 0.004872274824714764 and parameters: {'num_leaves': 174, 'max_depth': 240, 'min_data_in_leaf': 80, 'num_iterations': 125, 'bagging_fraction': 0.819415916649063, 'bagging_freq': 4}. Best is trial 36 with value: 0.0035836660043942837.
[I 2025-11-07 14:22:43,059] Trial 39 finished with value: 0.004122748203759727 and parameters: {'num_leaves': 232, 'max_depth': 48, 'min_data_in_leaf': 19, 'num_iterations': 105, 'bagging_fraction': 0.8640664508276489, 'bagging_freq': 5}. Best is trial 36 with value: 0.0035836660043942837.
[I 2025-11-07 14:22:43,195] Trial 40 finished with value: 0.005713792468500361 and parameters: {'num_leaves': 196, 'max_depth': 14, 'min_data_in_leaf': 153, 'num_iterations': 143, 'bagging_fraction': 0.785208881972336, 'bagging_freq': 4}. Best is trial 36 with value: 0.0035836660043942837.
[I 2025-11-07 14:22:43,579] Trial 41 finished with value: 0.003926609244013986 and parameters: {'num_leaves': 208, 'max_depth': 31, 'min_data_in_leaf': 27, 'num_iterations': 168, 'bagging_fraction': 0.7324691075928659, 'bagging_freq': 4}. Best is trial 36 with value: 0.0035836660043942837.
[I 2025-11-07 14:22:44,044] Trial 42 finished with value: 0.004264133352820557 and parameters: {'num_leaves': 147, 'max_depth': 36, 'min_data_in_leaf': 21, 'num_iterations': 153, 'bagging_fraction': 0.6327928638546756, 'bagging_freq': 5}. Best is trial 36 with value: 0.0035836660043942837.
[I 2025-11-07 14:22:44,479] Trial 43 finished with value: 0.0038495556031894755 and parameters: {'num_leaves': 67, 'max_depth': 18, 'min_data_in_leaf': 36, 'num_iterations': 187, 'bagging_fraction': 0.6715736613418856, 'bagging_freq': 4}. Best is trial 36 with value: 0.0035836660043942837.
[I 2025-11-07 14:22:45,002] Trial 44 finished with value: 0.003887009328010064 and parameters: {'num_leaves': 129, 'max_depth': 62, 'min_data_in_leaf': 20, 'num_iterations': 156, 'bagging_fraction': 0.7707274338715906, 'bagging_freq': 3}. Best is trial 36 with value: 0.0035836660043942837.
[I 2025-11-07 14:22:45,170] Trial 45 finished with value: 0.004327082058649623 and parameters: {'num_leaves': 24, 'max_depth': 83, 'min_data_in_leaf': 40, 'num_iterations': 139, 'bagging_fraction': 0.7198871973326627, 'bagging_freq': 5}. Best is trial 36 with value: 0.0035836660043942837.
[I 2025-11-07 14:22:45,442] Trial 46 finished with value: 0.004249659215031347 and parameters: {'num_leaves': 180, 'max_depth': 42, 'min_data_in_leaf': 54, 'num_iterations': 128, 'bagging_fraction': 0.7462890352620398, 'bagging_freq': 4}. Best is trial 36 with value: 0.0035836660043942837.
[I 2025-11-07 14:22:45,919] Trial 47 finished with value: 0.003642558996562752 and parameters: {'num_leaves': 165, 'max_depth': 12, 'min_data_in_leaf': 10, 'num_iterations': 163, 'bagging_fraction': 0.8273959786089049, 'bagging_freq': 3}. Best is trial 36 with value: 0.0035836660043942837.
[I 2025-11-07 14:22:46,506] Trial 48 finished with value: 0.0038036546122467035 and parameters: {'num_leaves': 168, 'max_depth': 13, 'min_data_in_leaf': 10, 'num_iterations': 174, 'bagging_fraction': 0.8188483012160651, 'bagging_freq': 3}. Best is trial 36 with value: 0.0035836660043942837.
[I 2025-11-07 14:22:46,544] Trial 49 finished with value: 0.10681747203086593 and parameters: {'num_leaves': 204, 'max_depth': 2, 'min_data_in_leaf': 216, 'num_iterations': 28, 'bagging_fraction': 0.8615774978483814, 'bagging_freq': 4}. Best is trial 36 with value: 0.0035836660043942837.
Number of finished trials: 50
Best trial: 0.0035836660043942837
num_leaves: 187
max_depth: 58
min_data_in_leaf: 24
num_iterations: 155
bagging_fraction: 0.7790472750239636
bagging_freq: 4
TUNING COMPLETE
[I 2025-11-07 14:22:46,714] Trial 0 finished with value: 2.217463987037395 and parameters: {'num_leaves': 38, 'max_depth': 177, 'min_data_in_leaf': 81, 'num_iterations': 94, 'bagging_fraction': 0.6071214413167514, 'bagging_freq': 2}. Best is trial 0 with value: 2.217463987037395.
[I 2025-11-07 14:22:46,735] Trial 1 finished with value: 2.5302756809068074 and parameters: {'num_leaves': 83, 'max_depth': 27, 'min_data_in_leaf': 177, 'num_iterations': 41, 'bagging_fraction': 0.9889998163277448, 'bagging_freq': 3}. Best is trial 0 with value: 2.217463987037395.
[I 2025-11-07 14:22:46,768] Trial 2 finished with value: 2.233297829437584 and parameters: {'num_leaves': 12, 'max_depth': 158, 'min_data_in_leaf': 93, 'num_iterations': 77, 'bagging_fraction': 0.6507690841429046, 'bagging_freq': 1}. Best is trial 0 with value: 2.217463987037395.
[I 2025-11-07 14:22:46,779] Trial 3 finished with value: 3.6325172590587522 and parameters: {'num_leaves': 82, 'max_depth': 230, 'min_data_in_leaf': 248, 'num_iterations': 30, 'bagging_fraction': 0.6575054481666343, 'bagging_freq': 5}. Best is trial 0 with value: 2.217463987037395.
[I 2025-11-07 14:22:46,915] Trial 4 finished with value: 1.9519395540436342 and parameters: {'num_leaves': 211, 'max_depth': 30, 'min_data_in_leaf': 22, 'num_iterations': 97, 'bagging_fraction': 0.7923256552747111, 'bagging_freq': 5}. Best is trial 4 with value: 1.9519395540436342.
[I 2025-11-07 14:22:46,994] Trial 5 finished with value: 2.037375936599948 and parameters: {'num_leaves': 21, 'max_depth': 44, 'min_data_in_leaf': 27, 'num_iterations': 149, 'bagging_fraction': 0.6462482222199595, 'bagging_freq': 1}. Best is trial 4 with value: 1.9519395540436342.
[I 2025-11-07 14:22:47,011] Trial 6 finished with value: 2.913697492362779 and parameters: {'num_leaves': 138, 'max_depth': 163, 'min_data_in_leaf': 204, 'num_iterations': 72, 'bagging_fraction': 0.5384431993550269, 'bagging_freq': 5}. Best is trial 4 with value: 1.9519395540436342.
[I 2025-11-07 14:22:47,032] Trial 7 finished with value: 2.5973745983469736 and parameters: {'num_leaves': 197, 'max_depth': 92, 'min_data_in_leaf': 177, 'num_iterations': 62, 'bagging_fraction': 0.7192646486876234, 'bagging_freq': 5}. Best is trial 4 with value: 1.9519395540436342.
[I 2025-11-07 14:22:47,060] Trial 8 finished with value: 2.1793693704233816 and parameters: {'num_leaves': 243, 'max_depth': 151, 'min_data_in_leaf': 64, 'num_iterations': 43, 'bagging_fraction': 0.6430830504379776, 'bagging_freq': 3}. Best is trial 4 with value: 1.9519395540436342.
[I 2025-11-07 14:22:47,110] Trial 9 finished with value: 1.872271198862132 and parameters: {'num_leaves': 31, 'max_depth': 210, 'min_data_in_leaf': 12, 'num_iterations': 48, 'bagging_fraction': 0.764136724165049, 'bagging_freq': 2}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:47,220] Trial 10 finished with value: 2.125854736566008 and parameters: {'num_leaves': 132, 'max_depth': 252, 'min_data_in_leaf': 131, 'num_iterations': 200, 'bagging_fraction': 0.8533703418149945, 'bagging_freq': 3}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:47,583] Trial 11 finished with value: 1.9563295360623236 and parameters: {'num_leaves': 190, 'max_depth': 91, 'min_data_in_leaf': 11, 'num_iterations': 126, 'bagging_fraction': 0.8444973389242248, 'bagging_freq': 4}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:47,686] Trial 12 finished with value: 2.0210693633110064 and parameters: {'num_leaves': 256, 'max_depth': 206, 'min_data_in_leaf': 43, 'num_iterations': 116, 'bagging_fraction': 0.8057658259017626, 'bagging_freq': 2}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:47,747] Trial 13 finished with value: 2.165285008708338 and parameters: {'num_leaves': 185, 'max_depth': 6, 'min_data_in_leaf': 118, 'num_iterations': 144, 'bagging_fraction': 0.7710591745502531, 'bagging_freq': 4}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:47,836] Trial 14 finished with value: 1.9952545766123988 and parameters: {'num_leaves': 78, 'max_depth': 102, 'min_data_in_leaf': 48, 'num_iterations': 95, 'bagging_fraction': 0.9064532050051144, 'bagging_freq': 2}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:48,350] Trial 15 finished with value: 2.062143153061911 and parameters: {'num_leaves': 162, 'max_depth': 62, 'min_data_in_leaf': 10, 'num_iterations': 174, 'bagging_fraction': 0.7420356253486351, 'bagging_freq': 4}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:48,415] Trial 16 finished with value: 2.17217502729263 and parameters: {'num_leaves': 219, 'max_depth': 199, 'min_data_in_leaf': 105, 'num_iterations': 60, 'bagging_fraction': 0.9218035876065989, 'bagging_freq': 2}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:48,495] Trial 17 finished with value: 2.1579706435998407 and parameters: {'num_leaves': 53, 'max_depth': 127, 'min_data_in_leaf': 62, 'num_iterations': 95, 'bagging_fraction': 0.7029366669485296, 'bagging_freq': 3}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:48,520] Trial 18 finished with value: 2.90316442949979 and parameters: {'num_leaves': 109, 'max_depth': 126, 'min_data_in_leaf': 146, 'num_iterations': 27, 'bagging_fraction': 0.7977861740784461, 'bagging_freq': 4}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:48,686] Trial 19 finished with value: 1.9977606230640301 and parameters: {'num_leaves': 153, 'max_depth': 255, 'min_data_in_leaf': 32, 'num_iterations': 135, 'bagging_fraction': 0.8744088503511349, 'bagging_freq': 1}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:48,742] Trial 20 finished with value: 2.261410588456024 and parameters: {'num_leaves': 227, 'max_depth': 60, 'min_data_in_leaf': 73, 'num_iterations': 83, 'bagging_fraction': 0.5070012438647024, 'bagging_freq': 2}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:49,115] Trial 21 finished with value: 2.0183628788684 and parameters: {'num_leaves': 197, 'max_depth': 90, 'min_data_in_leaf': 10, 'num_iterations': 117, 'bagging_fraction': 0.8352224583866932, 'bagging_freq': 4}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:49,177] Trial 22 finished with value: 2.0074493721597415 and parameters: {'num_leaves': 178, 'max_depth': 4, 'min_data_in_leaf': 25, 'num_iterations': 123, 'bagging_fraction': 0.7742969417293077, 'bagging_freq': 5}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:49,367] Trial 23 finished with value: 2.0721488527054612 and parameters: {'num_leaves': 214, 'max_depth': 69, 'min_data_in_leaf': 49, 'num_iterations': 166, 'bagging_fraction': 0.9517339433107506, 'bagging_freq': 4}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:49,627] Trial 24 finished with value: 1.9803059100178555 and parameters: {'num_leaves': 109, 'max_depth': 27, 'min_data_in_leaf': 10, 'num_iterations': 105, 'bagging_fraction': 0.825031322982918, 'bagging_freq': 5}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:49,774] Trial 25 finished with value: 1.9241809364399243 and parameters: {'num_leaves': 164, 'max_depth': 109, 'min_data_in_leaf': 38, 'num_iterations': 134, 'bagging_fraction': 0.8821368807499915, 'bagging_freq': 4}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:49,998] Trial 26 finished with value: 2.042154116843384 and parameters: {'num_leaves': 163, 'max_depth': 195, 'min_data_in_leaf': 37, 'num_iterations': 151, 'bagging_fraction': 0.8970099585380352, 'bagging_freq': 5}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:50,047] Trial 27 finished with value: 2.1791748687274626 and parameters: {'num_leaves': 107, 'max_depth': 116, 'min_data_in_leaf': 56, 'num_iterations': 54, 'bagging_fraction': 0.7037757061890791, 'bagging_freq': 4}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:50,165] Trial 28 finished with value: 2.1543985025384313 and parameters: {'num_leaves': 149, 'max_depth': 145, 'min_data_in_leaf': 87, 'num_iterations': 161, 'bagging_fraction': 0.9486364266937175, 'bagging_freq': 3}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:50,232] Trial 29 finished with value: 2.150121549602934 and parameters: {'num_leaves': 55, 'max_depth': 183, 'min_data_in_leaf': 79, 'num_iterations': 104, 'bagging_fraction': 0.7408637637528088, 'bagging_freq': 5}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:50,373] Trial 30 finished with value: 2.0224462155221183 and parameters: {'num_leaves': 28, 'max_depth': 219, 'min_data_in_leaf': 24, 'num_iterations': 185, 'bagging_fraction': 0.5764387451635581, 'bagging_freq': 2}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:50,518] Trial 31 finished with value: 1.9699657035299674 and parameters: {'num_leaves': 183, 'max_depth': 74, 'min_data_in_leaf': 33, 'num_iterations': 130, 'bagging_fraction': 0.8740268777217409, 'bagging_freq': 4}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:50,765] Trial 32 finished with value: 2.0352331219100432 and parameters: {'num_leaves': 209, 'max_depth': 107, 'min_data_in_leaf': 19, 'num_iterations': 136, 'bagging_fraction': 0.7958608679341295, 'bagging_freq': 3}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:50,892] Trial 33 finished with value: 2.0094730110556194 and parameters: {'num_leaves': 234, 'max_depth': 37, 'min_data_in_leaf': 47, 'num_iterations': 107, 'bagging_fraction': 0.9877389765370235, 'bagging_freq': 4}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:51,005] Trial 34 finished with value: 2.138591450593486 and parameters: {'num_leaves': 201, 'max_depth': 88, 'min_data_in_leaf': 69, 'num_iterations': 127, 'bagging_fraction': 0.8524896132873623, 'bagging_freq': 3}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:51,063] Trial 35 finished with value: 2.214893071549789 and parameters: {'num_leaves': 171, 'max_depth': 174, 'min_data_in_leaf': 101, 'num_iterations': 84, 'bagging_fraction': 0.8187682405680394, 'bagging_freq': 1}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:51,129] Trial 36 finished with value: 2.0215456021974463 and parameters: {'num_leaves': 11, 'max_depth': 53, 'min_data_in_leaf': 20, 'num_iterations': 143, 'bagging_fraction': 0.7725922929820237, 'bagging_freq': 5}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:51,155] Trial 37 finished with value: 3.1700198157833253 and parameters: {'num_leaves': 192, 'max_depth': 135, 'min_data_in_leaf': 243, 'num_iterations': 45, 'bagging_fraction': 0.6703261086572104, 'bagging_freq': 4}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:51,186] Trial 38 finished with value: 2.660664659000483 and parameters: {'num_leaves': 125, 'max_depth': 21, 'min_data_in_leaf': 163, 'num_iterations': 35, 'bagging_fraction': 0.870679355468466, 'bagging_freq': 5}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:51,277] Trial 39 finished with value: 1.9367224404770074 and parameters: {'num_leaves': 243, 'max_depth': 76, 'min_data_in_leaf': 35, 'num_iterations': 71, 'bagging_fraction': 0.9367697755831724, 'bagging_freq': 4}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:51,358] Trial 40 finished with value: 1.8972582560225357 and parameters: {'num_leaves': 246, 'max_depth': 43, 'min_data_in_leaf': 36, 'num_iterations': 54, 'bagging_fraction': 0.9607267204239914, 'bagging_freq': 3}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:51,458] Trial 41 finished with value: 1.9381244966433602 and parameters: {'num_leaves': 252, 'max_depth': 39, 'min_data_in_leaf': 37, 'num_iterations': 72, 'bagging_fraction': 0.9497382014847314, 'bagging_freq': 3}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:51,534] Trial 42 finished with value: 2.0000126108471177 and parameters: {'num_leaves': 251, 'max_depth': 46, 'min_data_in_leaf': 55, 'num_iterations': 71, 'bagging_fraction': 0.9957057309768894, 'bagging_freq': 3}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:51,607] Trial 43 finished with value: 1.8944267968552124 and parameters: {'num_leaves': 236, 'max_depth': 78, 'min_data_in_leaf': 36, 'num_iterations': 51, 'bagging_fraction': 0.9612889096428789, 'bagging_freq': 3}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:51,672] Trial 44 finished with value: 1.8873534400753882 and parameters: {'num_leaves': 240, 'max_depth': 71, 'min_data_in_leaf': 41, 'num_iterations': 53, 'bagging_fraction': 0.925811820500564, 'bagging_freq': 2}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:51,724] Trial 45 finished with value: 2.006015586499559 and parameters: {'num_leaves': 233, 'max_depth': 107, 'min_data_in_leaf': 60, 'num_iterations': 50, 'bagging_fraction': 0.9762511478264861, 'bagging_freq': 2}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:51,764] Trial 46 finished with value: 2.195634699056772 and parameters: {'num_leaves': 223, 'max_depth': 82, 'min_data_in_leaf': 79, 'num_iterations': 37, 'bagging_fraction': 0.9649096526995171, 'bagging_freq': 2}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:51,801] Trial 47 finished with value: 2.5590992929405614 and parameters: {'num_leaves': 240, 'max_depth': 17, 'min_data_in_leaf': 207, 'num_iterations': 62, 'bagging_fraction': 0.9029781597102765, 'bagging_freq': 1}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:51,875] Trial 48 finished with value: 1.8961435777415487 and parameters: {'num_leaves': 203, 'max_depth': 100, 'min_data_in_leaf': 44, 'num_iterations': 60, 'bagging_fraction': 0.9228064675129292, 'bagging_freq': 2}. Best is trial 9 with value: 1.872271198862132.
[I 2025-11-07 14:22:51,927] Trial 49 finished with value: 2.1524838871044163 and parameters: {'num_leaves': 256, 'max_depth': 239, 'min_data_in_leaf': 94, 'num_iterations': 54, 'bagging_fraction': 0.9250060081103659, 'bagging_freq': 2}. Best is trial 9 with value: 1.872271198862132.
Number of finished trials: 50
Best trial: 1.872271198862132
num_leaves: 31
max_depth: 210
min_data_in_leaf: 12
num_iterations: 48
bagging_fraction: 0.764136724165049
bagging_freq: 2
TUNING COMPLETE
[I 2025-11-07 14:22:52,078] Trial 0 finished with value: 4.601963942696513 and parameters: {'num_leaves': 38, 'max_depth': 177, 'min_data_in_leaf': 81, 'num_iterations': 94, 'bagging_fraction': 0.6071214413167514, 'bagging_freq': 2}. Best is trial 0 with value: 4.601963942696513.
[I 2025-11-07 14:22:52,094] Trial 1 finished with value: 7.674971387201009 and parameters: {'num_leaves': 83, 'max_depth': 27, 'min_data_in_leaf': 177, 'num_iterations': 41, 'bagging_fraction': 0.9889998163277448, 'bagging_freq': 3}. Best is trial 0 with value: 4.601963942696513.
[I 2025-11-07 14:22:52,117] Trial 2 finished with value: 5.0838050157876475 and parameters: {'num_leaves': 12, 'max_depth': 158, 'min_data_in_leaf': 93, 'num_iterations': 77, 'bagging_fraction': 0.6507690841429046, 'bagging_freq': 1}. Best is trial 0 with value: 4.601963942696513.
[I 2025-11-07 14:22:52,125] Trial 3 finished with value: 11.037136186360756 and parameters: {'num_leaves': 82, 'max_depth': 230, 'min_data_in_leaf': 248, 'num_iterations': 30, 'bagging_fraction': 0.6575054481666343, 'bagging_freq': 5}. Best is trial 0 with value: 4.601963942696513.
[I 2025-11-07 14:22:52,218] Trial 4 finished with value: 4.015313676597538 and parameters: {'num_leaves': 211, 'max_depth': 30, 'min_data_in_leaf': 22, 'num_iterations': 97, 'bagging_fraction': 0.7923256552747111, 'bagging_freq': 5}. Best is trial 4 with value: 4.015313676597538.
[I 2025-11-07 14:22:52,307] Trial 5 finished with value: 3.592534838776857 and parameters: {'num_leaves': 21, 'max_depth': 44, 'min_data_in_leaf': 27, 'num_iterations': 149, 'bagging_fraction': 0.6462482222199595, 'bagging_freq': 1}. Best is trial 5 with value: 3.592534838776857.
[I 2025-11-07 14:22:52,329] Trial 6 finished with value: 7.36155073833501 and parameters: {'num_leaves': 138, 'max_depth': 163, 'min_data_in_leaf': 204, 'num_iterations': 72, 'bagging_fraction': 0.5384431993550269, 'bagging_freq': 5}. Best is trial 5 with value: 3.592534838776857.
[I 2025-11-07 14:22:52,355] Trial 7 finished with value: 6.59602832048508 and parameters: {'num_leaves': 197, 'max_depth': 92, 'min_data_in_leaf': 177, 'num_iterations': 62, 'bagging_fraction': 0.7192646486876234, 'bagging_freq': 5}. Best is trial 5 with value: 3.592534838776857.
[I 2025-11-07 14:22:52,382] Trial 8 finished with value: 8.14426534186587 and parameters: {'num_leaves': 243, 'max_depth': 151, 'min_data_in_leaf': 64, 'num_iterations': 43, 'bagging_fraction': 0.6430830504379776, 'bagging_freq': 3}. Best is trial 5 with value: 3.592534838776857.
[I 2025-11-07 14:22:52,430] Trial 9 finished with value: 7.348511969981209 and parameters: {'num_leaves': 31, 'max_depth': 210, 'min_data_in_leaf': 12, 'num_iterations': 48, 'bagging_fraction': 0.764136724165049, 'bagging_freq': 2}. Best is trial 5 with value: 3.592534838776857.
[I 2025-11-07 14:22:52,505] Trial 10 finished with value: 3.628506880589847 and parameters: {'num_leaves': 135, 'max_depth': 86, 'min_data_in_leaf': 126, 'num_iterations': 160, 'bagging_fraction': 0.8819389978743449, 'bagging_freq': 1}. Best is trial 5 with value: 3.592534838776857.
[I 2025-11-07 14:22:52,578] Trial 11 finished with value: 3.65287366407973 and parameters: {'num_leaves': 149, 'max_depth': 91, 'min_data_in_leaf': 130, 'num_iterations': 162, 'bagging_fraction': 0.8846666306301009, 'bagging_freq': 1}. Best is trial 5 with value: 3.592534838776857.
[I 2025-11-07 14:22:52,643] Trial 12 finished with value: 3.6679562292607124 and parameters: {'num_leaves': 94, 'max_depth': 87, 'min_data_in_leaf': 122, 'num_iterations': 152, 'bagging_fraction': 0.8702897753840738, 'bagging_freq': 1}. Best is trial 5 with value: 3.592534838776857.
[I 2025-11-07 14:22:52,811] Trial 13 finished with value: 3.413346071954213 and parameters: {'num_leaves': 158, 'max_depth': 53, 'min_data_in_leaf': 51, 'num_iterations': 199, 'bagging_fraction': 0.9884252411626588, 'bagging_freq': 2}. Best is trial 13 with value: 3.413346071954213.
[I 2025-11-07 14:22:52,974] Trial 14 finished with value: 3.2774348166481557 and parameters: {'num_leaves': 177, 'max_depth': 11, 'min_data_in_leaf': 46, 'num_iterations': 199, 'bagging_fraction': 0.9989791438935978, 'bagging_freq': 2}. Best is trial 14 with value: 3.2774348166481557.
[I 2025-11-07 14:22:53,084] Trial 15 finished with value: 3.4656981727765466 and parameters: {'num_leaves': 177, 'max_depth': 6, 'min_data_in_leaf': 55, 'num_iterations': 196, 'bagging_fraction': 0.9774320313234164, 'bagging_freq': 2}. Best is trial 14 with value: 3.2774348166481557.
[I 2025-11-07 14:22:53,236] Trial 16 finished with value: 3.374440155115852 and parameters: {'num_leaves': 172, 'max_depth': 60, 'min_data_in_leaf': 48, 'num_iterations': 187, 'bagging_fraction': 0.9218035876065989, 'bagging_freq': 4}. Best is trial 14 with value: 3.2774348166481557.
[I 2025-11-07 14:22:53,330] Trial 17 finished with value: 3.594549787430422 and parameters: {'num_leaves': 242, 'max_depth': 7, 'min_data_in_leaf': 96, 'num_iterations': 183, 'bagging_fraction': 0.9331396192083916, 'bagging_freq': 4}. Best is trial 14 with value: 3.2774348166481557.
[I 2025-11-07 14:22:53,426] Trial 18 finished with value: 3.6102982327003392 and parameters: {'num_leaves': 109, 'max_depth': 117, 'min_data_in_leaf': 46, 'num_iterations': 127, 'bagging_fraction': 0.8348577478798135, 'bagging_freq': 4}. Best is trial 14 with value: 3.2774348166481557.
[I 2025-11-07 14:22:53,537] Trial 19 finished with value: 3.537982130383315 and parameters: {'num_leaves': 208, 'max_depth': 59, 'min_data_in_leaf': 80, 'num_iterations': 177, 'bagging_fraction': 0.9243396098015331, 'bagging_freq': 4}. Best is trial 14 with value: 3.2774348166481557.
[I 2025-11-07 14:22:53,595] Trial 20 finished with value: 3.947734551224533 and parameters: {'num_leaves': 176, 'max_depth': 116, 'min_data_in_leaf': 152, 'num_iterations': 130, 'bagging_fraction': 0.8232048850379108, 'bagging_freq': 3}. Best is trial 14 with value: 3.2774348166481557.
[I 2025-11-07 14:22:53,772] Trial 21 finished with value: 3.2683519784084916 and parameters: {'num_leaves': 169, 'max_depth': 51, 'min_data_in_leaf': 40, 'num_iterations': 196, 'bagging_fraction': 0.9479777779674912, 'bagging_freq': 2}. Best is trial 21 with value: 3.2683519784084916.
[I 2025-11-07 14:22:53,951] Trial 22 finished with value: 3.2018691316035346 and parameters: {'num_leaves': 173, 'max_depth': 68, 'min_data_in_leaf': 34, 'num_iterations': 178, 'bagging_fraction': 0.9393559380812958, 'bagging_freq': 3}. Best is trial 22 with value: 3.2018691316035346.
[I 2025-11-07 14:22:54,131] Trial 23 finished with value: 3.3516081280159304 and parameters: {'num_leaves': 222, 'max_depth': 27, 'min_data_in_leaf': 31, 'num_iterations': 173, 'bagging_fraction': 0.9492267918413101, 'bagging_freq': 2}. Best is trial 22 with value: 3.2018691316035346.
[I 2025-11-07 14:22:54,345] Trial 24 finished with value: 3.1617934353578363 and parameters: {'num_leaves': 190, 'max_depth': 71, 'min_data_in_leaf': 11, 'num_iterations': 135, 'bagging_fraction': 0.9536436683958104, 'bagging_freq': 3}. Best is trial 24 with value: 3.1617934353578363.
[I 2025-11-07 14:22:54,552] Trial 25 finished with value: 3.3161083924208405 and parameters: {'num_leaves': 112, 'max_depth': 69, 'min_data_in_leaf': 12, 'num_iterations': 128, 'bagging_fraction': 0.8584945398036755, 'bagging_freq': 3}. Best is trial 24 with value: 3.1617934353578363.
[I 2025-11-07 14:22:54,668] Trial 26 finished with value: 3.5908789322883305 and parameters: {'num_leaves': 197, 'max_depth': 132, 'min_data_in_leaf': 73, 'num_iterations': 144, 'bagging_fraction': 0.9040822649864813, 'bagging_freq': 3}. Best is trial 24 with value: 3.1617934353578363.
[I 2025-11-07 14:22:54,766] Trial 27 finished with value: 3.5643712552991507 and parameters: {'num_leaves': 159, 'max_depth': 77, 'min_data_in_leaf': 104, 'num_iterations': 169, 'bagging_fraction': 0.9566263518249185, 'bagging_freq': 3}. Best is trial 24 with value: 3.1617934353578363.
[I 2025-11-07 14:22:54,852] Trial 28 finished with value: 3.837770310729258 and parameters: {'num_leaves': 195, 'max_depth': 102, 'min_data_in_leaf': 35, 'num_iterations': 108, 'bagging_fraction': 0.7230344180948712, 'bagging_freq': 3}. Best is trial 24 with value: 3.1617934353578363.
[I 2025-11-07 14:22:55,005] Trial 29 finished with value: 3.55323093924367 and parameters: {'num_leaves': 55, 'max_depth': 191, 'min_data_in_leaf': 10, 'num_iterations': 116, 'bagging_fraction': 0.8156528142378011, 'bagging_freq': 2}. Best is trial 24 with value: 3.1617934353578363.
[I 2025-11-07 14:22:55,120] Trial 30 finished with value: 3.6383159778076886 and parameters: {'num_leaves': 231, 'max_depth': 136, 'min_data_in_leaf': 62, 'num_iterations': 186, 'bagging_fraction': 0.5085048733763315, 'bagging_freq': 4}. Best is trial 24 with value: 3.1617934353578363.
[I 2025-11-07 14:22:55,335] Trial 31 finished with value: 3.236385358092919 and parameters: {'num_leaves': 181, 'max_depth': 41, 'min_data_in_leaf': 38, 'num_iterations': 198, 'bagging_fraction': 0.9595612382944431, 'bagging_freq': 2}. Best is trial 24 with value: 3.1617934353578363.
[I 2025-11-07 14:22:55,522] Trial 32 finished with value: 3.197344368064135 and parameters: {'num_leaves': 160, 'max_depth': 43, 'min_data_in_leaf': 32, 'num_iterations': 188, 'bagging_fraction': 0.96016102646282, 'bagging_freq': 2}. Best is trial 24 with value: 3.1617934353578363.
[I 2025-11-07 14:22:55,676] Trial 33 finished with value: 3.2330335527167384 and parameters: {'num_leaves': 127, 'max_depth': 36, 'min_data_in_leaf': 25, 'num_iterations': 140, 'bagging_fraction': 0.96957138574753, 'bagging_freq': 3}. Best is trial 24 with value: 3.1617934353578363.
[I 2025-11-07 14:22:55,857] Trial 34 finished with value: 3.383218983138875 and parameters: {'num_leaves': 113, 'max_depth': 36, 'min_data_in_leaf': 24, 'num_iterations': 139, 'bagging_fraction': 0.9111316922474577, 'bagging_freq': 3}. Best is trial 24 with value: 3.1617934353578363.
[I 2025-11-07 14:22:55,974] Trial 35 finished with value: 3.5360153781416783 and parameters: {'num_leaves': 124, 'max_depth': 22, 'min_data_in_leaf': 71, 'num_iterations': 158, 'bagging_fraction': 0.9758189505549886, 'bagging_freq': 3}. Best is trial 24 with value: 3.1617934353578363.
[I 2025-11-07 14:22:56,099] Trial 36 finished with value: 3.860853919238995 and parameters: {'num_leaves': 90, 'max_depth': 76, 'min_data_in_leaf': 18, 'num_iterations': 95, 'bagging_fraction': 0.8987058862856597, 'bagging_freq': 4}. Best is trial 24 with value: 3.1617934353578363.
[I 2025-11-07 14:22:56,145] Trial 37 finished with value: 4.777407751629921 and parameters: {'num_leaves': 142, 'max_depth': 65, 'min_data_in_leaf': 243, 'num_iterations': 116, 'bagging_fraction': 0.843068622221608, 'bagging_freq': 3}. Best is trial 24 with value: 3.1617934353578363.
[I 2025-11-07 14:22:56,227] Trial 38 finished with value: 3.798355169434281 and parameters: {'num_leaves': 125, 'max_depth': 21, 'min_data_in_leaf': 88, 'num_iterations': 105, 'bagging_fraction': 0.9992150136053723, 'bagging_freq': 4}. Best is trial 24 with value: 3.1617934353578363.
[I 2025-11-07 14:22:56,351] Trial 39 finished with value: 3.4023711355380453 and parameters: {'num_leaves': 69, 'max_depth': 104, 'min_data_in_leaf': 30, 'num_iterations': 137, 'bagging_fraction': 0.7851840357150944, 'bagging_freq': 2}. Best is trial 24 with value: 3.1617934353578363.
[I 2025-11-07 14:22:56,426] Trial 40 finished with value: 4.646240381230011 and parameters: {'num_leaves': 156, 'max_depth': 43, 'min_data_in_leaf': 24, 'num_iterations': 86, 'bagging_fraction': 0.5856626911744915, 'bagging_freq': 3}. Best is trial 24 with value: 3.1617934353578363.
[I 2025-11-07 14:22:56,652] Trial 41 finished with value: 3.3012676281296196 and parameters: {'num_leaves': 190, 'max_depth': 39, 'min_data_in_leaf': 36, 'num_iterations': 180, 'bagging_fraction': 0.9636544395522539, 'bagging_freq': 2}. Best is trial 24 with value: 3.1617934353578363.
[I 2025-11-07 14:22:56,812] Trial 42 finished with value: 3.4398367354304527 and parameters: {'num_leaves': 213, 'max_depth': 46, 'min_data_in_leaf': 65, 'num_iterations': 167, 'bagging_fraction': 0.9398023522140762, 'bagging_freq': 2}. Best is trial 24 with value: 3.1617934353578363.
[I 2025-11-07 14:22:57,006] Trial 43 finished with value: 3.3630990000935057 and parameters: {'num_leaves': 184, 'max_depth': 256, 'min_data_in_leaf': 38, 'num_iterations': 190, 'bagging_fraction': 0.9635296525384189, 'bagging_freq': 1}. Best is trial 24 with value: 3.1617934353578363.
[I 2025-11-07 14:22:57,257] Trial 44 finished with value: 3.21035367241177 and parameters: {'num_leaves': 255, 'max_depth': 31, 'min_data_in_leaf': 19, 'num_iterations': 149, 'bagging_fraction': 0.8892913156255134, 'bagging_freq': 3}. Best is trial 24 with value: 3.1617934353578363.
[I 2025-11-07 14:22:57,467] Trial 45 finished with value: 3.1612476860837733 and parameters: {'num_leaves': 256, 'max_depth': 20, 'min_data_in_leaf': 22, 'num_iterations': 152, 'bagging_fraction': 0.887433232987156, 'bagging_freq': 3}. Best is trial 45 with value: 3.1612476860837733.
[I 2025-11-07 14:22:57,704] Trial 46 finished with value: 3.155966581928284 and parameters: {'num_leaves': 254, 'max_depth': 15, 'min_data_in_leaf': 17, 'num_iterations': 154, 'bagging_fraction': 0.8850924133798566, 'bagging_freq': 3}. Best is trial 46 with value: 3.155966581928284.
[I 2025-11-07 14:22:58,024] Trial 47 finished with value: 3.1189111554857276 and parameters: {'num_leaves': 255, 'max_depth': 18, 'min_data_in_leaf': 10, 'num_iterations': 157, 'bagging_fraction': 0.8765313708349454, 'bagging_freq': 3}. Best is trial 47 with value: 3.1189111554857276.
[I 2025-11-07 14:22:58,309] Trial 48 finished with value: 3.0582936230499485 and parameters: {'num_leaves': 254, 'max_depth': 16, 'min_data_in_leaf': 10, 'num_iterations': 155, 'bagging_fraction': 0.8606968457336602, 'bagging_freq': 3}. Best is trial 48 with value: 3.0582936230499485.
[I 2025-11-07 14:22:58,352] Trial 49 finished with value: 5.166806476621995 and parameters: {'num_leaves': 256, 'max_depth': 2, 'min_data_in_leaf': 155, 'num_iterations': 154, 'bagging_fraction': 0.8567507837291886, 'bagging_freq': 3}. Best is trial 48 with value: 3.0582936230499485.
Number of finished trials: 50
Best trial: 3.0582936230499485
num_leaves: 254
max_depth: 16
min_data_in_leaf: 10
num_iterations: 155
bagging_fraction: 0.8606968457336602
bagging_freq: 3
TUNING COMPLETE
Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/py_srmi_test_gbm.srmi/1.srmi.implicate
Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/py_srmi_test_gbm.srmi/2.srmi.implicate
Imputation using LightGBM
Running lightgbm model with parameters: {'objective': 'binary', 'num_leaves': 187, 'min_data_in_leaf': 24, 'boosting': 'gbdt', 'verbose': -1, 'metric': 'rmse', 'min_data_per_group': 25, 'num_threads': 1, 'max_depth': 58, 'bagging_fraction': 0.7790472750239636, 'bagging_freq': 4, 'seed': 2710375200}
Iterations: 155
Model: var_gbm1=f(var3, var4, var5, unrelated_1, unrelated_2, unrelated_3, unrelated_4, unrelated_5, repeat_1, bbweight__1)
Categorical features: ['var5']
┌─────────────┬─────────┬───────────┬─────────────────┬────────┬─────────────────┬────────┐ │ Feature ┆ Gain ┆ Frequency ┆ Model ┆ Model ┆ Impute ┆ Impute │ │ ┆ ┆ ┆ share (missing) ┆ mean ┆ share (missing) ┆ mean │ ╞═════════════╪═════════╪═══════════╪═════════════════╪════════╪═════════════════╪════════╡ │ var5 ┆ 0.4356 ┆ 0.02323 ┆ 0 ┆ 0.4975 ┆ 0 ┆ 0.507 │ │ var3 ┆ 0.1919 ┆ 0.1358 ┆ 0 ┆ 25.13 ┆ 0 ┆ 25.05 │ │ unrelated_5 ┆ 0.06482 ┆ 0.1391 ┆ 0 ┆ 0.4966 ┆ 0 ┆ 0.5053 │ │ unrelated_2 ┆ 0.06479 ┆ 0.1421 ┆ 0 ┆ 0.4995 ┆ 0 ┆ 0.5019 │ │ unrelated_3 ┆ 0.0629 ┆ 0.1432 ┆ 0 ┆ 0.5 ┆ 0 ┆ 0.4968 │ │ unrelated_1 ┆ 0.06186 ┆ 0.1388 ┆ 0 ┆ 0.5028 ┆ 0 ┆ 0.5015 │ │ var4 ┆ 0.06038 ┆ 0.141 ┆ 0 ┆ 0.5041 ┆ 0 ┆ 0.5103 │ │ unrelated_4 ┆ 0.05779 ┆ 0.1367 ┆ 0 ┆ 0.5007 ┆ 0 ┆ 0.5004 │ └─────────────┴─────────┴───────────┴─────────────────┴────────┴─────────────────┴────────┘
Predictions
shape: (9, 4) ┌────────────┬──────────┬──────────────┬────────────────┐ │ statistic ┆ var_gbm1 ┆ Model (yhat) ┆ Imputed (yhat) │ │ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ f64 ┆ f64 ┆ f64 │ ╞════════════╪══════════╪══════════════╪════════════════╡ │ count ┆ 7517.0 ┆ 7517.0 ┆ 2483.0 │ │ null_count ┆ 0.0 ┆ 0.0 ┆ 0.0 │ │ mean ┆ 0.526008 ┆ 0.533659 ┆ 0.548444 │ │ std ┆ null ┆ 0.485445 ┆ 0.470057 │ │ min ┆ 0.0 ┆ 0.000006 ┆ 0.000006 │ │ 25% ┆ null ┆ 0.000337 ┆ 0.000408 │ │ 50% ┆ null ┆ 0.948241 ┆ 0.889254 │ │ 75% ┆ null ┆ 0.996801 ┆ 0.993652 │ │ max ┆ 1.0 ┆ 0.999978 ┆ 0.999986 │ └────────────┴──────────┴──────────────┴────────────────┘
shape: (9, 4) ┌────────────┬──────────┬──────────────┬────────────────┐ │ statistic ┆ var_gbm1 ┆ Model (yhat) ┆ Imputed (yhat) │ │ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ f64 ┆ f64 ┆ f64 │ ╞════════════╪══════════╪══════════════╪════════════════╡ │ count ┆ 7517.0 ┆ 7517.0 ┆ 2483.0 │ │ null_count ┆ 0.0 ┆ 0.0 ┆ 0.0 │ │ mean ┆ 0.526008 ┆ 0.533659 ┆ 0.548444 │ │ std ┆ null ┆ 0.485445 ┆ 0.470057 │ │ min ┆ 0.0 ┆ 0.000006 ┆ 0.000006 │ │ 25% ┆ null ┆ 0.000337 ┆ 0.000408 │ │ 50% ┆ null ┆ 0.948241 ┆ 0.889254 │ │ 75% ┆ null ┆ 0.996801 ┆ 0.993652 │ │ max ┆ 1.0 ┆ 0.999978 ┆ 0.999986 │ └────────────┴──────────┴──────────────┴────────────────┘
Finding 10 nearest neighbors on ['___prediction']
Randomly picking one and donating ['var_gbm1']
Most common matches:
shape: (5, 2) ┌───────┬─────────┐ │ index ┆ nDonors │ │ --- ┆ --- │ │ i16 ┆ i8 │ ╞═══════╪═════════╡ │ 9014 ┆ 5 │ │ 854 ┆ 4 │ │ 1130 ┆ 4 │ │ 1249 ┆ 4 │ │ 2886 ┆ 4 │ └───────┴─────────┘
Post-imputation statistics for ['var_gbm1']
Where: None
Where (impute): col(___imp_missing_var_gbm1_1)
┌──────────┬─────────┬───────┬──────────────┬────────┬──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐ │ Variable ┆ Imputed ┆ n ┆ n (not null) ┆ mean ┆ mean (not 0) ┆ std (not 0) ┆ q10 (not 0) ┆ q25 (not 0) ┆ q50 (not 0) ┆ q75 (not 0) ┆ q90 (not 0) ┆ min (not 0) ┆ max (not 0) │ ╞══════════╪═════════╪═══════╪══════════════╪════════╪══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡ │ var_gbm1 ┆ ┆ 10000 ┆ 10000 ┆ 0.5294 ┆ 1 ┆ 0 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ │ var_gbm1 ┆ 0 ┆ 7517 ┆ 7517 ┆ 0.526 ┆ 1 ┆ 0 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ │ var_gbm1 ┆ 1 ┆ 2483 ┆ 2483 ┆ 0.5397 ┆ 1 ┆ 0 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ └──────────┴─────────┴───────┴──────────────┴────────┴──────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘
Imputation using LightGBM
Running lightgbm model with parameters: {'objective': 'regression', 'num_leaves': 31, 'min_data_in_leaf': 12, 'boosting': 'gbdt', 'verbose': -1, 'metric': 'rmse', 'min_data_per_group': 25, 'num_threads': 1, 'max_depth': 210, 'bagging_fraction': 0.764136724165049, 'bagging_freq': 2, 'seed': 19016137}
Iterations: 48
Model: var_gbm2=f(var3, var4, var5, unrelated_1, unrelated_2, unrelated_3, unrelated_4, unrelated_5, repeat_1, var_gbm1, bbweight__1)
Categorical features: ['var5']
┌─────────────┬──────────┬───────────┬─────────────────┬────────┬─────────────────┬────────┐ │ Feature ┆ Gain ┆ Frequency ┆ Model ┆ Model ┆ Impute ┆ Impute │ │ ┆ ┆ ┆ share (missing) ┆ mean ┆ share (missing) ┆ mean │ ╞═════════════╪══════════╪═══════════╪═════════════════╪════════╪═════════════════╪════════╡ │ var4 ┆ 0.4531 ┆ 0.2083 ┆ 0 ┆ 0.5031 ┆ 0 ┆ 0.5037 │ │ var3 ┆ 0.4087 ┆ 0.1347 ┆ 0 ┆ 25.38 ┆ 0 ┆ 25.37 │ │ unrelated_3 ┆ 0.03061 ┆ 0.1444 ┆ 0 ┆ 0.4989 ┆ 0 ┆ 0.4942 │ │ unrelated_5 ┆ 0.02796 ┆ 0.1306 ┆ 0 ┆ 0.4989 ┆ 0 ┆ 0.4905 │ │ unrelated_2 ┆ 0.02752 ┆ 0.1403 ┆ 0 ┆ 0.4988 ┆ 0 ┆ 0.4977 │ │ unrelated_4 ┆ 0.02264 ┆ 0.1076 ┆ 0 ┆ 0.4983 ┆ 0 ┆ 0.5033 │ │ unrelated_1 ┆ 0.02167 ┆ 0.1153 ┆ 0 ┆ 0.5036 ┆ 0 ┆ 0.497 │ │ var5 ┆ 0.007771 ┆ 0.01875 ┆ 0 ┆ 0.1407 ┆ 0 ┆ 0.1365 │ └─────────────┴──────────┴───────────┴─────────────────┴────────┴─────────────────┴────────┘
Predictions
shape: (9, 4) ┌────────────┬────────────┬──────────────┬────────────────┐ │ statistic ┆ var_gbm2 ┆ Model (yhat) ┆ Imputed (yhat) │ │ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ f64 ┆ f64 ┆ f64 │ ╞════════════╪════════════╪══════════════╪════════════════╡ │ count ┆ 3844.0 ┆ 3844.0 ┆ 1319.0 │ │ null_count ┆ 0.0 ┆ 0.0 ┆ 0.0 │ │ mean ┆ -4.598003 ┆ -4.548508 ┆ -4.365477 │ │ std ┆ 13.949589 ┆ 12.049183 ┆ 11.737758 │ │ min ┆ -55.354108 ┆ -42.065869 ┆ -39.686962 │ │ 25% ┆ -13.017682 ┆ -11.873534 ┆ -11.570185 │ │ 50% ┆ -2.211656 ┆ -1.324993 ┆ -1.042669 │ │ 75% ┆ 5.929831 ┆ 5.201742 ┆ 5.107091 │ │ max ┆ 26.213084 ┆ 16.866917 ┆ 12.539062 │ └────────────┴────────────┴──────────────┴────────────────┘
shape: (9, 4) ┌────────────┬────────────┬──────────────┬────────────────┐ │ statistic ┆ var_gbm2 ┆ Model (yhat) ┆ Imputed (yhat) │ │ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ f64 ┆ f64 ┆ f64 │ ╞════════════╪════════════╪══════════════╪════════════════╡ │ count ┆ 3844.0 ┆ 3844.0 ┆ 1319.0 │ │ null_count ┆ 0.0 ┆ 0.0 ┆ 0.0 │ │ mean ┆ -4.598003 ┆ -4.548508 ┆ -4.365477 │ │ std ┆ 13.949589 ┆ 12.049183 ┆ 11.737758 │ │ min ┆ -55.354108 ┆ -42.065869 ┆ -39.686962 │ │ 25% ┆ -13.017682 ┆ -11.873534 ┆ -11.570185 │ │ 50% ┆ -2.211656 ┆ -1.324993 ┆ -1.042669 │ │ 75% ┆ 5.929831 ┆ 5.201742 ┆ 5.107091 │ │ max ┆ 26.213084 ┆ 16.866917 ┆ 12.539062 │ └────────────┴────────────┴──────────────┴────────────────┘
Finding 10 nearest neighbors on ['___prediction']
Randomly picking one and donating ['var_gbm2']
Most common matches:
shape: (5, 2) ┌───────┬─────────┐ │ index ┆ nDonors │ │ --- ┆ --- │ │ i16 ┆ i8 │ ╞═══════╪═════════╡ │ 808 ┆ 4 │ │ 327 ┆ 3 │ │ 411 ┆ 3 │ │ 466 ┆ 3 │ │ 737 ┆ 3 │ └───────┴─────────┘
Post-imputation statistics for ['var_gbm2']
Where: col(var_gbm1)
Where (impute): col(___imp_missing_var_gbm2_2)
┌──────────┬─────────┬──────┬──────────────┬────────┬──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐ │ Variable ┆ Imputed ┆ n ┆ n (not null) ┆ mean ┆ mean (not 0) ┆ std (not 0) ┆ q10 (not 0) ┆ q25 (not 0) ┆ q50 (not 0) ┆ q75 (not 0) ┆ q90 (not 0) ┆ min (not 0) ┆ max (not 0) │ ╞══════════╪═════════╪══════╪══════════════╪════════╪══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡ │ var_gbm2 ┆ ┆ 5163 ┆ 5163 ┆ -4.635 ┆ -4.635 ┆ 13.9 ┆ -25.46 ┆ -12.95 ┆ -2.221 ┆ 5.906 ┆ 11.3 ┆ -55.35 ┆ 26.21 │ │ var_gbm2 ┆ 0 ┆ 3844 ┆ 3844 ┆ -4.598 ┆ -4.598 ┆ 13.95 ┆ -25.21 ┆ -13.03 ┆ -2.219 ┆ 5.93 ┆ 11.52 ┆ -55.35 ┆ 26.21 │ │ var_gbm2 ┆ 1 ┆ 1319 ┆ 1319 ┆ -4.743 ┆ -4.743 ┆ 13.77 ┆ -26.55 ┆ -12.69 ┆ -2.263 ┆ 5.733 ┆ 10.51 ┆ -46.63 ┆ 23.67 │ └──────────┴─────────┴──────┴──────────────┴────────┴──────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘
Updating data according to narwhals expression: when_then(all_horizontal(col(var_gbm1), ignore_nulls=False), col(var_gbm2), lit(value=0, dtype=None)).alias(name=var_gbm2)
Calling recalculate_interaction
Calling square_var
Imputation using LightGBM
Running LightGBM for q=0.25
Running lightgbm model with parameters: {'objective': 'quantile', 'num_leaves': 254, 'min_data_in_leaf': 10, 'boosting': 'gbdt', 'verbose': -1, 'metric': 'rmse', 'min_data_per_group': 25, 'num_threads': 1, 'max_depth': 16, 'bagging_fraction': 0.8606968457336602, 'bagging_freq': 3, 'alpha': 0.25, 'seed': 3256828818}
Iterations: 155
Model: var_gbm3=f(var3, var4, var5, unrelated_1, unrelated_2, unrelated_3, unrelated_4, unrelated_5, repeat_1, var_gbm1, var_gbm2, bbweight__1)
Categorical features: ['var5']
Correlation between var_gbm3 and q=0.25 prediction: 0.959
Running LightGBM for q=0.5
Running lightgbm model with parameters: {'objective': 'quantile', 'num_leaves': 254, 'min_data_in_leaf': 10, 'boosting': 'gbdt', 'verbose': -1, 'metric': 'rmse', 'min_data_per_group': 25, 'num_threads': 1, 'max_depth': 16, 'bagging_fraction': 0.8606968457336602, 'bagging_freq': 3, 'alpha': 0.5, 'seed': 203458027}
Iterations: 155
Model: var_gbm3=f(var3, var4, var5, unrelated_1, unrelated_2, unrelated_3, unrelated_4, unrelated_5, repeat_1, var_gbm1, var_gbm2, bbweight__1)
Categorical features: ['var5']
Correlation between var_gbm3 and q=0.5 prediction: 0.962
Running LightGBM for q=0.75
Running lightgbm model with parameters: {'objective': 'quantile', 'num_leaves': 254, 'min_data_in_leaf': 10, 'boosting': 'gbdt', 'verbose': -1, 'metric': 'rmse', 'min_data_per_group': 25, 'num_threads': 1, 'max_depth': 16, 'bagging_fraction': 0.8606968457336602, 'bagging_freq': 3, 'alpha': 0.75, 'seed': 1500068702}
Iterations: 155
Model: var_gbm3=f(var3, var4, var5, unrelated_1, unrelated_2, unrelated_3, unrelated_4, unrelated_5, repeat_1, var_gbm1, var_gbm2, bbweight__1)
Categorical features: ['var5']
Correlation between var_gbm3 and q=0.75 prediction: 0.961
┌──────────┬─────────┬───────┬─────────────┬────────┬───────┬────────┬──────────┬───────┐ │ Variable ┆ Sample ┆ n ┆ n (missing) ┆ mean ┆ std ┆ q25 ┆ q50 ┆ q75 │ ╞══════════╪═════════╪═══════╪═════════════╪════════╪═══════╪════════╪══════════╪═══════╡ │ p0.25 ┆ Model ┆ 3,800 ┆ 0 ┆ -6.434 ┆ 13.34 ┆ -14.23 ┆ -3.94 ┆ 3.876 │ │ ┆ Imputed ┆ 1,400 ┆ 0 ┆ -5.983 ┆ 12.78 ┆ -12.65 ┆ -3.22 ┆ 3.92 │ │ p0.5 ┆ Model ┆ 3,800 ┆ 0 ┆ -5.104 ┆ 13.44 ┆ -12.74 ┆ -2.629 ┆ 5.106 │ │ ┆ Imputed ┆ 1,400 ┆ 0 ┆ -4.48 ┆ 12.96 ┆ -11.6 ┆ -1.982 ┆ 5.221 │ │ p0.75 ┆ Model ┆ 3,800 ┆ 0 ┆ -3.664 ┆ 13.29 ┆ -11.88 ┆ -0.8593 ┆ 6.54 │ │ ┆ Imputed ┆ 1,400 ┆ 0 ┆ -2.974 ┆ 12.71 ┆ -10.17 ┆ -0.09961 ┆ 6.679 │ └──────────┴─────────┴───────┴─────────────┴────────┴───────┴────────┴──────────┴───────┘
Running LightGBM for the mean for estimating the marginal distribution
Running lightgbm model with parameters: {'objective': 'regression', 'num_leaves': 254, 'min_data_in_leaf': 10, 'boosting': 'gbdt', 'verbose': -1, 'metric': 'rmse', 'min_data_per_group': 25, 'num_threads': 1, 'max_depth': 16, 'bagging_fraction': 0.8606968457336602, 'bagging_freq': 3, 'seed': 3144578461}
Iterations: 155
Model: var_gbm3=f(var3, var4, var5, unrelated_1, unrelated_2, unrelated_3, unrelated_4, unrelated_5, repeat_1, var_gbm1, var_gbm2, bbweight__1)
Categorical features: ['var5']
┌─────────────┬───────────┬───────────┬─────────────────┬────────┬─────────────────┬────────┐ │ Feature ┆ Gain ┆ Frequency ┆ Model ┆ Model ┆ Impute ┆ Impute │ │ ┆ ┆ ┆ share (missing) ┆ mean ┆ share (missing) ┆ mean │ ╞═════════════╪═══════════╪═══════════╪═════════════════╪════════╪═════════════════╪════════╡ │ var_gbm2 ┆ 0.8891 ┆ 0.1475 ┆ 0 ┆ -4.829 ┆ 0 ┆ -4.08 │ │ var4 ┆ 0.02208 ┆ 0.1272 ┆ 0 ┆ 0.5044 ┆ 0 ┆ 0.5005 │ │ var3 ┆ 0.01669 ┆ 0.08885 ┆ 0 ┆ 25.36 ┆ 0 ┆ 25.6 │ │ unrelated_5 ┆ 0.01528 ┆ 0.1348 ┆ 0 ┆ 0.4925 ┆ 0 ┆ 0.5069 │ │ unrelated_1 ┆ 0.01501 ┆ 0.1264 ┆ 0 ┆ 0.4961 ┆ 0 ┆ 0.5189 │ │ unrelated_2 ┆ 0.01466 ┆ 0.1254 ┆ 0 ┆ 0.4966 ┆ 0 ┆ 0.5061 │ │ unrelated_4 ┆ 0.01358 ┆ 0.1208 ┆ 0 ┆ 0.5008 ┆ 0 ┆ 0.4959 │ │ unrelated_3 ┆ 0.0129 ┆ 0.1238 ┆ 0 ┆ 0.4965 ┆ 0 ┆ 0.4979 │ │ var5 ┆ 0.0006699 ┆ 0.005122 ┆ 0 ┆ 0.1387 ┆ 0 ┆ 0.1402 │ └─────────────┴───────────┴───────────┴─────────────────┴────────┴─────────────────┴────────┘
Predictions
shape: (9, 3) ┌────────────┬──────────────┬────────────────┐ │ statistic ┆ Model (yhat) ┆ Imputed (yhat) │ │ --- ┆ --- ┆ --- │ │ str ┆ f64 ┆ f64 │ ╞════════════╪══════════════╪════════════════╡ │ count ┆ 3778.0 ┆ 1384.0 │ │ null_count ┆ 0.0 ┆ 0.0 │ │ mean ┆ -4.822414 ┆ -4.257832 │ │ std ┆ 13.557975 ┆ 12.972721 │ │ min ┆ -54.107871 ┆ -48.30372 │ │ 25% ┆ -12.920288 ┆ -11.211753 │ │ 50% ┆ -2.460296 ┆ -1.642772 │ │ 75% ┆ 5.501908 ┆ 5.534492 │ │ max ┆ 23.543141 ┆ 21.098808 │ └────────────┴──────────────┴────────────────┘
Correlation between var_gbm3 and prediction: 0.981
Finding 10 nearest neighbors on ['___yhat']
Randomly picking one and donating ['var_gbm3']
Most common matches:
shape: (5, 2) ┌───────┬─────────┐ │ index ┆ nDonors │ │ --- ┆ --- │ │ i16 ┆ i8 │ ╞═══════╪═════════╡ │ 6102 ┆ 4 │ │ 7840 ┆ 4 │ │ 9364 ┆ 4 │ │ 80 ┆ 3 │ │ 802 ┆ 3 │ └───────┴─────────┘
Post-imputation statistics for ['var_gbm3']
Where: col(var_gbm1)
Where (impute): col(___imp_missing_var_gbm3_3)
┌──────────┬─────────┬──────┬──────────────┬────────┬──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐ │ Variable ┆ Imputed ┆ n ┆ n (not null) ┆ mean ┆ mean (not 0) ┆ std (not 0) ┆ q10 (not 0) ┆ q25 (not 0) ┆ q50 (not 0) ┆ q75 (not 0) ┆ q90 (not 0) ┆ min (not 0) ┆ max (not 0) │ ╞══════════╪═════════╪══════╪══════════════╪════════╪══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡ │ var_gbm3 ┆ ┆ 5162 ┆ 5162 ┆ -4.602 ┆ -4.602 ┆ 13.75 ┆ -24.62 ┆ -12.84 ┆ -2.219 ┆ 5.852 ┆ 10.99 ┆ -55.35 ┆ 26.21 │ │ var_gbm3 ┆ 0 ┆ 3778 ┆ 3778 ┆ -4.77 ┆ -4.77 ┆ 13.88 ┆ -24.9 ┆ -13.44 ┆ -2.526 ┆ 5.779 ┆ 11.23 ┆ -55.35 ┆ 26.21 │ │ var_gbm3 ┆ 1 ┆ 1384 ┆ 1384 ┆ -4.143 ┆ -4.143 ┆ 13.38 ┆ -23.68 ┆ -11.55 ┆ -1.71 ┆ 6.082 ┆ 10.71 ┆ -45.64 ┆ 24.89 │ └──────────┴─────────┴──────┴──────────────┴────────┴──────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘
Updating data according to narwhals expression: when_then(all_horizontal(col(var_gbm1), ignore_nulls=False), col(var_gbm3), lit(value=0, dtype=None)).alias(name=var_gbm3)
Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/py_srmi_test_gbm.srmi/1.srmi.implicate
Imputation using LightGBM
Running lightgbm model with parameters: {'objective': 'binary', 'num_leaves': 187, 'min_data_in_leaf': 24, 'boosting': 'gbdt', 'verbose': -1, 'metric': 'rmse', 'min_data_per_group': 25, 'num_threads': 1, 'max_depth': 58, 'bagging_fraction': 0.7790472750239636, 'bagging_freq': 4, 'seed': 1388159058}
Iterations: 155
Model: var_gbm1=f(var3, var4, var5, unrelated_1, unrelated_2, unrelated_3, unrelated_4, unrelated_5, repeat_1, var_gbm2, var_gbm3, bbweight__1)
Categorical features: ['var5']
┌─────────────┬──────────┬───────────┬─────────────────┬────────┬─────────────────┬────────┐ │ Feature ┆ Gain ┆ Frequency ┆ Model ┆ Model ┆ Impute ┆ Impute │ │ ┆ ┆ ┆ share (missing) ┆ mean ┆ share (missing) ┆ mean │ ╞═════════════╪══════════╪═══════════╪═════════════════╪════════╪═════════════════╪════════╡ │ var5 ┆ 0.5335 ┆ 0.03108 ┆ 0 ┆ 0.4999 ┆ 0 ┆ 0.507 │ │ var_gbm3 ┆ 0.2604 ┆ 0.04444 ┆ 0 ┆ -2.376 ┆ 0 ┆ -2.414 │ │ var_gbm2 ┆ 0.1372 ┆ 0.03958 ┆ 0 ┆ -2.393 ┆ 0 ┆ -2.366 │ │ var3 ┆ 0.01409 ┆ 0.156 ┆ 0 ┆ 25.11 ┆ 0 ┆ 25.05 │ │ unrelated_2 ┆ 0.01153 ┆ 0.1201 ┆ 0 ┆ 0.5001 ┆ 0 ┆ 0.5019 │ │ unrelated_5 ┆ 0.01073 ┆ 0.1331 ┆ 0 ┆ 0.4988 ┆ 0 ┆ 0.5053 │ │ unrelated_3 ┆ 0.009334 ┆ 0.1143 ┆ 0 ┆ 0.4992 ┆ 0 ┆ 0.4968 │ │ var4 ┆ 0.008329 ┆ 0.1343 ┆ 0 ┆ 0.5057 ┆ 0 ┆ 0.5103 │ │ unrelated_1 ┆ 0.008179 ┆ 0.1191 ┆ 0 ┆ 0.5024 ┆ 0 ┆ 0.5015 │ │ unrelated_4 ┆ 0.006732 ┆ 0.108 ┆ 0 ┆ 0.5007 ┆ 0 ┆ 0.5004 │ └─────────────┴──────────┴───────────┴─────────────────┴────────┴─────────────────┴────────┘
Predictions
shape: (9, 4) ┌────────────┬──────────┬──────────────┬────────────────┐ │ statistic ┆ var_gbm1 ┆ Model (yhat) ┆ Imputed (yhat) │ │ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ f64 ┆ f64 ┆ f64 │ ╞════════════╪══════════╪══════════════╪════════════════╡ │ count ┆ 10000.0 ┆ 10000.0 ┆ 2483.0 │ │ null_count ┆ 0.0 ┆ 0.0 ┆ 0.0 │ │ mean ┆ 0.5294 ┆ 0.527247 ┆ 0.529894 │ │ std ┆ null ┆ 0.498855 ┆ 0.498378 │ │ min ┆ 0.0 ┆ 1.7512e-7 ┆ 2.2687e-7 │ │ 25% ┆ null ┆ 0.000005 ┆ 0.000004 │ │ 50% ┆ null ┆ 0.999964 ┆ 0.99829 │ │ 75% ┆ null ┆ 0.999997 ┆ 0.999996 │ │ max ┆ 1.0 ┆ 1.0 ┆ 1.0 │ └────────────┴──────────┴──────────────┴────────────────┘
shape: (9, 4) ┌────────────┬──────────┬──────────────┬────────────────┐ │ statistic ┆ var_gbm1 ┆ Model (yhat) ┆ Imputed (yhat) │ │ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ f64 ┆ f64 ┆ f64 │ ╞════════════╪══════════╪══════════════╪════════════════╡ │ count ┆ 10000.0 ┆ 10000.0 ┆ 2483.0 │ │ null_count ┆ 0.0 ┆ 0.0 ┆ 0.0 │ │ mean ┆ 0.5294 ┆ 0.527247 ┆ 0.529894 │ │ std ┆ null ┆ 0.498855 ┆ 0.498378 │ │ min ┆ 0.0 ┆ 1.7512e-7 ┆ 2.2687e-7 │ │ 25% ┆ null ┆ 0.000005 ┆ 0.000004 │ │ 50% ┆ null ┆ 0.999964 ┆ 0.99829 │ │ 75% ┆ null ┆ 0.999997 ┆ 0.999996 │ │ max ┆ 1.0 ┆ 1.0 ┆ 1.0 │ └────────────┴──────────┴──────────────┴────────────────┘
Finding 10 nearest neighbors on ['___prediction']
Randomly picking one and donating ['var_gbm1']
Most common matches:
shape: (5, 2) ┌───────┬─────────┐ │ index ┆ nDonors │ │ --- ┆ --- │ │ i16 ┆ i8 │ ╞═══════╪═════════╡ │ 749 ┆ 4 │ │ 1792 ┆ 4 │ │ 1941 ┆ 4 │ │ 6183 ┆ 4 │ │ 217 ┆ 3 │ └───────┴─────────┘
Post-imputation statistics for ['var_gbm1']
Where: None
Where (impute): col(___imp_missing_var_gbm1_1)
┌──────────┬─────────┬───────┬──────────────┬────────┬──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐ │ Variable ┆ Imputed ┆ n ┆ n (not null) ┆ mean ┆ mean (not 0) ┆ std (not 0) ┆ q10 (not 0) ┆ q25 (not 0) ┆ q50 (not 0) ┆ q75 (not 0) ┆ q90 (not 0) ┆ min (not 0) ┆ max (not 0) │ ╞══════════╪═════════╪═══════╪══════════════╪════════╪══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡ │ var_gbm1 ┆ ┆ 12483 ┆ 12483 ┆ 0.5297 ┆ 1 ┆ 0 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ │ var_gbm1 ┆ 0 ┆ 10000 ┆ 10000 ┆ 0.5294 ┆ 1 ┆ 0 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ │ var_gbm1 ┆ 1 ┆ 2483 ┆ 2483 ┆ 0.5308 ┆ 1 ┆ 0 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ └──────────┴─────────┴───────┴──────────────┴────────┴──────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘
Imputation using LightGBM
Running lightgbm model with parameters: {'objective': 'regression', 'num_leaves': 31, 'min_data_in_leaf': 12, 'boosting': 'gbdt', 'verbose': -1, 'metric': 'rmse', 'min_data_per_group': 25, 'num_threads': 1, 'max_depth': 210, 'bagging_fraction': 0.764136724165049, 'bagging_freq': 2, 'seed': 943250908}
Iterations: 48
Model: var_gbm2=f(var3, var4, var5, unrelated_1, unrelated_2, unrelated_3, unrelated_4, unrelated_5, repeat_1, var_gbm1, var_gbm3, bbweight__1)
Categorical features: ['var5']
┌─────────────┬───────────┬───────────┬─────────────────┬────────┬─────────────────┬────────┐ │ Feature ┆ Gain ┆ Frequency ┆ Model ┆ Model ┆ Impute ┆ Impute │ │ ┆ ┆ ┆ share (missing) ┆ mean ┆ share (missing) ┆ mean │ ╞═════════════╪═══════════╪═══════════╪═════════════════╪════════╪═════════════════╪════════╡ │ var_gbm3 ┆ 0.9248 ┆ 0.2465 ┆ 0 ┆ -4.587 ┆ 0 ┆ -4.558 │ │ var4 ┆ 0.01867 ┆ 0.1542 ┆ 0 ┆ 0.5032 ┆ 0 ┆ 0.5037 │ │ var3 ┆ 0.01514 ┆ 0.1174 ┆ 0 ┆ 25.38 ┆ 0 ┆ 25.37 │ │ unrelated_5 ┆ 0.0101 ┆ 0.1215 ┆ 0 ┆ 0.4968 ┆ 0 ┆ 0.4905 │ │ unrelated_1 ┆ 0.008501 ┆ 0.08889 ┆ 0 ┆ 0.5019 ┆ 0 ┆ 0.497 │ │ unrelated_4 ┆ 0.00764 ┆ 0.08958 ┆ 0 ┆ 0.4996 ┆ 0 ┆ 0.5033 │ │ unrelated_3 ┆ 0.007458 ┆ 0.09514 ┆ 0 ┆ 0.4977 ┆ 0 ┆ 0.4942 │ │ unrelated_2 ┆ 0.007098 ┆ 0.08125 ┆ 0 ┆ 0.4985 ┆ 0 ┆ 0.4977 │ │ var5 ┆ 0.0006348 ┆ 0.005556 ┆ 0 ┆ 0.1396 ┆ 0 ┆ 0.1365 │ └─────────────┴───────────┴───────────┴─────────────────┴────────┴─────────────────┴────────┘
Predictions
shape: (9, 4) ┌────────────┬────────────┬──────────────┬────────────────┐ │ statistic ┆ var_gbm2 ┆ Model (yhat) ┆ Imputed (yhat) │ │ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ f64 ┆ f64 ┆ f64 │ ╞════════════╪════════════╪══════════════╪════════════════╡ │ count ┆ 5163.0 ┆ 5163.0 ┆ 1319.0 │ │ null_count ┆ 0.0 ┆ 0.0 ┆ 0.0 │ │ mean ┆ -4.635061 ┆ -4.667456 ┆ -4.608837 │ │ std ┆ 13.901884 ┆ 13.233525 ┆ 12.84553 │ │ min ┆ -55.354108 ┆ -46.854539 ┆ -43.967428 │ │ 25% ┆ -12.938026 ┆ -12.427199 ┆ -11.796807 │ │ 50% ┆ -2.221402 ┆ -1.84763 ┆ -1.510974 │ │ 75% ┆ 5.905917 ┆ 5.483734 ┆ 5.310864 │ │ max ┆ 26.213084 ┆ 18.01205 ┆ 17.331948 │ └────────────┴────────────┴──────────────┴────────────────┘
shape: (9, 4) ┌────────────┬────────────┬──────────────┬────────────────┐ │ statistic ┆ var_gbm2 ┆ Model (yhat) ┆ Imputed (yhat) │ │ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ f64 ┆ f64 ┆ f64 │ ╞════════════╪════════════╪══════════════╪════════════════╡ │ count ┆ 5163.0 ┆ 5163.0 ┆ 1319.0 │ │ null_count ┆ 0.0 ┆ 0.0 ┆ 0.0 │ │ mean ┆ -4.635061 ┆ -4.667456 ┆ -4.608837 │ │ std ┆ 13.901884 ┆ 13.233525 ┆ 12.84553 │ │ min ┆ -55.354108 ┆ -46.854539 ┆ -43.967428 │ │ 25% ┆ -12.938026 ┆ -12.427199 ┆ -11.796807 │ │ 50% ┆ -2.221402 ┆ -1.84763 ┆ -1.510974 │ │ 75% ┆ 5.905917 ┆ 5.483734 ┆ 5.310864 │ │ max ┆ 26.213084 ┆ 18.01205 ┆ 17.331948 │ └────────────┴────────────┴──────────────┴────────────────┘
Finding 10 nearest neighbors on ['___prediction']
Randomly picking one and donating ['var_gbm2']
Most common matches:
shape: (5, 2) ┌───────┬─────────┐ │ index ┆ nDonors │ │ --- ┆ --- │ │ i16 ┆ i8 │ ╞═══════╪═════════╡ │ 3599 ┆ 4 │ │ 5631 ┆ 4 │ │ 3716 ┆ 3 │ │ 4446 ┆ 3 │ │ 5865 ┆ 3 │ └───────┴─────────┘
Post-imputation statistics for ['var_gbm2']
Where: col(var_gbm1)
Where (impute): col(___imp_missing_var_gbm2_2)
┌──────────┬─────────┬──────┬──────────────┬────────┬──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐ │ Variable ┆ Imputed ┆ n ┆ n (not null) ┆ mean ┆ mean (not 0) ┆ std (not 0) ┆ q10 (not 0) ┆ q25 (not 0) ┆ q50 (not 0) ┆ q75 (not 0) ┆ q90 (not 0) ┆ min (not 0) ┆ max (not 0) │ ╞══════════╪═════════╪══════╪══════════════╪════════╪══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡ │ var_gbm2 ┆ ┆ 6482 ┆ 6482 ┆ -4.677 ┆ -4.677 ┆ 13.81 ┆ -25.41 ┆ -12.84 ┆ -2.221 ┆ 5.762 ┆ 10.99 ┆ -55.35 ┆ 26.21 │ │ var_gbm2 ┆ 0 ┆ 5163 ┆ 5163 ┆ -4.635 ┆ -4.635 ┆ 13.9 ┆ -25.46 ┆ -12.95 ┆ -2.221 ┆ 5.906 ┆ 11.3 ┆ -55.35 ┆ 26.21 │ │ var_gbm2 ┆ 1 ┆ 1319 ┆ 1319 ┆ -4.841 ┆ -4.841 ┆ 13.44 ┆ -25.37 ┆ -12.55 ┆ -2.221 ┆ 5.511 ┆ 9.916 ┆ -47.25 ┆ 20.75 │ └──────────┴─────────┴──────┴──────────────┴────────┴──────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘
Updating data according to narwhals expression: when_then(all_horizontal(col(var_gbm1), ignore_nulls=False), col(var_gbm2), lit(value=0, dtype=None)).alias(name=var_gbm2)
Calling recalculate_interaction
Calling square_var
Imputation using LightGBM
Running LightGBM for q=0.25
Running lightgbm model with parameters: {'objective': 'quantile', 'num_leaves': 254, 'min_data_in_leaf': 10, 'boosting': 'gbdt', 'verbose': -1, 'metric': 'rmse', 'min_data_per_group': 25, 'num_threads': 1, 'max_depth': 16, 'bagging_fraction': 0.8606968457336602, 'bagging_freq': 3, 'alpha': 0.25, 'seed': 4141536487}
Iterations: 155
Model: var_gbm3=f(var3, var4, var5, unrelated_1, unrelated_2, unrelated_3, unrelated_4, unrelated_5, repeat_1, var_gbm1, var_gbm2, bbweight__1)
Categorical features: ['var5']
Correlation between var_gbm3 and q=0.25 prediction: 0.977
Running LightGBM for q=0.5
Running lightgbm model with parameters: {'objective': 'quantile', 'num_leaves': 254, 'min_data_in_leaf': 10, 'boosting': 'gbdt', 'verbose': -1, 'metric': 'rmse', 'min_data_per_group': 25, 'num_threads': 1, 'max_depth': 16, 'bagging_fraction': 0.8606968457336602, 'bagging_freq': 3, 'alpha': 0.5, 'seed': 3220936329}
Iterations: 155
Model: var_gbm3=f(var3, var4, var5, unrelated_1, unrelated_2, unrelated_3, unrelated_4, unrelated_5, repeat_1, var_gbm1, var_gbm2, bbweight__1)
Categorical features: ['var5']
Correlation between var_gbm3 and q=0.5 prediction: 0.979
Running LightGBM for q=0.75
Running lightgbm model with parameters: {'objective': 'quantile', 'num_leaves': 254, 'min_data_in_leaf': 10, 'boosting': 'gbdt', 'verbose': -1, 'metric': 'rmse', 'min_data_per_group': 25, 'num_threads': 1, 'max_depth': 16, 'bagging_fraction': 0.8606968457336602, 'bagging_freq': 3, 'alpha': 0.75, 'seed': 35649414}
Iterations: 155
Model: var_gbm3=f(var3, var4, var5, unrelated_1, unrelated_2, unrelated_3, unrelated_4, unrelated_5, repeat_1, var_gbm1, var_gbm2, bbweight__1)
Categorical features: ['var5']
Correlation between var_gbm3 and q=0.75 prediction: 0.978
┌──────────┬─────────┬───────┬─────────────┬────────┬───────┬────────┬─────────┬───────┐ │ Variable ┆ Sample ┆ n ┆ n (missing) ┆ mean ┆ std ┆ q25 ┆ q50 ┆ q75 │ ╞══════════╪═════════╪═══════╪═════════════╪════════╪═══════╪════════╪═════════╪═══════╡ │ p0.25 ┆ Model ┆ 5,200 ┆ 0 ┆ -5.747 ┆ 13.53 ┆ -13.58 ┆ -3.567 ┆ 4.682 │ │ ┆ Imputed ┆ 1,400 ┆ 0 ┆ -5.418 ┆ 13.33 ┆ -12.65 ┆ -3.018 ┆ 4.832 │ │ p0.5 ┆ Model ┆ 5,200 ┆ 0 ┆ -4.783 ┆ 13.49 ┆ -12.66 ┆ -2.386 ┆ 5.44 │ │ ┆ Imputed ┆ 1,400 ┆ 0 ┆ -4.336 ┆ 13.25 ┆ -11.96 ┆ -1.808 ┆ 5.737 │ │ p0.75 ┆ Model ┆ 5,200 ┆ 0 ┆ -3.878 ┆ 13.31 ┆ -11.92 ┆ -0.9881 ┆ 6.064 │ │ ┆ Imputed ┆ 1,400 ┆ 0 ┆ -3.211 ┆ 13.07 ┆ -10.48 ┆ -0.3465 ┆ 6.503 │ └──────────┴─────────┴───────┴─────────────┴────────┴───────┴────────┴─────────┴───────┘
Running LightGBM for the mean for estimating the marginal distribution
Running lightgbm model with parameters: {'objective': 'regression', 'num_leaves': 254, 'min_data_in_leaf': 10, 'boosting': 'gbdt', 'verbose': -1, 'metric': 'rmse', 'min_data_per_group': 25, 'num_threads': 1, 'max_depth': 16, 'bagging_fraction': 0.8606968457336602, 'bagging_freq': 3, 'seed': 2120542166}
Iterations: 155
Model: var_gbm3=f(var3, var4, var5, unrelated_1, unrelated_2, unrelated_3, unrelated_4, unrelated_5, repeat_1, var_gbm1, var_gbm2, bbweight__1)
Categorical features: ['var5']
┌─────────────┬───────────┬───────────┬─────────────────┬────────┬─────────────────┬────────┐ │ Feature ┆ Gain ┆ Frequency ┆ Model ┆ Model ┆ Impute ┆ Impute │ │ ┆ ┆ ┆ share (missing) ┆ mean ┆ share (missing) ┆ mean │ ╞═════════════╪═══════════╪═══════════╪═════════════════╪════════╪═════════════════╪════════╡ │ var_gbm2 ┆ 0.9469 ┆ 0.1449 ┆ 0 ┆ -4.653 ┆ 0 ┆ -4.132 │ │ var4 ┆ 0.009902 ┆ 0.1282 ┆ 0 ┆ 0.5033 ┆ 0 ┆ 0.5005 │ │ unrelated_1 ┆ 0.008325 ┆ 0.124 ┆ 0 ┆ 0.5022 ┆ 0 ┆ 0.5189 │ │ unrelated_5 ┆ 0.008096 ┆ 0.1418 ┆ 0 ┆ 0.4964 ┆ 0 ┆ 0.5069 │ │ var3 ┆ 0.007347 ┆ 0.09138 ┆ 0 ┆ 25.42 ┆ 0 ┆ 25.6 │ │ unrelated_2 ┆ 0.007093 ┆ 0.1239 ┆ 0 ┆ 0.4992 ┆ 0 ┆ 0.5061 │ │ unrelated_4 ┆ 0.006022 ┆ 0.1195 ┆ 0 ┆ 0.4995 ┆ 0 ┆ 0.4959 │ │ unrelated_3 ┆ 0.005947 ┆ 0.1204 ┆ 0 ┆ 0.4969 ┆ 0 ┆ 0.4979 │ │ var5 ┆ 0.0004014 ┆ 0.006033 ┆ 0 ┆ 0.1391 ┆ 0 ┆ 0.1402 │ └─────────────┴───────────┴───────────┴─────────────────┴────────┴─────────────────┴────────┘
Predictions
shape: (9, 3) ┌────────────┬──────────────┬────────────────┐ │ statistic ┆ Model (yhat) ┆ Imputed (yhat) │ │ --- ┆ --- ┆ --- │ │ str ┆ f64 ┆ f64 │ ╞════════════╪══════════════╪════════════════╡ │ count ┆ 5162.0 ┆ 1384.0 │ │ null_count ┆ 0.0 ┆ 0.0 │ │ mean ┆ -4.6048 ┆ -4.127228 │ │ std ┆ 13.564561 ┆ 13.245154 │ │ min ┆ -54.731083 ┆ -46.133707 │ │ 25% ┆ -12.766623 ┆ -11.574559 │ │ 50% ┆ -2.232471 ┆ -1.510268 │ │ 75% ┆ 5.6369 ┆ 5.761787 │ │ max ┆ 24.119096 ┆ 20.067176 │ └────────────┴──────────────┴────────────────┘
Correlation between var_gbm3 and prediction: 0.990
Finding 10 nearest neighbors on ['___yhat']
Randomly picking one and donating ['var_gbm3']
Most common matches:
shape: (5, 2) ┌───────┬─────────┐ │ index ┆ nDonors │ │ --- ┆ --- │ │ i16 ┆ i8 │ ╞═══════╪═════════╡ │ 9681 ┆ 4 │ │ 92 ┆ 3 │ │ 907 ┆ 3 │ │ 1822 ┆ 3 │ │ 1971 ┆ 3 │ └───────┴─────────┘
Post-imputation statistics for ['var_gbm3']
Where: col(var_gbm1)
Where (impute): col(___imp_missing_var_gbm3_3)
┌──────────┬─────────┬──────┬──────────────┬────────┬──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐ │ Variable ┆ Imputed ┆ n ┆ n (not null) ┆ mean ┆ mean (not 0) ┆ std (not 0) ┆ q10 (not 0) ┆ q25 (not 0) ┆ q50 (not 0) ┆ q75 (not 0) ┆ q90 (not 0) ┆ min (not 0) ┆ max (not 0) │ ╞══════════╪═════════╪══════╪══════════════╪════════╪══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡ │ var_gbm3 ┆ ┆ 6546 ┆ 6546 ┆ -4.485 ┆ -4.485 ┆ 13.68 ┆ -24.34 ┆ -12.43 ┆ -2.089 ┆ 5.906 ┆ 11.01 ┆ -55.35 ┆ 26.21 │ │ var_gbm3 ┆ 0 ┆ 5162 ┆ 5162 ┆ -4.602 ┆ -4.602 ┆ 13.75 ┆ -24.62 ┆ -12.84 ┆ -2.221 ┆ 5.852 ┆ 10.99 ┆ -55.35 ┆ 26.21 │ │ var_gbm3 ┆ 1 ┆ 1384 ┆ 1384 ┆ -4.048 ┆ -4.048 ┆ 13.42 ┆ -23.75 ┆ -11.55 ┆ -1.684 ┆ 6.168 ┆ 11.04 ┆ -46.63 ┆ 24.89 │ └──────────┴─────────┴──────┴──────────────┴────────┴──────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘
Updating data according to narwhals expression: when_then(all_horizontal(col(var_gbm1), ignore_nulls=False), col(var_gbm3), lit(value=0, dtype=None)).alias(name=var_gbm3)
var_gbm1
var_gbm2
var_gbm3
Final Estimates by Iteration
Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/py_srmi_test_gbm.srmi/1.srmi.implicate
Imputation using LightGBM
Running lightgbm model with parameters: {'objective': 'binary', 'num_leaves': 187, 'min_data_in_leaf': 24, 'boosting': 'gbdt', 'verbose': -1, 'metric': 'rmse', 'min_data_per_group': 25, 'num_threads': 1, 'max_depth': 58, 'bagging_fraction': 0.7790472750239636, 'bagging_freq': 4, 'seed': 1205842559}
Iterations: 155
Model: var_gbm1=f(var3, var4, var5, unrelated_1, unrelated_2, unrelated_3, unrelated_4, unrelated_5, repeat_1, bbweight__1)
Categorical features: ['var5']
┌─────────────┬─────────┬───────────┬─────────────────┬────────┬─────────────────┬────────┐ │ Feature ┆ Gain ┆ Frequency ┆ Model ┆ Model ┆ Impute ┆ Impute │ │ ┆ ┆ ┆ share (missing) ┆ mean ┆ share (missing) ┆ mean │ ╞═════════════╪═════════╪═══════════╪═════════════════╪════════╪═════════════════╪════════╡ │ var5 ┆ 0.4517 ┆ 0.02617 ┆ 0 ┆ 0.4975 ┆ 0 ┆ 0.507 │ │ var3 ┆ 0.1814 ┆ 0.1392 ┆ 0 ┆ 25.13 ┆ 0 ┆ 25.05 │ │ unrelated_3 ┆ 0.06485 ┆ 0.1469 ┆ 0 ┆ 0.5 ┆ 0 ┆ 0.4968 │ │ unrelated_2 ┆ 0.06471 ┆ 0.1409 ┆ 0 ┆ 0.4995 ┆ 0 ┆ 0.5019 │ │ unrelated_5 ┆ 0.06199 ┆ 0.1321 ┆ 0 ┆ 0.4966 ┆ 0 ┆ 0.5053 │ │ unrelated_1 ┆ 0.05879 ┆ 0.1383 ┆ 0 ┆ 0.5028 ┆ 0 ┆ 0.5015 │ │ var4 ┆ 0.05857 ┆ 0.1366 ┆ 0 ┆ 0.5041 ┆ 0 ┆ 0.5103 │ │ unrelated_4 ┆ 0.05799 ┆ 0.1396 ┆ 0 ┆ 0.5007 ┆ 0 ┆ 0.5004 │ └─────────────┴─────────┴───────────┴─────────────────┴────────┴─────────────────┴────────┘
Predictions
shape: (9, 4) ┌────────────┬──────────┬──────────────┬────────────────┐ │ statistic ┆ var_gbm1 ┆ Model (yhat) ┆ Imputed (yhat) │ │ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ f64 ┆ f64 ┆ f64 │ ╞════════════╪══════════╪══════════════╪════════════════╡ │ count ┆ 7517.0 ┆ 7517.0 ┆ 2483.0 │ │ null_count ┆ 0.0 ┆ 0.0 ┆ 0.0 │ │ mean ┆ 0.526008 ┆ 0.537339 ┆ 0.554951 │ │ std ┆ null ┆ 0.485952 ┆ 0.469908 │ │ min ┆ 0.0 ┆ 0.000003 ┆ 0.000003 │ │ 25% ┆ null ┆ 0.000291 ┆ 0.000379 │ │ 50% ┆ null ┆ 0.957086 ┆ 0.905536 │ │ 75% ┆ null ┆ 0.997146 ┆ 0.993723 │ │ max ┆ 1.0 ┆ 0.999974 ┆ 0.999936 │ └────────────┴──────────┴──────────────┴────────────────┘
shape: (9, 4) ┌────────────┬──────────┬──────────────┬────────────────┐ │ statistic ┆ var_gbm1 ┆ Model (yhat) ┆ Imputed (yhat) │ │ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ f64 ┆ f64 ┆ f64 │ ╞════════════╪══════════╪══════════════╪════════════════╡ │ count ┆ 7517.0 ┆ 7517.0 ┆ 2483.0 │ │ null_count ┆ 0.0 ┆ 0.0 ┆ 0.0 │ │ mean ┆ 0.526008 ┆ 0.537339 ┆ 0.554951 │ │ std ┆ null ┆ 0.485952 ┆ 0.469908 │ │ min ┆ 0.0 ┆ 0.000003 ┆ 0.000003 │ │ 25% ┆ null ┆ 0.000291 ┆ 0.000379 │ │ 50% ┆ null ┆ 0.957086 ┆ 0.905536 │ │ 75% ┆ null ┆ 0.997146 ┆ 0.993723 │ │ max ┆ 1.0 ┆ 0.999974 ┆ 0.999936 │ └────────────┴──────────┴──────────────┴────────────────┘
Finding 10 nearest neighbors on ['___prediction']
Randomly picking one and donating ['var_gbm1']
Most common matches:
shape: (5, 2) ┌───────┬─────────┐ │ index ┆ nDonors │ │ --- ┆ --- │ │ i16 ┆ i8 │ ╞═══════╪═════════╡ │ 1940 ┆ 6 │ │ 5684 ┆ 6 │ │ 2878 ┆ 5 │ │ 3300 ┆ 5 │ │ 306 ┆ 4 │ └───────┴─────────┘
Post-imputation statistics for ['var_gbm1']
Where: None
Where (impute): col(___imp_missing_var_gbm1_1)
┌──────────┬─────────┬───────┬──────────────┬────────┬──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐ │ Variable ┆ Imputed ┆ n ┆ n (not null) ┆ mean ┆ mean (not 0) ┆ std (not 0) ┆ q10 (not 0) ┆ q25 (not 0) ┆ q50 (not 0) ┆ q75 (not 0) ┆ q90 (not 0) ┆ min (not 0) ┆ max (not 0) │ ╞══════════╪═════════╪═══════╪══════════════╪════════╪══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡ │ var_gbm1 ┆ ┆ 10000 ┆ 10000 ┆ 0.5298 ┆ 1 ┆ 0 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ │ var_gbm1 ┆ 0 ┆ 7517 ┆ 7517 ┆ 0.526 ┆ 1 ┆ 0 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ │ var_gbm1 ┆ 1 ┆ 2483 ┆ 2483 ┆ 0.5413 ┆ 1 ┆ 0 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ └──────────┴─────────┴───────┴──────────────┴────────┴──────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘
Imputation using LightGBM
Running lightgbm model with parameters: {'objective': 'regression', 'num_leaves': 31, 'min_data_in_leaf': 12, 'boosting': 'gbdt', 'verbose': -1, 'metric': 'rmse', 'min_data_per_group': 25, 'num_threads': 1, 'max_depth': 210, 'bagging_fraction': 0.764136724165049, 'bagging_freq': 2, 'seed': 3440548605}
Iterations: 48
Model: var_gbm2=f(var3, var4, var5, unrelated_1, unrelated_2, unrelated_3, unrelated_4, unrelated_5, repeat_1, var_gbm1, bbweight__1)
Categorical features: ['var5']
┌─────────────┬──────────┬───────────┬─────────────────┬────────┬─────────────────┬────────┐ │ Feature ┆ Gain ┆ Frequency ┆ Model ┆ Model ┆ Impute ┆ Impute │ │ ┆ ┆ ┆ share (missing) ┆ mean ┆ share (missing) ┆ mean │ ╞═════════════╪══════════╪═══════════╪═════════════════╪════════╪═════════════════╪════════╡ │ var4 ┆ 0.461 ┆ 0.2236 ┆ 0 ┆ 0.5031 ┆ 0 ┆ 0.5026 │ │ var3 ┆ 0.4122 ┆ 0.1438 ┆ 0 ┆ 25.3 ┆ 0 ┆ 25.31 │ │ unrelated_3 ┆ 0.02671 ┆ 0.1236 ┆ 0 ┆ 0.4974 ┆ 0 ┆ 0.4941 │ │ unrelated_5 ┆ 0.02668 ┆ 0.1417 ┆ 0 ┆ 0.4997 ┆ 0 ┆ 0.4893 │ │ unrelated_4 ┆ 0.0241 ┆ 0.1167 ┆ 0 ┆ 0.4979 ┆ 0 ┆ 0.5031 │ │ unrelated_2 ┆ 0.02234 ┆ 0.1153 ┆ 0 ┆ 0.4984 ┆ 0 ┆ 0.5012 │ │ unrelated_1 ┆ 0.02132 ┆ 0.1167 ┆ 0 ┆ 0.5039 ┆ 0 ┆ 0.4965 │ │ var5 ┆ 0.005669 ┆ 0.01875 ┆ 0 ┆ 0.1392 ┆ 0 ┆ 0.1377 │ └─────────────┴──────────┴───────────┴─────────────────┴────────┴─────────────────┴────────┘
Predictions
shape: (9, 4) ┌────────────┬────────────┬──────────────┬────────────────┐ │ statistic ┆ var_gbm2 ┆ Model (yhat) ┆ Imputed (yhat) │ │ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ f64 ┆ f64 ┆ f64 │ ╞════════════╪════════════╪══════════════╪════════════════╡ │ count ┆ 3830.0 ┆ 3830.0 ┆ 1329.0 │ │ null_count ┆ 0.0 ┆ 0.0 ┆ 0.0 │ │ mean ┆ -4.542367 ┆ -4.540266 ┆ -4.410762 │ │ std ┆ 13.915317 ┆ 12.205725 ┆ 11.972651 │ │ min ┆ -55.354108 ┆ -41.999281 ┆ -41.011209 │ │ 25% ┆ -12.956951 ┆ -12.273657 ┆ -12.285328 │ │ 50% ┆ -2.148085 ┆ -0.921354 ┆ -0.88534 │ │ 75% ┆ 5.935527 ┆ 5.496126 ┆ 5.428947 │ │ max ┆ 26.213084 ┆ 15.445 ┆ 13.22821 │ └────────────┴────────────┴──────────────┴────────────────┘
shape: (9, 4) ┌────────────┬────────────┬──────────────┬────────────────┐ │ statistic ┆ var_gbm2 ┆ Model (yhat) ┆ Imputed (yhat) │ │ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ f64 ┆ f64 ┆ f64 │ ╞════════════╪════════════╪══════════════╪════════════════╡ │ count ┆ 3830.0 ┆ 3830.0 ┆ 1329.0 │ │ null_count ┆ 0.0 ┆ 0.0 ┆ 0.0 │ │ mean ┆ -4.542367 ┆ -4.540266 ┆ -4.410762 │ │ std ┆ 13.915317 ┆ 12.205725 ┆ 11.972651 │ │ min ┆ -55.354108 ┆ -41.999281 ┆ -41.011209 │ │ 25% ┆ -12.956951 ┆ -12.273657 ┆ -12.285328 │ │ 50% ┆ -2.148085 ┆ -0.921354 ┆ -0.88534 │ │ 75% ┆ 5.935527 ┆ 5.496126 ┆ 5.428947 │ │ max ┆ 26.213084 ┆ 15.445 ┆ 13.22821 │ └────────────┴────────────┴──────────────┴────────────────┘
Finding 10 nearest neighbors on ['___prediction']
Randomly picking one and donating ['var_gbm2']
Most common matches:
shape: (5, 2) ┌───────┬─────────┐ │ index ┆ nDonors │ │ --- ┆ --- │ │ i16 ┆ i8 │ ╞═══════╪═════════╡ │ 3303 ┆ 4 │ │ 1258 ┆ 3 │ │ 1462 ┆ 3 │ │ 2400 ┆ 3 │ │ 2501 ┆ 3 │ └───────┴─────────┘
Post-imputation statistics for ['var_gbm2']
Where: col(var_gbm1)
Where (impute): col(___imp_missing_var_gbm2_2)
┌──────────┬─────────┬──────┬──────────────┬────────┬──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐ │ Variable ┆ Imputed ┆ n ┆ n (not null) ┆ mean ┆ mean (not 0) ┆ std (not 0) ┆ q10 (not 0) ┆ q25 (not 0) ┆ q50 (not 0) ┆ q75 (not 0) ┆ q90 (not 0) ┆ min (not 0) ┆ max (not 0) │ ╞══════════╪═════════╪══════╪══════════════╪════════╪══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡ │ var_gbm2 ┆ ┆ 5159 ┆ 5159 ┆ -4.48 ┆ -4.48 ┆ 13.86 ┆ -24.87 ┆ -12.75 ┆ -2.116 ┆ 5.965 ┆ 11.43 ┆ -55.35 ┆ 26.21 │ │ var_gbm2 ┆ 0 ┆ 3830 ┆ 3830 ┆ -4.542 ┆ -4.542 ┆ 13.92 ┆ -25.1 ┆ -12.96 ┆ -2.151 ┆ 5.936 ┆ 11.51 ┆ -55.35 ┆ 26.21 │ │ var_gbm2 ┆ 1 ┆ 1329 ┆ 1329 ┆ -4.3 ┆ -4.3 ┆ 13.68 ┆ -24.31 ┆ -12.08 ┆ -1.97 ┆ 6.003 ┆ 11.38 ┆ -47.54 ┆ 24.31 │ └──────────┴─────────┴──────┴──────────────┴────────┴──────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘
Updating data according to narwhals expression: when_then(all_horizontal(col(var_gbm1), ignore_nulls=False), col(var_gbm2), lit(value=0, dtype=None)).alias(name=var_gbm2)
Calling recalculate_interaction
Calling square_var
Imputation using LightGBM
Running LightGBM for q=0.25
Running lightgbm model with parameters: {'objective': 'quantile', 'num_leaves': 254, 'min_data_in_leaf': 10, 'boosting': 'gbdt', 'verbose': -1, 'metric': 'rmse', 'min_data_per_group': 25, 'num_threads': 1, 'max_depth': 16, 'bagging_fraction': 0.8606968457336602, 'bagging_freq': 3, 'alpha': 0.25, 'seed': 3683905913}
Iterations: 155
Model: var_gbm3=f(var3, var4, var5, unrelated_1, unrelated_2, unrelated_3, unrelated_4, unrelated_5, repeat_1, var_gbm1, var_gbm2, bbweight__1)
Categorical features: ['var5']
Correlation between var_gbm3 and q=0.25 prediction: 0.960
Running LightGBM for q=0.5
Running lightgbm model with parameters: {'objective': 'quantile', 'num_leaves': 254, 'min_data_in_leaf': 10, 'boosting': 'gbdt', 'verbose': -1, 'metric': 'rmse', 'min_data_per_group': 25, 'num_threads': 1, 'max_depth': 16, 'bagging_fraction': 0.8606968457336602, 'bagging_freq': 3, 'alpha': 0.5, 'seed': 1872215464}
Iterations: 155
Model: var_gbm3=f(var3, var4, var5, unrelated_1, unrelated_2, unrelated_3, unrelated_4, unrelated_5, repeat_1, var_gbm1, var_gbm2, bbweight__1)
Categorical features: ['var5']
Correlation between var_gbm3 and q=0.5 prediction: 0.962
Running LightGBM for q=0.75
Running lightgbm model with parameters: {'objective': 'quantile', 'num_leaves': 254, 'min_data_in_leaf': 10, 'boosting': 'gbdt', 'verbose': -1, 'metric': 'rmse', 'min_data_per_group': 25, 'num_threads': 1, 'max_depth': 16, 'bagging_fraction': 0.8606968457336602, 'bagging_freq': 3, 'alpha': 0.75, 'seed': 3236850685}
Iterations: 155
Model: var_gbm3=f(var3, var4, var5, unrelated_1, unrelated_2, unrelated_3, unrelated_4, unrelated_5, repeat_1, var_gbm1, var_gbm2, bbweight__1)
Categorical features: ['var5']
Correlation between var_gbm3 and q=0.75 prediction: 0.962
┌──────────┬─────────┬───────┬─────────────┬────────┬───────┬────────┬─────────┬───────┐ │ Variable ┆ Sample ┆ n ┆ n (missing) ┆ mean ┆ std ┆ q25 ┆ q50 ┆ q75 │ ╞══════════╪═════════╪═══════╪═════════════╪════════╪═══════╪════════╪═════════╪═══════╡ │ p0.25 ┆ Model ┆ 3,800 ┆ 0 ┆ -6.252 ┆ 13.36 ┆ -14.47 ┆ -4.026 ┆ 4.158 │ │ ┆ Imputed ┆ 1,400 ┆ 0 ┆ -5.493 ┆ 13.1 ┆ -12.48 ┆ -3.02 ┆ 4.577 │ │ p0.5 ┆ Model ┆ 3,800 ┆ 0 ┆ -4.903 ┆ 13.39 ┆ -13.12 ┆ -2.43 ┆ 5.292 │ │ ┆ Imputed ┆ 1,400 ┆ 0 ┆ -4.046 ┆ 13.07 ┆ -11.02 ┆ -1.277 ┆ 5.674 │ │ p0.75 ┆ Model ┆ 3,800 ┆ 0 ┆ -3.543 ┆ 13.3 ┆ -11.84 ┆ -0.7401 ┆ 6.51 │ │ ┆ Imputed ┆ 1,400 ┆ 0 ┆ -2.66 ┆ 12.91 ┆ -9.523 ┆ 0.02555 ┆ 6.992 │ └──────────┴─────────┴───────┴─────────────┴────────┴───────┴────────┴─────────┴───────┘
Running LightGBM for the mean for estimating the marginal distribution
Running lightgbm model with parameters: {'objective': 'regression', 'num_leaves': 254, 'min_data_in_leaf': 10, 'boosting': 'gbdt', 'verbose': -1, 'metric': 'rmse', 'min_data_per_group': 25, 'num_threads': 1, 'max_depth': 16, 'bagging_fraction': 0.8606968457336602, 'bagging_freq': 3, 'seed': 929078180}
Iterations: 155
Model: var_gbm3=f(var3, var4, var5, unrelated_1, unrelated_2, unrelated_3, unrelated_4, unrelated_5, repeat_1, var_gbm1, var_gbm2, bbweight__1)
Categorical features: ['var5']
┌─────────────┬───────────┬───────────┬─────────────────┬────────┬─────────────────┬────────┐ │ Feature ┆ Gain ┆ Frequency ┆ Model ┆ Model ┆ Impute ┆ Impute │ │ ┆ ┆ ┆ share (missing) ┆ mean ┆ share (missing) ┆ mean │ ╞═════════════╪═══════════╪═══════════╪═════════════════╪════════╪═════════════════╪════════╡ │ var_gbm2 ┆ 0.8917 ┆ 0.152 ┆ 0 ┆ -4.725 ┆ 0 ┆ -3.772 │ │ var4 ┆ 0.02018 ┆ 0.1313 ┆ 0 ┆ 0.5041 ┆ 0 ┆ 0.5 │ │ var3 ┆ 0.01591 ┆ 0.09044 ┆ 0 ┆ 25.28 ┆ 0 ┆ 25.43 │ │ unrelated_3 ┆ 0.01582 ┆ 0.1258 ┆ 0 ┆ 0.4954 ┆ 0 ┆ 0.4959 │ │ unrelated_1 ┆ 0.01527 ┆ 0.1273 ┆ 0 ┆ 0.4973 ┆ 0 ┆ 0.5153 │ │ unrelated_5 ┆ 0.01488 ┆ 0.1216 ┆ 0 ┆ 0.4924 ┆ 0 ┆ 0.5078 │ │ unrelated_2 ┆ 0.01404 ┆ 0.1284 ┆ 0 ┆ 0.4967 ┆ 0 ┆ 0.5069 │ │ unrelated_4 ┆ 0.01167 ┆ 0.1181 ┆ 0 ┆ 0.5002 ┆ 0 ┆ 0.4964 │ │ var5 ┆ 0.0005509 ┆ 0.00495 ┆ 0 ┆ 0.1384 ┆ 0 ┆ 0.1402 │ └─────────────┴───────────┴───────────┴─────────────────┴────────┴─────────────────┴────────┘
Predictions
shape: (9, 3) ┌────────────┬──────────────┬────────────────┐ │ statistic ┆ Model (yhat) ┆ Imputed (yhat) │ │ --- ┆ --- ┆ --- │ │ str ┆ f64 ┆ f64 │ ╞════════════╪══════════════╪════════════════╡ │ count ┆ 3780.0 ┆ 1377.0 │ │ null_count ┆ 0.0 ┆ 0.0 │ │ mean ┆ -4.651083 ┆ -3.790008 │ │ std ┆ 13.548941 ┆ 12.980755 │ │ min ┆ -54.556417 ┆ -46.164594 │ │ 25% ┆ -13.117661 ┆ -10.949918 │ │ 50% ┆ -2.320233 ┆ -1.279349 │ │ 75% ┆ 5.680911 ┆ 5.895982 │ │ max ┆ 24.698852 ┆ 20.244597 │ └────────────┴──────────────┴────────────────┘
Correlation between var_gbm3 and prediction: 0.983
Finding 10 nearest neighbors on ['___yhat']
Randomly picking one and donating ['var_gbm3']
Most common matches:
shape: (5, 2) ┌───────┬─────────┐ │ index ┆ nDonors │ │ --- ┆ --- │ │ i16 ┆ i8 │ ╞═══════╪═════════╡ │ 269 ┆ 4 │ │ 558 ┆ 4 │ │ 1490 ┆ 4 │ │ 7879 ┆ 4 │ │ 1065 ┆ 3 │ └───────┴─────────┘
Post-imputation statistics for ['var_gbm3']
Where: col(var_gbm1)
Where (impute): col(___imp_missing_var_gbm3_3)
┌──────────┬─────────┬──────┬──────────────┬────────┬──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐ │ Variable ┆ Imputed ┆ n ┆ n (not null) ┆ mean ┆ mean (not 0) ┆ std (not 0) ┆ q10 (not 0) ┆ q25 (not 0) ┆ q50 (not 0) ┆ q75 (not 0) ┆ q90 (not 0) ┆ min (not 0) ┆ max (not 0) │ ╞══════════╪═════════╪══════╪══════════════╪════════╪══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡ │ var_gbm3 ┆ ┆ 5157 ┆ 5157 ┆ -4.506 ┆ -4.506 ┆ 13.68 ┆ -24.37 ┆ -12.85 ┆ -2.133 ┆ 5.859 ┆ 11.04 ┆ -55.35 ┆ 26.21 │ │ var_gbm3 ┆ 0 ┆ 3780 ┆ 3780 ┆ -4.73 ┆ -4.73 ┆ 13.84 ┆ -24.68 ┆ -13.44 ┆ -2.471 ┆ 5.784 ┆ 11.19 ┆ -55.35 ┆ 26.21 │ │ var_gbm3 ┆ 1 ┆ 1377 ┆ 1377 ┆ -3.891 ┆ -3.891 ┆ 13.22 ┆ -23.34 ┆ -11.07 ┆ -1.65 ┆ 6.149 ┆ 10.73 ┆ -48.11 ┆ 20.03 │ └──────────┴─────────┴──────┴──────────────┴────────┴──────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘
Updating data according to narwhals expression: when_then(all_horizontal(col(var_gbm1), ignore_nulls=False), col(var_gbm3), lit(value=0, dtype=None)).alias(name=var_gbm3)
Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/py_srmi_test_gbm.srmi/2.srmi.implicate
Imputation using LightGBM
Running lightgbm model with parameters: {'objective': 'binary', 'num_leaves': 187, 'min_data_in_leaf': 24, 'boosting': 'gbdt', 'verbose': -1, 'metric': 'rmse', 'min_data_per_group': 25, 'num_threads': 1, 'max_depth': 58, 'bagging_fraction': 0.7790472750239636, 'bagging_freq': 4, 'seed': 3536826232}
Iterations: 155
Model: var_gbm1=f(var3, var4, var5, unrelated_1, unrelated_2, unrelated_3, unrelated_4, unrelated_5, repeat_1, var_gbm2, var_gbm3, bbweight__1)
Categorical features: ['var5']
┌─────────────┬──────────┬───────────┬─────────────────┬────────┬─────────────────┬────────┐ │ Feature ┆ Gain ┆ Frequency ┆ Model ┆ Model ┆ Impute ┆ Impute │ │ ┆ ┆ ┆ share (missing) ┆ mean ┆ share (missing) ┆ mean │ ╞═════════════╪══════════╪═══════════╪═════════════════╪════════╪═════════════════╪════════╡ │ var5 ┆ 0.5335 ┆ 0.04183 ┆ 0 ┆ 0.4999 ┆ 0 ┆ 0.507 │ │ var_gbm2 ┆ 0.2494 ┆ 0.04021 ┆ 0 ┆ -2.311 ┆ 0 ┆ -2.215 │ │ var_gbm3 ┆ 0.15 ┆ 0.0346 ┆ 0 ┆ -2.324 ┆ 0 ┆ -2.327 │ │ var3 ┆ 0.01305 ┆ 0.1597 ┆ 0 ┆ 25.11 ┆ 0 ┆ 25.05 │ │ unrelated_2 ┆ 0.01095 ┆ 0.1253 ┆ 0 ┆ 0.5001 ┆ 0 ┆ 0.5019 │ │ unrelated_1 ┆ 0.01062 ┆ 0.1293 ┆ 0 ┆ 0.5024 ┆ 0 ┆ 0.5015 │ │ unrelated_5 ┆ 0.009145 ┆ 0.116 ┆ 0 ┆ 0.4988 ┆ 0 ┆ 0.5053 │ │ unrelated_4 ┆ 0.00866 ┆ 0.1181 ┆ 0 ┆ 0.5007 ┆ 0 ┆ 0.5004 │ │ unrelated_3 ┆ 0.008155 ┆ 0.1106 ┆ 0 ┆ 0.4992 ┆ 0 ┆ 0.4968 │ │ var4 ┆ 0.006495 ┆ 0.1243 ┆ 0 ┆ 0.5057 ┆ 0 ┆ 0.5103 │ └─────────────┴──────────┴───────────┴─────────────────┴────────┴─────────────────┴────────┘
Predictions
shape: (9, 4) ┌────────────┬──────────┬──────────────┬────────────────┐ │ statistic ┆ var_gbm1 ┆ Model (yhat) ┆ Imputed (yhat) │ │ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ f64 ┆ f64 ┆ f64 │ ╞════════════╪══════════╪══════════════╪════════════════╡ │ count ┆ 10000.0 ┆ 10000.0 ┆ 2483.0 │ │ null_count ┆ 0.0 ┆ 0.0 ┆ 0.0 │ │ mean ┆ 0.5298 ┆ 0.527404 ┆ 0.530729 │ │ std ┆ null ┆ 0.498864 ┆ 0.498213 │ │ min ┆ 0.0 ┆ 1.1741e-7 ┆ 1.7849e-7 │ │ 25% ┆ null ┆ 0.000004 ┆ 0.000004 │ │ 50% ┆ null ┆ 0.999955 ┆ 0.99848 │ │ 75% ┆ null ┆ 0.999997 ┆ 0.999997 │ │ max ┆ 1.0 ┆ 1.0 ┆ 1.0 │ └────────────┴──────────┴──────────────┴────────────────┘
shape: (9, 4) ┌────────────┬──────────┬──────────────┬────────────────┐ │ statistic ┆ var_gbm1 ┆ Model (yhat) ┆ Imputed (yhat) │ │ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ f64 ┆ f64 ┆ f64 │ ╞════════════╪══════════╪══════════════╪════════════════╡ │ count ┆ 10000.0 ┆ 10000.0 ┆ 2483.0 │ │ null_count ┆ 0.0 ┆ 0.0 ┆ 0.0 │ │ mean ┆ 0.5298 ┆ 0.527404 ┆ 0.530729 │ │ std ┆ null ┆ 0.498864 ┆ 0.498213 │ │ min ┆ 0.0 ┆ 1.1741e-7 ┆ 1.7849e-7 │ │ 25% ┆ null ┆ 0.000004 ┆ 0.000004 │ │ 50% ┆ null ┆ 0.999955 ┆ 0.99848 │ │ 75% ┆ null ┆ 0.999997 ┆ 0.999997 │ │ max ┆ 1.0 ┆ 1.0 ┆ 1.0 │ └────────────┴──────────┴──────────────┴────────────────┘
Finding 10 nearest neighbors on ['___prediction']
Randomly picking one and donating ['var_gbm1']
Most common matches:
shape: (5, 2) ┌───────┬─────────┐ │ index ┆ nDonors │ │ --- ┆ --- │ │ i16 ┆ i8 │ ╞═══════╪═════════╡ │ 2996 ┆ 4 │ │ 4700 ┆ 4 │ │ 5916 ┆ 4 │ │ 883 ┆ 3 │ │ 1161 ┆ 3 │ └───────┴─────────┘
Post-imputation statistics for ['var_gbm1']
Where: None
Where (impute): col(___imp_missing_var_gbm1_1)
┌──────────┬─────────┬───────┬──────────────┬────────┬──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐ │ Variable ┆ Imputed ┆ n ┆ n (not null) ┆ mean ┆ mean (not 0) ┆ std (not 0) ┆ q10 (not 0) ┆ q25 (not 0) ┆ q50 (not 0) ┆ q75 (not 0) ┆ q90 (not 0) ┆ min (not 0) ┆ max (not 0) │ ╞══════════╪═════════╪═══════╪══════════════╪════════╪══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡ │ var_gbm1 ┆ ┆ 12483 ┆ 12483 ┆ 0.5303 ┆ 1 ┆ 0 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ │ var_gbm1 ┆ 0 ┆ 10000 ┆ 10000 ┆ 0.5298 ┆ 1 ┆ 0 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ │ var_gbm1 ┆ 1 ┆ 2483 ┆ 2483 ┆ 0.5324 ┆ 1 ┆ 0 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ └──────────┴─────────┴───────┴──────────────┴────────┴──────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘
Imputation using LightGBM
Running lightgbm model with parameters: {'objective': 'regression', 'num_leaves': 31, 'min_data_in_leaf': 12, 'boosting': 'gbdt', 'verbose': -1, 'metric': 'rmse', 'min_data_per_group': 25, 'num_threads': 1, 'max_depth': 210, 'bagging_fraction': 0.764136724165049, 'bagging_freq': 2, 'seed': 1027470263}
Iterations: 48
Model: var_gbm2=f(var3, var4, var5, unrelated_1, unrelated_2, unrelated_3, unrelated_4, unrelated_5, repeat_1, var_gbm1, var_gbm3, bbweight__1)
Categorical features: ['var5']
┌─────────────┬───────────┬───────────┬─────────────────┬────────┬─────────────────┬────────┐ │ Feature ┆ Gain ┆ Frequency ┆ Model ┆ Model ┆ Impute ┆ Impute │ │ ┆ ┆ ┆ share (missing) ┆ mean ┆ share (missing) ┆ mean │ ╞═════════════╪═══════════╪═══════════╪═════════════════╪════════╪═════════════════╪════════╡ │ var_gbm3 ┆ 0.9371 ┆ 0.2486 ┆ 0 ┆ -4.492 ┆ 0 ┆ -4.39 │ │ var4 ┆ 0.01645 ┆ 0.1625 ┆ 0 ┆ 0.503 ┆ 0 ┆ 0.5026 │ │ var3 ┆ 0.01336 ┆ 0.1208 ┆ 0 ┆ 25.3 ┆ 0 ┆ 25.32 │ │ unrelated_5 ┆ 0.007613 ┆ 0.09931 ┆ 0 ┆ 0.497 ┆ 0 ┆ 0.4893 │ │ unrelated_1 ┆ 0.007501 ┆ 0.09444 ┆ 0 ┆ 0.502 ┆ 0 ┆ 0.4967 │ │ unrelated_3 ┆ 0.007421 ┆ 0.1042 ┆ 0 ┆ 0.4965 ┆ 0 ┆ 0.4939 │ │ unrelated_2 ┆ 0.00575 ┆ 0.09167 ┆ 0 ┆ 0.4991 ┆ 0 ┆ 0.5013 │ │ unrelated_4 ┆ 0.004442 ┆ 0.07361 ┆ 0 ┆ 0.4992 ┆ 0 ┆ 0.5031 │ │ var5 ┆ 0.0003558 ┆ 0.004861 ┆ 0 ┆ 0.1388 ┆ 0 ┆ 0.1383 │ └─────────────┴───────────┴───────────┴─────────────────┴────────┴─────────────────┴────────┘
Predictions
shape: (9, 4) ┌────────────┬────────────┬──────────────┬────────────────┐ │ statistic ┆ var_gbm2 ┆ Model (yhat) ┆ Imputed (yhat) │ │ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ f64 ┆ f64 ┆ f64 │ ╞════════════╪════════════╪══════════════╪════════════════╡ │ count ┆ 5159.0 ┆ 5159.0 ┆ 1330.0 │ │ null_count ┆ 0.0 ┆ 0.0 ┆ 0.0 │ │ mean ┆ -4.480018 ┆ -4.54218 ┆ -4.438259 │ │ std ┆ 13.855245 ┆ 13.250079 ┆ 12.875819 │ │ min ┆ -55.354108 ┆ -45.260551 ┆ -44.289345 │ │ 25% ┆ -12.722219 ┆ -12.686983 ┆ -12.760332 │ │ 50% ┆ -2.116323 ┆ -1.808201 ┆ -1.603065 │ │ 75% ┆ 5.96493 ┆ 5.64095 ┆ 5.458843 │ │ max ┆ 26.213084 ┆ 20.256866 ┆ 20.256866 │ └────────────┴────────────┴──────────────┴────────────────┘
shape: (9, 4) ┌────────────┬────────────┬──────────────┬────────────────┐ │ statistic ┆ var_gbm2 ┆ Model (yhat) ┆ Imputed (yhat) │ │ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ f64 ┆ f64 ┆ f64 │ ╞════════════╪════════════╪══════════════╪════════════════╡ │ count ┆ 5159.0 ┆ 5159.0 ┆ 1330.0 │ │ null_count ┆ 0.0 ┆ 0.0 ┆ 0.0 │ │ mean ┆ -4.480018 ┆ -4.54218 ┆ -4.438259 │ │ std ┆ 13.855245 ┆ 13.250079 ┆ 12.875819 │ │ min ┆ -55.354108 ┆ -45.260551 ┆ -44.289345 │ │ 25% ┆ -12.722219 ┆ -12.686983 ┆ -12.760332 │ │ 50% ┆ -2.116323 ┆ -1.808201 ┆ -1.603065 │ │ 75% ┆ 5.96493 ┆ 5.64095 ┆ 5.458843 │ │ max ┆ 26.213084 ┆ 20.256866 ┆ 20.256866 │ └────────────┴────────────┴──────────────┴────────────────┘
Finding 10 nearest neighbors on ['___prediction']
Randomly picking one and donating ['var_gbm2']
Most common matches:
shape: (5, 2) ┌───────┬─────────┐ │ index ┆ nDonors │ │ --- ┆ --- │ │ i16 ┆ i8 │ ╞═══════╪═════════╡ │ 116 ┆ 3 │ │ 406 ┆ 3 │ │ 757 ┆ 3 │ │ 1621 ┆ 3 │ │ 2411 ┆ 3 │ └───────┴─────────┘
Post-imputation statistics for ['var_gbm2']
Where: col(var_gbm1)
Where (impute): col(___imp_missing_var_gbm2_2)
┌──────────┬─────────┬──────┬──────────────┬────────┬──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐ │ Variable ┆ Imputed ┆ n ┆ n (not null) ┆ mean ┆ mean (not 0) ┆ std (not 0) ┆ q10 (not 0) ┆ q25 (not 0) ┆ q50 (not 0) ┆ q75 (not 0) ┆ q90 (not 0) ┆ min (not 0) ┆ max (not 0) │ ╞══════════╪═════════╪══════╪══════════════╪════════╪══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡ │ var_gbm2 ┆ ┆ 6489 ┆ 6489 ┆ -4.478 ┆ -4.478 ┆ 13.75 ┆ -24.55 ┆ -12.66 ┆ -2.11 ┆ 5.912 ┆ 11.38 ┆ -55.35 ┆ 26.21 │ │ var_gbm2 ┆ 0 ┆ 5159 ┆ 5159 ┆ -4.48 ┆ -4.48 ┆ 13.86 ┆ -24.87 ┆ -12.75 ┆ -2.116 ┆ 5.965 ┆ 11.43 ┆ -55.35 ┆ 26.21 │ │ var_gbm2 ┆ 1 ┆ 1330 ┆ 1330 ┆ -4.469 ┆ -4.469 ┆ 13.36 ┆ -23.55 ┆ -12.51 ┆ -2.048 ┆ 5.694 ┆ 10.97 ┆ -45.64 ┆ 24.05 │ └──────────┴─────────┴──────┴──────────────┴────────┴──────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘
Updating data according to narwhals expression: when_then(all_horizontal(col(var_gbm1), ignore_nulls=False), col(var_gbm2), lit(value=0, dtype=None)).alias(name=var_gbm2)
Calling recalculate_interaction
Calling square_var
Imputation using LightGBM
Running LightGBM for q=0.25
Running lightgbm model with parameters: {'objective': 'quantile', 'num_leaves': 254, 'min_data_in_leaf': 10, 'boosting': 'gbdt', 'verbose': -1, 'metric': 'rmse', 'min_data_per_group': 25, 'num_threads': 1, 'max_depth': 16, 'bagging_fraction': 0.8606968457336602, 'bagging_freq': 3, 'alpha': 0.25, 'seed': 855549572}
Iterations: 155
Model: var_gbm3=f(var3, var4, var5, unrelated_1, unrelated_2, unrelated_3, unrelated_4, unrelated_5, repeat_1, var_gbm1, var_gbm2, bbweight__1)
Categorical features: ['var5']
Correlation between var_gbm3 and q=0.25 prediction: 0.979
Running LightGBM for q=0.5
Running lightgbm model with parameters: {'objective': 'quantile', 'num_leaves': 254, 'min_data_in_leaf': 10, 'boosting': 'gbdt', 'verbose': -1, 'metric': 'rmse', 'min_data_per_group': 25, 'num_threads': 1, 'max_depth': 16, 'bagging_fraction': 0.8606968457336602, 'bagging_freq': 3, 'alpha': 0.5, 'seed': 2708163395}
Iterations: 155
Model: var_gbm3=f(var3, var4, var5, unrelated_1, unrelated_2, unrelated_3, unrelated_4, unrelated_5, repeat_1, var_gbm1, var_gbm2, bbweight__1)
Categorical features: ['var5']
Correlation between var_gbm3 and q=0.5 prediction: 0.982
Running LightGBM for q=0.75
Running lightgbm model with parameters: {'objective': 'quantile', 'num_leaves': 254, 'min_data_in_leaf': 10, 'boosting': 'gbdt', 'verbose': -1, 'metric': 'rmse', 'min_data_per_group': 25, 'num_threads': 1, 'max_depth': 16, 'bagging_fraction': 0.8606968457336602, 'bagging_freq': 3, 'alpha': 0.75, 'seed': 1105109657}
Iterations: 155
Model: var_gbm3=f(var3, var4, var5, unrelated_1, unrelated_2, unrelated_3, unrelated_4, unrelated_5, repeat_1, var_gbm1, var_gbm2, bbweight__1)
Categorical features: ['var5']
Correlation between var_gbm3 and q=0.75 prediction: 0.982
┌──────────┬─────────┬───────┬─────────────┬────────┬───────┬────────┬─────────┬───────┐ │ Variable ┆ Sample ┆ n ┆ n (missing) ┆ mean ┆ std ┆ q25 ┆ q50 ┆ q75 │ ╞══════════╪═════════╪═══════╪═════════════╪════════╪═══════╪════════╪═════════╪═══════╡ │ p0.25 ┆ Model ┆ 5,200 ┆ 0 ┆ -5.664 ┆ 13.47 ┆ -13.68 ┆ -3.339 ┆ 4.676 │ │ ┆ Imputed ┆ 1,400 ┆ 0 ┆ -5.266 ┆ 13.34 ┆ -12.48 ┆ -2.783 ┆ 4.651 │ │ p0.5 ┆ Model ┆ 5,200 ┆ 0 ┆ -4.744 ┆ 13.46 ┆ -12.98 ┆ -2.436 ┆ 5.469 │ │ ┆ Imputed ┆ 1,400 ┆ 0 ┆ -4.198 ┆ 13.23 ┆ -11.34 ┆ -1.786 ┆ 5.713 │ │ p0.75 ┆ Model ┆ 5,200 ┆ 0 ┆ -3.806 ┆ 13.4 ┆ -12.01 ┆ -1.26 ┆ 6.258 │ │ ┆ Imputed ┆ 1,400 ┆ 0 ┆ -3.124 ┆ 13.14 ┆ -10.47 ┆ -0.4171 ┆ 6.564 │ └──────────┴─────────┴───────┴─────────────┴────────┴───────┴────────┴─────────┴───────┘
Running LightGBM for the mean for estimating the marginal distribution
Running lightgbm model with parameters: {'objective': 'regression', 'num_leaves': 254, 'min_data_in_leaf': 10, 'boosting': 'gbdt', 'verbose': -1, 'metric': 'rmse', 'min_data_per_group': 25, 'num_threads': 1, 'max_depth': 16, 'bagging_fraction': 0.8606968457336602, 'bagging_freq': 3, 'seed': 2643076450}
Iterations: 155
Model: var_gbm3=f(var3, var4, var5, unrelated_1, unrelated_2, unrelated_3, unrelated_4, unrelated_5, repeat_1, var_gbm1, var_gbm2, bbweight__1)
Categorical features: ['var5']
┌─────────────┬───────────┬───────────┬─────────────────┬────────┬─────────────────┬────────┐ │ Feature ┆ Gain ┆ Frequency ┆ Model ┆ Model ┆ Impute ┆ Impute │ │ ┆ ┆ ┆ share (missing) ┆ mean ┆ share (missing) ┆ mean │ ╞═════════════╪═══════════╪═══════════╪═════════════════╪════════╪═════════════════╪════════╡ │ var_gbm2 ┆ 0.9483 ┆ 0.1382 ┆ 0 ┆ -4.52 ┆ 0 ┆ -3.874 │ │ var4 ┆ 0.01315 ┆ 0.1378 ┆ 0 ┆ 0.503 ┆ 0 ┆ 0.5 │ │ unrelated_1 ┆ 0.007578 ┆ 0.1326 ┆ 0 ┆ 0.5021 ┆ 0 ┆ 0.5154 │ │ unrelated_5 ┆ 0.006756 ┆ 0.1226 ┆ 0 ┆ 0.4965 ┆ 0 ┆ 0.5078 │ │ unrelated_2 ┆ 0.006338 ┆ 0.1245 ┆ 0 ┆ 0.4993 ┆ 0 ┆ 0.5067 │ │ unrelated_3 ┆ 0.00619 ┆ 0.1277 ┆ 0 ┆ 0.4956 ┆ 0 ┆ 0.4959 │ │ unrelated_4 ┆ 0.005837 ┆ 0.1195 ┆ 0 ┆ 0.4993 ┆ 0 ┆ 0.4967 │ │ var3 ┆ 0.005469 ┆ 0.09132 ┆ 0 ┆ 25.33 ┆ 0 ┆ 25.46 │ │ var5 ┆ 0.0003827 ┆ 0.005703 ┆ 0 ┆ 0.1387 ┆ 0 ┆ 0.1402 │ └─────────────┴───────────┴───────────┴─────────────────┴────────┴─────────────────┴────────┘
Predictions
shape: (9, 3) ┌────────────┬──────────────┬────────────────┐ │ statistic ┆ Model (yhat) ┆ Imputed (yhat) │ │ --- ┆ --- ┆ --- │ │ str ┆ f64 ┆ f64 │ ╞════════════╪══════════════╪════════════════╡ │ count ┆ 5156.0 ┆ 1377.0 │ │ null_count ┆ 0.0 ┆ 0.0 │ │ mean ┆ -4.535234 ┆ -3.946013 │ │ std ┆ 13.590616 ┆ 13.27388 │ │ min ┆ -55.016256 ┆ -47.798694 │ │ 25% ┆ -12.719967 ┆ -11.328449 │ │ 50% ┆ -2.024334 ┆ -1.46454 │ │ 75% ┆ 5.736683 ┆ 5.846448 │ │ max ┆ 26.153377 ┆ 20.521069 │ └────────────┴──────────────┴────────────────┘
Correlation between var_gbm3 and prediction: 0.991
Finding 10 nearest neighbors on ['___yhat']
Randomly picking one and donating ['var_gbm3']
Most common matches:
shape: (5, 2) ┌───────┬─────────┐ │ index ┆ nDonors │ │ --- ┆ --- │ │ i16 ┆ i8 │ ╞═══════╪═════════╡ │ 0 ┆ 3 │ │ 15 ┆ 3 │ │ 2644 ┆ 3 │ │ 3175 ┆ 3 │ │ 6130 ┆ 3 │ └───────┴─────────┘
Post-imputation statistics for ['var_gbm3']
Where: col(var_gbm1)
Where (impute): col(___imp_missing_var_gbm3_3)
┌──────────┬─────────┬──────┬──────────────┬────────┬──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐ │ Variable ┆ Imputed ┆ n ┆ n (not null) ┆ mean ┆ mean (not 0) ┆ std (not 0) ┆ q10 (not 0) ┆ q25 (not 0) ┆ q50 (not 0) ┆ q75 (not 0) ┆ q90 (not 0) ┆ min (not 0) ┆ max (not 0) │ ╞══════════╪═════════╪══════╪══════════════╪════════╪══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡ │ var_gbm3 ┆ ┆ 6533 ┆ 6533 ┆ -4.375 ┆ -4.375 ┆ 13.61 ┆ -24.14 ┆ -12.4 ┆ -1.984 ┆ 5.912 ┆ 11.04 ┆ -55.35 ┆ 26.21 │ │ var_gbm3 ┆ 0 ┆ 5156 ┆ 5156 ┆ -4.507 ┆ -4.507 ┆ 13.68 ┆ -24.37 ┆ -12.85 ┆ -2.133 ┆ 5.862 ┆ 11.04 ┆ -55.35 ┆ 26.21 │ │ var_gbm3 ┆ 1 ┆ 1377 ┆ 1377 ┆ -3.883 ┆ -3.883 ┆ 13.34 ┆ -23.13 ┆ -11.07 ┆ -1.494 ┆ 6.085 ┆ 11.04 ┆ -48.11 ┆ 22.89 │ └──────────┴─────────┴──────┴──────────────┴────────┴──────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘
Updating data according to narwhals expression: when_then(all_horizontal(col(var_gbm1), ignore_nulls=False), col(var_gbm3), lit(value=0, dtype=None)).alias(name=var_gbm3)
var_gbm1
var_gbm2
var_gbm3
Final Estimates by Iteration
Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/py_srmi_test_gbm.srmi/2.srmi.implicate
It's automatically saved and can be loaded with (see path_model above):
path_model = f'{config.path_temp_files}/py_srmi_test_gbm'
srmi = SRMI.load(path_model)
In [8]:
logger.info("Get the results")
_ = df_list = srmi.df_implicates
Get the results
In [9]:
logger.info("\n\nLook at the original")
_ = summary(df_original,detailed=True,drb_round=True)
logger.info("\n\nLook at the imputes")
_ = df_list.pipe(summary,detailed=True,drb_round=True)
logger.info("\n\nLook at the imputes | var_gbm1 == 0")
_ = df_list.filter(~nw.col("var_gbm1")).pipe(summary,detailed=True,drb_round=True)
logger.info("\n\nLook at the imputes | var_gbm1 == 1")
_ = df_list.filter(nw.col("var_gbm1")).pipe(summary,detailed=True,drb_round=True)
Look at the original
┌──────────────┬────────┬─────────────┬─────────┬─────────┬───────────┬─────────┬─────────┬─────────┬─────────┐ │ Variable ┆ n ┆ n (missing) ┆ mean ┆ std ┆ min ┆ q25 ┆ q50 ┆ q75 ┆ max │ ╞══════════════╪════════╪═════════════╪═════════╪═════════╪═══════════╪═════════╪═════════╪═════════╪═════════╡ │ _row_index_ ┆ 10,000 ┆ 0 ┆ 5,000.0 ┆ 2,887.0 ┆ 0.0 ┆ 2,500.0 ┆ 5,000.0 ┆ 7,500.0 ┆ 9,999.0 │ │ index ┆ 10,000 ┆ 0 ┆ 5,000.0 ┆ 2,887.0 ┆ 0.0 ┆ 2,500.0 ┆ 5,000.0 ┆ 7,500.0 ┆ 9,999.0 │ │ year ┆ 10,000 ┆ 0 ┆ 2,018.0 ┆ 1.416 ┆ 2,016.0 ┆ 2,017.0 ┆ 2,018.0 ┆ 2,019.0 ┆ 2,020.0 │ │ month ┆ 10,000 ┆ 0 ┆ 6.514 ┆ 3.432 ┆ 1.0 ┆ 4.0 ┆ 6.0 ┆ 9.0 ┆ 12.0 │ │ var2 ┆ 10,000 ┆ 0 ┆ 4.978 ┆ 3.155 ┆ 0.0 ┆ 2.0 ┆ 5.0 ┆ 8.0 ┆ 10.0 │ │ var3 ┆ 10,000 ┆ 0 ┆ 25.11 ┆ 14.75 ┆ 0.0 ┆ 12.0 ┆ 25.0 ┆ 38.0 ┆ 50.0 │ │ var4 ┆ 10,000 ┆ 0 ┆ 0.5057 ┆ 0.2879 ┆ 0.000027 ┆ 0.2559 ┆ 0.5086 ┆ 0.7543 ┆ 1.0 │ │ var5 ┆ 10,000 ┆ 0 ┆ 0.4999 ┆ 0.5 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 1.0 ┆ 1.0 │ │ unrelated_1 ┆ 10,000 ┆ 0 ┆ 0.5024 ┆ 0.2884 ┆ 0.0001191 ┆ 0.253 ┆ 0.5037 ┆ 0.753 ┆ 1.0 │ │ unrelated_2 ┆ 10,000 ┆ 0 ┆ 0.5001 ┆ 0.2876 ┆ 0.000049 ┆ 0.2531 ┆ 0.4975 ┆ 0.7487 ┆ 0.9995 │ │ unrelated_3 ┆ 10,000 ┆ 0 ┆ 0.4992 ┆ 0.2888 ┆ 0.000129 ┆ 0.2513 ┆ 0.4961 ┆ 0.7508 ┆ 0.9999 │ │ unrelated_4 ┆ 10,000 ┆ 0 ┆ 0.5007 ┆ 0.2887 ┆ 0.0001329 ┆ 0.2501 ┆ 0.5006 ┆ 0.752 ┆ 1.0 │ │ unrelated_5 ┆ 10,000 ┆ 0 ┆ 0.4988 ┆ 0.289 ┆ 0.000071 ┆ 0.2496 ┆ 0.4969 ┆ 0.75 ┆ 0.9999 │ │ missing_gbm1 ┆ 10,000 ┆ 0 ┆ 0.5026 ┆ 0.289 ┆ 0.000006 ┆ 0.2517 ┆ 0.5074 ┆ 0.7523 ┆ 1.0 │ │ missing_gbm2 ┆ 10,000 ┆ 0 ┆ 0.5032 ┆ 0.2905 ┆ 0.000005 ┆ 0.2531 ┆ 0.5041 ┆ 0.756 ┆ 1.0 │ │ missing_gbm3 ┆ 10,000 ┆ 0 ┆ 0.4939 ┆ 0.2907 ┆ 0.0001079 ┆ 0.2408 ┆ 0.4907 ┆ 0.7412 ┆ 0.9998 │ │ var_gbm1 ┆ 10,000 ┆ 0 ┆ 0.5229 ┆ 0.4995 ┆ 0.0 ┆ 0.0 ┆ 1.0 ┆ 1.0 ┆ 1.0 │ │ var_gbm2 ┆ 10,000 ┆ 0 ┆ -2.402 ┆ 10.33 ┆ -55.35 ┆ -3.023 ┆ 0.0 ┆ 0.0 ┆ 26.21 │ │ var_gbm3 ┆ 10,000 ┆ 0 ┆ -2.402 ┆ 10.33 ┆ -55.35 ┆ -3.023 ┆ 0.0 ┆ 0.0 ┆ 26.21 │ └──────────────┴────────┴─────────────┴─────────┴─────────┴───────────┴─────────┴─────────┴─────────┴─────────┘
Look at the imputes
┌─────────────┬────────┬─────────────┬─────────┬─────────┬───────────┬─────────┬─────────┬─────────┬─────────┐ │ Variable ┆ n ┆ n (missing) ┆ mean ┆ std ┆ min ┆ q25 ┆ q50 ┆ q75 ┆ max │ ╞═════════════╪════════╪═════════════╪═════════╪═════════╪═══════════╪═════════╪═════════╪═════════╪═════════╡ │ index ┆ 10,000 ┆ 0 ┆ 5,000.0 ┆ 2,887.0 ┆ 0.0 ┆ 2,500.0 ┆ 5,000.0 ┆ 7,500.0 ┆ 9,999.0 │ │ _row_index_ ┆ 10,000 ┆ 0 ┆ 5,000.0 ┆ 2,887.0 ┆ 0.0 ┆ 2,500.0 ┆ 5,000.0 ┆ 7,500.0 ┆ 9,999.0 │ │ year ┆ 10,000 ┆ 0 ┆ 2,018.0 ┆ 1.416 ┆ 2,016.0 ┆ 2,017.0 ┆ 2,018.0 ┆ 2,019.0 ┆ 2,020.0 │ │ month ┆ 10,000 ┆ 0 ┆ 6.514 ┆ 3.432 ┆ 1.0 ┆ 4.0 ┆ 6.0 ┆ 9.0 ┆ 12.0 │ │ var2 ┆ 10,000 ┆ 0 ┆ 4.978 ┆ 3.155 ┆ 0.0 ┆ 2.0 ┆ 5.0 ┆ 8.0 ┆ 10.0 │ │ var3 ┆ 10,000 ┆ 0 ┆ 25.11 ┆ 14.75 ┆ 0.0 ┆ 12.0 ┆ 25.0 ┆ 38.0 ┆ 50.0 │ │ var4 ┆ 10,000 ┆ 0 ┆ 0.5057 ┆ 0.2879 ┆ 0.000027 ┆ 0.2559 ┆ 0.5086 ┆ 0.7543 ┆ 1.0 │ │ var5 ┆ 10,000 ┆ 0 ┆ 0.4999 ┆ 0.5 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 1.0 ┆ 1.0 │ │ unrelated_1 ┆ 10,000 ┆ 0 ┆ 0.5024 ┆ 0.2884 ┆ 0.0001191 ┆ 0.253 ┆ 0.5037 ┆ 0.753 ┆ 1.0 │ │ unrelated_2 ┆ 10,000 ┆ 0 ┆ 0.5001 ┆ 0.2876 ┆ 0.000049 ┆ 0.2531 ┆ 0.4975 ┆ 0.7487 ┆ 0.9995 │ │ unrelated_3 ┆ 10,000 ┆ 0 ┆ 0.4992 ┆ 0.2888 ┆ 0.000129 ┆ 0.2513 ┆ 0.4961 ┆ 0.7508 ┆ 0.9999 │ │ unrelated_4 ┆ 10,000 ┆ 0 ┆ 0.5007 ┆ 0.2887 ┆ 0.0001329 ┆ 0.2501 ┆ 0.5006 ┆ 0.752 ┆ 1.0 │ │ unrelated_5 ┆ 10,000 ┆ 0 ┆ 0.4988 ┆ 0.289 ┆ 0.000071 ┆ 0.2496 ┆ 0.4969 ┆ 0.75 ┆ 0.9999 │ │ repeat_1 ┆ 10,000 ┆ 0 ┆ 0.5024 ┆ 0.2884 ┆ 0.0001191 ┆ 0.253 ┆ 0.5037 ┆ 0.753 ┆ 1.0 │ │ var_gbm1 ┆ 10,000 ┆ 0 ┆ 0.5272 ┆ 0.4993 ┆ 0.0 ┆ 0.0 ┆ 1.0 ┆ 1.0 ┆ 1.0 │ │ var_gbm2 ┆ 10,000 ┆ 0 ┆ -2.406 ┆ 10.2 ┆ -55.35 ┆ -2.753 ┆ 0.0 ┆ 0.0 ┆ 26.21 │ │ var_gbm3 ┆ 10,000 ┆ 0 ┆ -2.363 ┆ 10.15 ┆ -55.35 ┆ -2.785 ┆ 0.0 ┆ 0.0 ┆ 26.21 │ │ var_gbm12 ┆ 10,000 ┆ 0 ┆ -2.406 ┆ 10.2 ┆ -55.35 ┆ -2.753 ┆ 0.0 ┆ 0.0 ┆ 26.21 │ │ var_gbm2_sq ┆ 10,000 ┆ 0 ┆ 109.8 ┆ 271.1 ┆ 0.0 ┆ 0.0 ┆ 0.2379 ┆ 80.29 ┆ 3,064.0 │ └─────────────┴────────┴─────────────┴─────────┴─────────┴───────────┴─────────┴─────────┴─────────┴─────────┘
┌─────────────┬────────┬─────────────┬─────────┬─────────┬───────────┬─────────┬─────────┬─────────┬─────────┐ │ Variable ┆ n ┆ n (missing) ┆ mean ┆ std ┆ min ┆ q25 ┆ q50 ┆ q75 ┆ max │ ╞═════════════╪════════╪═════════════╪═════════╪═════════╪═══════════╪═════════╪═════════╪═════════╪═════════╡ │ index ┆ 10,000 ┆ 0 ┆ 5,000.0 ┆ 2,887.0 ┆ 0.0 ┆ 2,500.0 ┆ 5,000.0 ┆ 7,500.0 ┆ 9,999.0 │ │ _row_index_ ┆ 10,000 ┆ 0 ┆ 5,000.0 ┆ 2,887.0 ┆ 0.0 ┆ 2,500.0 ┆ 5,000.0 ┆ 7,500.0 ┆ 9,999.0 │ │ year ┆ 10,000 ┆ 0 ┆ 2,018.0 ┆ 1.416 ┆ 2,016.0 ┆ 2,017.0 ┆ 2,018.0 ┆ 2,019.0 ┆ 2,020.0 │ │ month ┆ 10,000 ┆ 0 ┆ 6.514 ┆ 3.432 ┆ 1.0 ┆ 4.0 ┆ 6.0 ┆ 9.0 ┆ 12.0 │ │ var2 ┆ 10,000 ┆ 0 ┆ 4.978 ┆ 3.155 ┆ 0.0 ┆ 2.0 ┆ 5.0 ┆ 8.0 ┆ 10.0 │ │ var3 ┆ 10,000 ┆ 0 ┆ 25.11 ┆ 14.75 ┆ 0.0 ┆ 12.0 ┆ 25.0 ┆ 38.0 ┆ 50.0 │ │ var4 ┆ 10,000 ┆ 0 ┆ 0.5057 ┆ 0.2879 ┆ 0.000027 ┆ 0.2559 ┆ 0.5086 ┆ 0.7543 ┆ 1.0 │ │ var5 ┆ 10,000 ┆ 0 ┆ 0.4999 ┆ 0.5 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 1.0 ┆ 1.0 │ │ unrelated_1 ┆ 10,000 ┆ 0 ┆ 0.5024 ┆ 0.2884 ┆ 0.0001191 ┆ 0.253 ┆ 0.5037 ┆ 0.753 ┆ 1.0 │ │ unrelated_2 ┆ 10,000 ┆ 0 ┆ 0.5001 ┆ 0.2876 ┆ 0.000049 ┆ 0.2531 ┆ 0.4975 ┆ 0.7487 ┆ 0.9995 │ │ unrelated_3 ┆ 10,000 ┆ 0 ┆ 0.4992 ┆ 0.2888 ┆ 0.000129 ┆ 0.2513 ┆ 0.4961 ┆ 0.7508 ┆ 0.9999 │ │ unrelated_4 ┆ 10,000 ┆ 0 ┆ 0.5007 ┆ 0.2887 ┆ 0.0001329 ┆ 0.2501 ┆ 0.5006 ┆ 0.752 ┆ 1.0 │ │ unrelated_5 ┆ 10,000 ┆ 0 ┆ 0.4988 ┆ 0.289 ┆ 0.000071 ┆ 0.2496 ┆ 0.4969 ┆ 0.75 ┆ 0.9999 │ │ repeat_1 ┆ 10,000 ┆ 0 ┆ 0.5024 ┆ 0.2884 ┆ 0.0001191 ┆ 0.253 ┆ 0.5037 ┆ 0.753 ┆ 1.0 │ │ var_gbm1 ┆ 10,000 ┆ 0 ┆ 0.5276 ┆ 0.4993 ┆ 0.0 ┆ 0.0 ┆ 1.0 ┆ 1.0 ┆ 1.0 │ │ var_gbm2 ┆ 10,000 ┆ 0 ┆ -2.334 ┆ 10.15 ┆ -55.35 ┆ -2.737 ┆ 0.0 ┆ 0.0 ┆ 26.21 │ │ var_gbm3 ┆ 10,000 ┆ 0 ┆ -2.323 ┆ 10.1 ┆ -55.35 ┆ -2.745 ┆ 0.0 ┆ 0.0 ┆ 26.21 │ │ var_gbm12 ┆ 10,000 ┆ 0 ┆ -2.334 ┆ 10.15 ┆ -55.35 ┆ -2.737 ┆ 0.0 ┆ 0.0 ┆ 26.21 │ │ var_gbm2_sq ┆ 10,000 ┆ 0 ┆ 108.4 ┆ 266.6 ┆ 0.0 ┆ 0.0 ┆ 0.1858 ┆ 80.87 ┆ 3,064.0 │ └─────────────┴────────┴─────────────┴─────────┴─────────┴───────────┴─────────┴─────────┴─────────┴─────────┘
Look at the imputes | var_gbm1 == 0
┌─────────────┬───────┬─────────────┬─────────┬─────────┬───────────┬─────────┬─────────┬─────────┬─────────┐ │ Variable ┆ n ┆ n (missing) ┆ mean ┆ std ┆ min ┆ q25 ┆ q50 ┆ q75 ┆ max │ ╞═════════════╪═══════╪═════════════╪═════════╪═════════╪═══════════╪═════════╪═════════╪═════════╪═════════╡ │ index ┆ 4,700 ┆ 0 ┆ 5,060.0 ┆ 2,893.0 ┆ 1.0 ┆ 2,572.0 ┆ 5,084.0 ┆ 7,631.0 ┆ 9,997.0 │ │ _row_index_ ┆ 4,700 ┆ 0 ┆ 5,060.0 ┆ 2,893.0 ┆ 1.0 ┆ 2,572.0 ┆ 5,084.0 ┆ 7,631.0 ┆ 9,997.0 │ │ year ┆ 4,700 ┆ 0 ┆ 2,018.0 ┆ 1.417 ┆ 2,016.0 ┆ 2,017.0 ┆ 2,018.0 ┆ 2,019.0 ┆ 2,020.0 │ │ month ┆ 4,700 ┆ 0 ┆ 6.553 ┆ 3.416 ┆ 1.0 ┆ 4.0 ┆ 7.0 ┆ 9.0 ┆ 12.0 │ │ var2 ┆ 4,700 ┆ 0 ┆ 4.626 ┆ 3.238 ┆ 0.0 ┆ 2.0 ┆ 4.0 ┆ 7.0 ┆ 10.0 │ │ var3 ┆ 4,700 ┆ 0 ┆ 24.79 ┆ 13.02 ┆ 0.0 ┆ 14.0 ┆ 25.0 ┆ 36.0 ┆ 50.0 │ │ var4 ┆ 4,700 ┆ 0 ┆ 0.5086 ┆ 0.2885 ┆ 0.000027 ┆ 0.2545 ┆ 0.5123 ┆ 0.7586 ┆ 1.0 │ │ var5 ┆ 4,700 ┆ 0 ┆ 0.8997 ┆ 0.3004 ┆ 0.0 ┆ 1.0 ┆ 1.0 ┆ 1.0 ┆ 1.0 │ │ unrelated_1 ┆ 4,700 ┆ 0 ┆ 0.5033 ┆ 0.2868 ┆ 0.0001191 ┆ 0.2579 ┆ 0.5035 ┆ 0.7517 ┆ 1.0 │ │ unrelated_2 ┆ 4,700 ┆ 0 ┆ 0.502 ┆ 0.2866 ┆ 0.000079 ┆ 0.262 ┆ 0.4973 ┆ 0.7501 ┆ 0.9995 │ │ unrelated_3 ┆ 4,700 ┆ 0 ┆ 0.5016 ┆ 0.2895 ┆ 0.0001387 ┆ 0.2521 ┆ 0.4987 ┆ 0.755 ┆ 0.9999 │ │ unrelated_4 ┆ 4,700 ┆ 0 ┆ 0.5014 ┆ 0.2898 ┆ 0.0001414 ┆ 0.2506 ┆ 0.5012 ┆ 0.7568 ┆ 0.9999 │ │ unrelated_5 ┆ 4,700 ┆ 0 ┆ 0.5018 ┆ 0.2917 ┆ 0.000071 ┆ 0.2498 ┆ 0.4999 ┆ 0.7542 ┆ 0.9998 │ │ repeat_1 ┆ 4,700 ┆ 0 ┆ 0.5033 ┆ 0.2868 ┆ 0.0001191 ┆ 0.2579 ┆ 0.5035 ┆ 0.7517 ┆ 1.0 │ │ var_gbm1 ┆ 4,700 ┆ 0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 │ │ var_gbm2 ┆ 4,700 ┆ 0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 │ │ var_gbm3 ┆ 4,700 ┆ 0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 │ │ var_gbm12 ┆ 4,700 ┆ 0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 │ │ var_gbm2_sq ┆ 4,700 ┆ 0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 │ └─────────────┴───────┴─────────────┴─────────┴─────────┴───────────┴─────────┴─────────┴─────────┴─────────┘
┌─────────────┬───────┬─────────────┬─────────┬─────────┬───────────┬─────────┬─────────┬─────────┬─────────┐ │ Variable ┆ n ┆ n (missing) ┆ mean ┆ std ┆ min ┆ q25 ┆ q50 ┆ q75 ┆ max │ ╞═════════════╪═══════╪═════════════╪═════════╪═════════╪═══════════╪═════════╪═════════╪═════════╪═════════╡ │ index ┆ 4,700 ┆ 0 ┆ 5,066.0 ┆ 2,896.0 ┆ 1.0 ┆ 2,576.0 ┆ 5,090.0 ┆ 7,645.0 ┆ 9,997.0 │ │ _row_index_ ┆ 4,700 ┆ 0 ┆ 5,066.0 ┆ 2,896.0 ┆ 1.0 ┆ 2,576.0 ┆ 5,090.0 ┆ 7,645.0 ┆ 9,997.0 │ │ year ┆ 4,700 ┆ 0 ┆ 2,018.0 ┆ 1.418 ┆ 2,016.0 ┆ 2,017.0 ┆ 2,018.0 ┆ 2,019.0 ┆ 2,020.0 │ │ month ┆ 4,700 ┆ 0 ┆ 6.546 ┆ 3.415 ┆ 1.0 ┆ 4.0 ┆ 6.0 ┆ 9.0 ┆ 12.0 │ │ var2 ┆ 4,700 ┆ 0 ┆ 4.617 ┆ 3.238 ┆ 0.0 ┆ 2.0 ┆ 4.0 ┆ 7.0 ┆ 10.0 │ │ var3 ┆ 4,700 ┆ 0 ┆ 24.94 ┆ 12.99 ┆ 0.0 ┆ 14.0 ┆ 25.0 ┆ 36.0 ┆ 50.0 │ │ var4 ┆ 4,700 ┆ 0 ┆ 0.5088 ┆ 0.2885 ┆ 0.000027 ┆ 0.2543 ┆ 0.5137 ┆ 0.7586 ┆ 1.0 │ │ var5 ┆ 4,700 ┆ 0 ┆ 0.8984 ┆ 0.3022 ┆ 0.0 ┆ 1.0 ┆ 1.0 ┆ 1.0 ┆ 1.0 │ │ unrelated_1 ┆ 4,700 ┆ 0 ┆ 0.5032 ┆ 0.2869 ┆ 0.0001191 ┆ 0.2578 ┆ 0.5029 ┆ 0.7526 ┆ 1.0 │ │ unrelated_2 ┆ 4,700 ┆ 0 ┆ 0.5017 ┆ 0.2866 ┆ 0.000079 ┆ 0.2621 ┆ 0.4961 ┆ 0.7502 ┆ 0.9995 │ │ unrelated_3 ┆ 4,700 ┆ 0 ┆ 0.5031 ┆ 0.2897 ┆ 0.0001387 ┆ 0.2534 ┆ 0.501 ┆ 0.7574 ┆ 0.9999 │ │ unrelated_4 ┆ 4,700 ┆ 0 ┆ 0.5012 ┆ 0.2898 ┆ 0.0001414 ┆ 0.2504 ┆ 0.5008 ┆ 0.7558 ┆ 0.9999 │ │ unrelated_5 ┆ 4,700 ┆ 0 ┆ 0.5016 ┆ 0.2915 ┆ 0.000071 ┆ 0.2495 ┆ 0.4995 ┆ 0.7551 ┆ 0.9998 │ │ repeat_1 ┆ 4,700 ┆ 0 ┆ 0.5032 ┆ 0.2869 ┆ 0.0001191 ┆ 0.2578 ┆ 0.5029 ┆ 0.7526 ┆ 1.0 │ │ var_gbm1 ┆ 4,700 ┆ 0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 │ │ var_gbm2 ┆ 4,700 ┆ 0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 │ │ var_gbm3 ┆ 4,700 ┆ 0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 │ │ var_gbm12 ┆ 4,700 ┆ 0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 │ │ var_gbm2_sq ┆ 4,700 ┆ 0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 │ └─────────────┴───────┴─────────────┴─────────┴─────────┴───────────┴─────────┴─────────┴─────────┴─────────┘
Look at the imputes | var_gbm1 == 1
┌─────────────┬───────┬─────────────┬─────────┬─────────┬───────────┬─────────┬─────────┬─────────┬─────────┐ │ Variable ┆ n ┆ n (missing) ┆ mean ┆ std ┆ min ┆ q25 ┆ q50 ┆ q75 ┆ max │ ╞═════════════╪═══════╪═════════════╪═════════╪═════════╪═══════════╪═════════╪═════════╪═════════╪═════════╡ │ index ┆ 5,300 ┆ 0 ┆ 4,945.0 ┆ 2,880.0 ┆ 0.0 ┆ 2,453.0 ┆ 4,920.0 ┆ 7,400.0 ┆ 9,999.0 │ │ _row_index_ ┆ 5,300 ┆ 0 ┆ 4,945.0 ┆ 2,880.0 ┆ 0.0 ┆ 2,453.0 ┆ 4,920.0 ┆ 7,400.0 ┆ 9,999.0 │ │ year ┆ 5,300 ┆ 0 ┆ 2,018.0 ┆ 1.415 ┆ 2,016.0 ┆ 2,017.0 ┆ 2,018.0 ┆ 2,019.0 ┆ 2,020.0 │ │ month ┆ 5,300 ┆ 0 ┆ 6.479 ┆ 3.446 ┆ 1.0 ┆ 4.0 ┆ 6.0 ┆ 9.0 ┆ 12.0 │ │ var2 ┆ 5,300 ┆ 0 ┆ 5.294 ┆ 3.044 ┆ 0.0 ┆ 3.0 ┆ 5.0 ┆ 8.0 ┆ 10.0 │ │ var3 ┆ 5,300 ┆ 0 ┆ 25.39 ┆ 16.15 ┆ 0.0 ┆ 10.0 ┆ 26.0 ┆ 40.0 ┆ 50.0 │ │ var4 ┆ 5,300 ┆ 0 ┆ 0.503 ┆ 0.2873 ┆ 0.0001044 ┆ 0.2569 ┆ 0.5071 ┆ 0.7521 ┆ 0.9999 │ │ var5 ┆ 5,300 ┆ 0 ┆ 0.1413 ┆ 0.3484 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 1.0 │ │ unrelated_1 ┆ 5,300 ┆ 0 ┆ 0.5017 ┆ 0.2898 ┆ 0.0002485 ┆ 0.2506 ┆ 0.5043 ┆ 0.7562 ┆ 0.9993 │ │ unrelated_2 ┆ 5,300 ┆ 0 ┆ 0.4984 ┆ 0.2886 ┆ 0.000049 ┆ 0.2462 ┆ 0.4976 ┆ 0.7463 ┆ 0.9995 │ │ unrelated_3 ┆ 5,300 ┆ 0 ┆ 0.497 ┆ 0.2881 ┆ 0.000129 ┆ 0.2508 ┆ 0.4932 ┆ 0.7473 ┆ 0.9999 │ │ unrelated_4 ┆ 5,300 ┆ 0 ┆ 0.5 ┆ 0.2877 ┆ 0.0001329 ┆ 0.25 ┆ 0.4995 ┆ 0.7475 ┆ 1.0 │ │ unrelated_5 ┆ 5,300 ┆ 0 ┆ 0.496 ┆ 0.2865 ┆ 0.0001807 ┆ 0.2496 ┆ 0.4938 ┆ 0.7454 ┆ 0.9999 │ │ repeat_1 ┆ 5,300 ┆ 0 ┆ 0.5017 ┆ 0.2898 ┆ 0.0002485 ┆ 0.2506 ┆ 0.5043 ┆ 0.7562 ┆ 0.9993 │ │ var_gbm1 ┆ 5,300 ┆ 0 ┆ 1.0 ┆ 0.0 ┆ 1.0 ┆ 1.0 ┆ 1.0 ┆ 1.0 ┆ 1.0 │ │ var_gbm2 ┆ 5,300 ┆ 0 ┆ -4.564 ┆ 13.69 ┆ -55.35 ┆ -12.55 ┆ -1.855 ┆ 5.651 ┆ 26.21 │ │ var_gbm3 ┆ 5,300 ┆ 0 ┆ -4.481 ┆ 13.63 ┆ -55.35 ┆ -12.44 ┆ -1.832 ┆ 5.733 ┆ 26.21 │ │ var_gbm12 ┆ 5,300 ┆ 0 ┆ -4.564 ┆ 13.69 ┆ -55.35 ┆ -12.55 ┆ -1.855 ┆ 5.651 ┆ 26.21 │ │ var_gbm2_sq ┆ 5,300 ┆ 0 ┆ 208.3 ┆ 344.9 ┆ 0.0 ┆ 14.31 ┆ 72.18 ┆ 220.9 ┆ 3,064.0 │ └─────────────┴───────┴─────────────┴─────────┴─────────┴───────────┴─────────┴─────────┴─────────┴─────────┘
┌─────────────┬───────┬─────────────┬─────────┬─────────┬───────────┬─────────┬─────────┬─────────┬─────────┐ │ Variable ┆ n ┆ n (missing) ┆ mean ┆ std ┆ min ┆ q25 ┆ q50 ┆ q75 ┆ max │ ╞═════════════╪═══════╪═════════════╪═════════╪═════════╪═══════════╪═════════╪═════════╪═════════╪═════════╡ │ index ┆ 5,300 ┆ 0 ┆ 4,940.0 ┆ 2,878.0 ┆ 0.0 ┆ 2,445.0 ┆ 4,916.0 ┆ 7,389.0 ┆ 9,999.0 │ │ _row_index_ ┆ 5,300 ┆ 0 ┆ 4,940.0 ┆ 2,878.0 ┆ 0.0 ┆ 2,445.0 ┆ 4,916.0 ┆ 7,389.0 ┆ 9,999.0 │ │ year ┆ 5,300 ┆ 0 ┆ 2,018.0 ┆ 1.415 ┆ 2,016.0 ┆ 2,017.0 ┆ 2,018.0 ┆ 2,019.0 ┆ 2,020.0 │ │ month ┆ 5,300 ┆ 0 ┆ 6.484 ┆ 3.448 ┆ 1.0 ┆ 4.0 ┆ 6.0 ┆ 9.0 ┆ 12.0 │ │ var2 ┆ 5,300 ┆ 0 ┆ 5.301 ┆ 3.042 ┆ 0.0 ┆ 3.0 ┆ 5.0 ┆ 8.0 ┆ 10.0 │ │ var3 ┆ 5,300 ┆ 0 ┆ 25.26 ┆ 16.17 ┆ 0.0 ┆ 10.0 ┆ 25.0 ┆ 40.0 ┆ 50.0 │ │ var4 ┆ 5,300 ┆ 0 ┆ 0.5029 ┆ 0.2873 ┆ 0.0001044 ┆ 0.2569 ┆ 0.5065 ┆ 0.7515 ┆ 0.9999 │ │ var5 ┆ 5,300 ┆ 0 ┆ 0.1431 ┆ 0.3502 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 1.0 │ │ unrelated_1 ┆ 5,300 ┆ 0 ┆ 0.5018 ┆ 0.2897 ┆ 0.0002485 ┆ 0.2507 ┆ 0.5044 ┆ 0.7547 ┆ 0.9993 │ │ unrelated_2 ┆ 5,300 ┆ 0 ┆ 0.4986 ┆ 0.2886 ┆ 0.000049 ┆ 0.2462 ┆ 0.5002 ┆ 0.7457 ┆ 0.9995 │ │ unrelated_3 ┆ 5,300 ┆ 0 ┆ 0.4956 ┆ 0.2879 ┆ 0.000129 ┆ 0.2479 ┆ 0.4906 ┆ 0.7452 ┆ 0.9999 │ │ unrelated_4 ┆ 5,300 ┆ 0 ┆ 0.5002 ┆ 0.2877 ┆ 0.0001329 ┆ 0.25 ┆ 0.4999 ┆ 0.7493 ┆ 1.0 │ │ unrelated_5 ┆ 5,300 ┆ 0 ┆ 0.4962 ┆ 0.2867 ┆ 0.0001807 ┆ 0.2501 ┆ 0.4948 ┆ 0.7452 ┆ 0.9999 │ │ repeat_1 ┆ 5,300 ┆ 0 ┆ 0.5018 ┆ 0.2897 ┆ 0.0002485 ┆ 0.2507 ┆ 0.5044 ┆ 0.7547 ┆ 0.9993 │ │ var_gbm1 ┆ 5,300 ┆ 0 ┆ 1.0 ┆ 0.0 ┆ 1.0 ┆ 1.0 ┆ 1.0 ┆ 1.0 ┆ 1.0 │ │ var_gbm2 ┆ 5,300 ┆ 0 ┆ -4.424 ┆ 13.64 ┆ -55.35 ┆ -12.44 ┆ -1.753 ┆ 5.73 ┆ 26.21 │ │ var_gbm3 ┆ 5,300 ┆ 0 ┆ -4.402 ┆ 13.57 ┆ -55.35 ┆ -12.4 ┆ -1.796 ┆ 5.694 ┆ 26.21 │ │ var_gbm12 ┆ 5,300 ┆ 0 ┆ -4.424 ┆ 13.64 ┆ -55.35 ┆ -12.44 ┆ -1.753 ┆ 5.73 ┆ 26.21 │ │ var_gbm2_sq ┆ 5,300 ┆ 0 ┆ 205.5 ┆ 338.8 ┆ 0.0 ┆ 14.58 ┆ 72.55 ┆ 223.9 ┆ 3,064.0 │ └─────────────┴───────┴─────────────┴─────────┴─────────┴───────────┴─────────┴─────────┴─────────┴─────────┘