In [1]:
import sys
import os
from pathlib import Path
import narwhals as nw
import polars as pl
import polars.selectors as cs
from survey_kit.utilities.random import RandomData
from survey_kit.utilities.dataframe import summary
from survey_kit.imputation.variable import Variable
from survey_kit.imputation.parameters import Parameters
from survey_kit.imputation.srmi import SRMI
from survey_kit.orchestration.config import Config
from survey_kit import logger, config
from survey_kit.utilities.dataframe import summary, columns_from_list
In [2]:
# Simulation settings: number of rows to draw and the cutoff later compared
# against the uniform missing_* columns to decide which values get blanked
n_rows = 10_000
impute_share = 0.25

# Build the synthetic frame one builder call at a time, issuing the same
# calls with the same arguments, in the same order, as a single fluent chain
random_data = RandomData(n_rows=n_rows, seed=32565437).index("index")

# Integer covariates
for column, low, high in (
    ("year", 2016, 2020),
    ("month", 1, 12),
    ("var2", 0, 10),
    ("var3", 0, 50),
):
    random_data = random_data.integer(column, low, high)

random_data = random_data.float("var4", 0, 1)
random_data = random_data.integer("var5", 0, 1)

# Noise terms that feed the dependent variables
for column in ("epsilon_hd1", "epsilon_hd2"):
    random_data = random_data.np_distribution(column, "normal", scale=5)

# Uniform draws used later to decide which values are set to missing
for column in ("missing_hd1", "missing_hd2"):
    random_data = random_data.float(column, 0, 1)

df = random_data.to_df()
# Column handles for the covariates and noise terms, used to build the
# dependent variables below
c_var2, c_var3, c_var4, c_var5 = (
    pl.col(name) for name in ("var2", "var3", "var4", "var5")
)
c_e_hd1, c_e_hd2 = (pl.col(name) for name in ("epsilon_hd1", "epsilon_hd2"))

logger.info("var_hd1 is binary and conditional on other variables")
# Latent index: var_hd1 is True wherever this is strictly positive
latent_hd1 = c_var2 * 2 - c_var3 * 3 * c_var5 + c_e_hd1
c_hd1 = (latent_hd1 > 0).alias("var_hd1")

logger.info("var_hd2 is != 0 only if var_hd1 == True")
# Value var_hd2 takes on rows where var_hd1 is True; every other row gets 0
value_hd2 = c_var2 * 1.5 - c_var3 * 1 * c_var4 + c_e_hd2
c_hd2 = pl.when(pl.col("var_hd1")).then(value_hd2).otherwise(pl.lit(0))
c_hd2 = c_hd2.alias("var_hd2")
# Add the dependent variables in two passes: var_hd2's expression reads
# var_hd1, so var_hd1 has to exist as a column first
df_with_hd1 = df.with_columns(c_hd1)
df_with_hd2 = df_with_hd1.with_columns(c_hd2)

# The noise columns are looked up on the frame as it was before the
# dependent variables were added, then dropped from the extended frame
epsilon_columns = columns_from_list(df=df, columns="epsilon*")
df = df_with_hd2.drop(epsilon_columns).with_row_index(name="_row_index_")

# Keep a handle on the complete data (still carrying the missing_* columns)
# for comparison against the imputed results
df_original = df
# For var_hd1 and var_hd2, null out the value wherever the matching uniform
# missing_* draw falls below impute_share; otherwise keep the value as is
clear_missing = [
    pl.when(pl.col(f"missing_{prefix}{k}") < impute_share)
    .then(pl.lit(None))
    .otherwise(pl.col(f"var_{prefix}{k}"))
    .alias(f"var_{prefix}{k}")
    for prefix in ["hd"]
    for k in range(1, 3)
]

# Apply both replacements at once, then discard the helper missing_* columns
df = df.with_columns(clear_missing).drop(cs.starts_with("missing_"))
summary(df)
# Imputation setup
# vars_impute collects the Variable definitions to impute; it starts empty
# here and is filled by the cell that defines v_hd1 and v_hd2.
vars_impute = []
# Shorthand handles for the two matching-based model types (stat match and
# hot deck).
# NOTE(review): modeltype, modeltype_binary and parameters_hd1 are not
# referenced again in the cells shown in this notebook -- the Variable
# definitions below spell out their model type and parameters inline.
# Confirm nothing else depends on them before removing.
modeltype = Variable.ModelType.StatMatch
modeltype_binary = Variable.ModelType.HotDeck
# Example hot deck parameters for var_hd1 (the binary variable).
# Each model type has its own set of possible parameters
# that determine what happens in the model.
parameters_hd1 = Parameters.HotDeck(
    # model_list - a list of variables to match
    #   donors and recipients
    model_list=["var2", "var3", "var5"],
    # donate_list - donate anything other than the variable
    #   (i.e. donate together)
    #   In this case, it's redundant and does nothing...
    donate_list=["var_hd1"],
)
var_hd1 is binary and conditional on other variables
var_hd2 is != 0 only if var_hd1 == True
βββββββββββββββ¬βββββββββ¬ββββββββββββββ¬βββββββββββββ¬ββββββββββββββ¬βββββββββββββ¬ββββββββββββ β Variable β n β n (missing) β mean β std β min β max β βββββββββββββββͺβββββββββͺββββββββββββββͺβββββββββββββͺββββββββββββββͺβββββββββββββͺββββββββββββ‘ β _row_index_ β 10,000 β 0 β 4,999.5 β 2,886.89568 β 0.0 β 9,999.0 β β index β 10,000 β 0 β 4,999.5 β 2,886.89568 β 0.0 β 9,999.0 β β year β 10,000 β 0 β 2,017.9851 β 1.415937 β 2,016.0 β 2,020.0 β β month β 10,000 β 0 β 6.5137 β 3.432141 β 1.0 β 12.0 β β var2 β 10,000 β 0 β 4.9782 β 3.154508 β 0.0 β 10.0 β β var3 β 10,000 β 0 β 25.1084 β 14.752302 β 0.0 β 50.0 β β var4 β 10,000 β 0 β 0.505666 β 0.287861 β 0.000027 β 0.999997 β β var5 β 10,000 β 0 β 0.4999 β 0.500025 β 0.0 β 1.0 β β var_hd1 β 10,000 β 2,486 β 0.530077 β 0.499128 β 0.0 β 1.0 β β var_hd2 β 10,000 β 2,515 β -2.409894 β 10.378084 β -51.051306 β 26.725014 β βββββββββββββββ΄βββββββββ΄ββββββββββββββ΄βββββββββββββ΄ββββββββββββββ΄βββββββββββββ΄ββββββββββββ
In [3]:
# Start from an empty list every time this cell runs. Appending to a list
# created in an earlier cell would add duplicate Variable definitions
# whenever this cell is re-executed on its own.
vars_impute = []

# Set up the first variable to be imputed: var_hd1 (boolean), by stat match
logger.info("Impute the boolean variable (var_hd1)")
logger.info(" by setting the model type (a stat match)")
logger.info(" and the list of match variables")
v_hd1 = Variable(
    impute_var="var_hd1",
    modeltype=Variable.ModelType.StatMatch,
    # Variables used to match donors and recipients
    parameters=Parameters.HotDeck(
        model_list=["var2", "var3", "var5"]
    ),
)

logger.info("Add the variable to the list to be imputed")
vars_impute.append(v_hd1)

# Second variable: var_hd2 (continuous), by hot deck, imputed after var_hd1
# because its Where condition and post-processing edit both read var_hd1
logger.info("Impute the continuous variable (var_hd2) ")
logger.info(" conditional on var_hd1, using narwhals (nw.col('var_hd1'))")
logger.info(" by setting the model type (a hot deck)")
logger.info(" and the list of match variables")
logger.info(" as well as a post-processing edit to set var_hd2=0 when var_hd1==0")
v_hd2 = Variable(
    impute_var="var_hd2",
    # Condition on var_hd1 being True
    Where=nw.col("var_hd1"),
    # Grouping variables; the run log lists year and month alongside the
    # match variables in "Matching on"
    By=["year", "month"],
    modeltype=Variable.ModelType.HotDeck,
    parameters=Parameters.HotDeck(
        model_list=["var2", "var3", "var5"]
    ),
    # Post-processing edit: keep var_hd2 where var_hd1 is True, else set 0
    postFunctions=(
        nw.when(nw.col("var_hd1"))
        .then(nw.col("var_hd2"))
        .otherwise(nw.lit(0))
        .alias("var_hd2")
    ),
)
vars_impute.append(v_hd2)
Impute the boolean variable (var_hd1)
by setting the model type (a stat match)
and the list of match variables
Add the variable to the list to be imputed
Impute the continuous variable (var_hd2)
conditional on var_hd1, using narwhals (nw.col('var_hd1'))
by setting the model type (a hot deck)
and the list of match variables
as well as a post-processing edit to set var_hd2=0 when var_hd1==0
In [4]:
logger.info("Set up the imputation")
# Configure the SRMI run over the frame with missing values and the
# Variable definitions collected in vars_impute
srmi = SRMI(
    df=df,
    variables=vars_impute,
    # Two implicates, one iteration each -- kept small for the example
    n_implicates=2,
    n_iterations=1,
    parallel=False,
    bayesian_bootstrap=True,
    parallel_testing=False,
    # Join with pathlib rather than a hard-coded "/" so the path uses the
    # platform's separator (the f-string form gave mixed "\" and "/" on
    # Windows); converted back to str to keep the argument type unchanged
    path_model=str(Path(config.path_temp_files) / "py_srmi_test_hd"),
    # NOTE(review): presumably restarts from scratch rather than resuming
    # from files saved under path_model -- confirm against the SRMI docs
    force_start=True,
)
Set up the imputation
In [5]:
# Execute the imputation configured above; the per-variable matching
# diagnostics in this cell's output are logged during this call
logger.info("Run it")
srmi.run()
Run it
Variable selection before SRMI run, if necessary
var_hd1: Method.No
var_hd2: Method.No
Hyperparameter tuning before SRMI run, if necessary
Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/py_srmi_test_hd.srmi/1.srmi.implicate
Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/py_srmi_test_hd.srmi/2.srmi.implicate
Imputation using statistical matching
[['var2', 'var3', 'var5'], ['var2', 'var3'], ['var2']]
Matching on: ['var2', 'var3', 'var5']
Matches
obs = 2,483
share = 0.9988
Post-imputation statistics for ['var_hd1']
Where: None
Where (impute): col(___imp_missing_var_hd1_1)
ββββββββββββ¬ββββββββββ¬βββββββ¬βββββββββββββββ¬βββββββββ¬βββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ β Variable β Imputed β n β n (not null) β mean β mean (not 0) β std (not 0) β q10 (not 0) β q25 (not 0) β q50 (not 0) β q75 (not 0) β q90 (not 0) β min (not 0) β max (not 0) β ββββββββββββͺββββββββββͺβββββββͺβββββββββββββββͺβββββββββͺβββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββ‘ β var_hd1 β β 9997 β 9997 β 0.5246 β 1 β 0 β 1 β 1 β 1 β 1 β 1 β 1 β 1 β β var_hd1 β 0 β 7514 β 7514 β 0.5301 β 1 β 0 β 1 β 1 β 1 β 1 β 1 β 1 β 1 β β var_hd1 β 1 β 2483 β 2483 β 0.5079 β 1 β 0 β 1 β 1 β 1 β 1 β 1 β 1 β 1 β ββββββββββββ΄ββββββββββ΄βββββββ΄βββββββββββββββ΄βββββββββ΄βββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ
Most common matches:
shape: (5, 2) ββββββββββββββββββββββ¬ββββββββββ β donor____rownumber β nDonors β β --- β --- β β i16 β i8 β ββββββββββββββββββββββͺββββββββββ‘ β 583 β 5 β β 746 β 5 β β 3561 β 5 β β 415 β 4 β β 951 β 4 β ββββββββββββββββββββββ΄ββββββββββ
Matching on: ['var2', 'var3']
Matches
obs = 3
share = 0.0012
Post-imputation statistics for ['var_hd1']
Where: None
Where (impute): col(___imp_missing_var_hd1_1)
ββββββββββββ¬ββββββββββ¬βββββββ¬βββββββββββββββ¬βββββββββ¬βββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ β Variable β Imputed β n β n (not null) β mean β mean (not 0) β std (not 0) β q10 (not 0) β q25 (not 0) β q50 (not 0) β q75 (not 0) β q90 (not 0) β min (not 0) β max (not 0) β ββββββββββββͺββββββββββͺβββββββͺβββββββββββββββͺβββββββββͺβββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββ‘ β var_hd1 β β 7517 β 7517 β 0.5301 β 1 β 0 β 1 β 1 β 1 β 1 β 1 β 1 β 1 β β var_hd1 β 0 β 7514 β 7514 β 0.5301 β 1 β 0 β 1 β 1 β 1 β 1 β 1 β 1 β 1 β β var_hd1 β 1 β 3 β 3 β 0.6667 β 1 β 0 β 1 β 1 β 1 β 1 β 1 β 1 β 1 β ββββββββββββ΄ββββββββββ΄βββββββ΄βββββββββββββββ΄βββββββββ΄βββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ
Most common matches:
shape: (2, 2) ββββββββββββββββββββββ¬ββββββββββ β donor____rownumber β nDonors β β --- β --- β β i16 β i8 β ββββββββββββββββββββββͺββββββββββ‘ β 9586 β 2 β β 9701 β 1 β ββββββββββββββββββββββ΄ββββββββββ
Imputation using hot deck
Matching on: ['var2', 'var3', 'var5', 'year', 'month']
Matches
obs = 124
share = 0.0985
Post-imputation statistics for ['var_hd2']
Where: col(var_hd1)
Where (impute): col(___imp_missing_var_hd2_2)
ββββββββββββ¬ββββββββββ¬βββββββ¬βββββββββββββββ¬βββββββββ¬βββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ β Variable β Imputed β n β n (not null) β mean β mean (not 0) β std (not 0) β q10 (not 0) β q25 (not 0) β q50 (not 0) β q75 (not 0) β q90 (not 0) β min (not 0) β max (not 0) β ββββββββββββͺββββββββββͺβββββββͺβββββββββββββββͺβββββββββͺβββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββ‘ β var_hd2 β β 4111 β 4111 β -4.328 β -4.438 β 13.86 β -24.58 β -12.52 β -2.066 β 5.95 β 11.36 β -51.05 β 26.73 β β var_hd2 β 0 β 3987 β 3987 β -4.354 β -4.465 β 13.91 β -24.71 β -12.58 β -2.078 β 5.951 β 11.37 β -51.05 β 26.73 β β var_hd2 β 1 β 124 β 124 β -3.483 β -3.57 β 12.22 β -21.59 β -10.1 β -1.894 β 5.919 β 9.335 β -42.77 β 19.35 β ββββββββββββ΄ββββββββββ΄βββββββ΄βββββββββββββββ΄βββββββββ΄βββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ
Most common matches:
shape: (5, 2) ββββββββββββββββββββββ¬ββββββββββ β donor____rownumber β nDonors β β --- β --- β β i16 β i8 β ββββββββββββββββββββββͺββββββββββ‘ β 41 β 1 β β 126 β 1 β β 216 β 1 β β 218 β 1 β β 379 β 1 β ββββββββββββββββββββββ΄ββββββββββ
Matching on: ['var2', 'var3', 'year', 'month']
Matches
obs = 32
share = 0.0254
Post-imputation statistics for ['var_hd2']
Where: col(var_hd1)
Where (impute): col(___imp_missing_var_hd2_2)
ββββββββββββ¬ββββββββββ¬βββββββ¬βββββββββββββββ¬βββββββββ¬βββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ β Variable β Imputed β n β n (not null) β mean β mean (not 0) β std (not 0) β q10 (not 0) β q25 (not 0) β q50 (not 0) β q75 (not 0) β q90 (not 0) β min (not 0) β max (not 0) β ββββββββββββͺββββββββββͺβββββββͺβββββββββββββββͺβββββββββͺβββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββ‘ β var_hd2 β β 4019 β 4019 β -4.366 β -4.476 β 13.95 β -24.77 β -12.6 β -2.078 β 5.973 β 11.38 β -51.05 β 26.73 β β var_hd2 β 0 β 3987 β 3987 β -4.354 β -4.465 β 13.91 β -24.71 β -12.58 β -2.081 β 5.95 β 11.37 β -51.05 β 26.73 β β var_hd2 β 1 β 32 β 32 β -5.842 β -5.842 β 18.36 β -32.72 β -22.19 β -0.5069 β 8.992 β 12.76 β -47.12 β 18.04 β ββββββββββββ΄ββββββββββ΄βββββββ΄βββββββββββββββ΄βββββββββ΄βββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ
Most common matches:
shape: (5, 2) ββββββββββββββββββββββ¬ββββββββββ β donor____rownumber β nDonors β β --- β --- β β i16 β i8 β ββββββββββββββββββββββͺββββββββββ‘ β 1270 β 1 β β 1275 β 1 β β 2073 β 1 β β 2076 β 1 β β 2309 β 1 β ββββββββββββββββββββββ΄ββββββββββ
Matching on: ['var2', 'year', 'month']
Matches
obs = 1,103
share = 0.8761
Post-imputation statistics for ['var_hd2']
Where: col(var_hd1)
Where (impute): col(___imp_missing_var_hd2_2)
ββββββββββββ¬ββββββββββ¬βββββββ¬βββββββββββββββ¬βββββββββ¬βββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ β Variable β Imputed β n β n (not null) β mean β mean (not 0) β std (not 0) β q10 (not 0) β q25 (not 0) β q50 (not 0) β q75 (not 0) β q90 (not 0) β min (not 0) β max (not 0) β ββββββββββββͺββββββββββͺβββββββͺβββββββββββββββͺβββββββββͺβββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββ‘ β var_hd2 β β 5090 β 5090 β -4.433 β -4.542 β 13.94 β -24.87 β -12.95 β -2.087 β 5.982 β 11.26 β -51.05 β 26.73 β β var_hd2 β 0 β 3987 β 3987 β -4.354 β -4.465 β 13.91 β -24.71 β -12.58 β -2.081 β 5.95 β 11.37 β -51.05 β 26.73 β β var_hd2 β 1 β 1103 β 1103 β -4.721 β -4.822 β 14.02 β -25.31 β -14.23 β -2.14 β 6.099 β 10.82 β -51.05 β 26.73 β ββββββββββββ΄ββββββββββ΄βββββββ΄βββββββββββββββ΄βββββββββ΄βββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ
Most common matches:
shape: (5, 2) ββββββββββββββββββββββ¬ββββββββββ β donor____rownumber β nDonors β β --- β --- β β i16 β i8 β ββββββββββββββββββββββͺββββββββββ‘ β 4307 β 4 β β 6361 β 4 β β 7525 β 4 β β 445 β 3 β β 544 β 3 β ββββββββββββββββββββββ΄ββββββββββ
Updating data according to narwhals expression: when_then(all_horizontal(col(var_hd1), ignore_nulls=False), col(var_hd2), lit(value=0, dtype=None)).alias(name=var_hd2)
var_hd1
var_hd2
Final Estimates by Iteration
Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/py_srmi_test_hd.srmi/1.srmi.implicate
Imputation using statistical matching
[['var2', 'var3', 'var5'], ['var2', 'var3'], ['var2']]
Matching on: ['var2', 'var3', 'var5']
Matches
obs = 2,483
share = 0.9988
Post-imputation statistics for ['var_hd1']
Where: None
Where (impute): col(___imp_missing_var_hd1_1)
ββββββββββββ¬ββββββββββ¬βββββββ¬βββββββββββββββ¬βββββββββ¬βββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ β Variable β Imputed β n β n (not null) β mean β mean (not 0) β std (not 0) β q10 (not 0) β q25 (not 0) β q50 (not 0) β q75 (not 0) β q90 (not 0) β min (not 0) β max (not 0) β ββββββββββββͺββββββββββͺβββββββͺβββββββββββββββͺβββββββββͺβββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββ‘ β var_hd1 β β 9997 β 9997 β 0.526 β 1 β 0 β 1 β 1 β 1 β 1 β 1 β 1 β 1 β β var_hd1 β 0 β 7514 β 7514 β 0.5301 β 1 β 0 β 1 β 1 β 1 β 1 β 1 β 1 β 1 β β var_hd1 β 1 β 2483 β 2483 β 0.5135 β 1 β 0 β 1 β 1 β 1 β 1 β 1 β 1 β 1 β ββββββββββββ΄ββββββββββ΄βββββββ΄βββββββββββββββ΄βββββββββ΄βββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ
Most common matches:
shape: (5, 2) ββββββββββββββββββββββ¬ββββββββββ β donor____rownumber β nDonors β β --- β --- β β i16 β i8 β ββββββββββββββββββββββͺββββββββββ‘ β 746 β 5 β β 7295 β 5 β β 951 β 4 β β 1040 β 4 β β 1332 β 4 β ββββββββββββββββββββββ΄ββββββββββ
Matching on: ['var2', 'var3']
Matches
obs = 3
share = 0.0012
Post-imputation statistics for ['var_hd1']
Where: None
Where (impute): col(___imp_missing_var_hd1_1)
ββββββββββββ¬ββββββββββ¬βββββββ¬βββββββββββββββ¬βββββββββ¬βββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ β Variable β Imputed β n β n (not null) β mean β mean (not 0) β std (not 0) β q10 (not 0) β q25 (not 0) β q50 (not 0) β q75 (not 0) β q90 (not 0) β min (not 0) β max (not 0) β ββββββββββββͺββββββββββͺβββββββͺβββββββββββββββͺβββββββββͺβββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββ‘ β var_hd1 β β 7517 β 7517 β 0.5301 β 1 β 0 β 1 β 1 β 1 β 1 β 1 β 1 β 1 β β var_hd1 β 0 β 7514 β 7514 β 0.5301 β 1 β 0 β 1 β 1 β 1 β 1 β 1 β 1 β 1 β β var_hd1 β 1 β 3 β 3 β 0.6667 β 1 β 0 β 1 β 1 β 1 β 1 β 1 β 1 β 1 β ββββββββββββ΄ββββββββββ΄βββββββ΄βββββββββββββββ΄βββββββββ΄βββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ
Most common matches:
shape: (3, 2) ββββββββββββββββββββββ¬ββββββββββ β donor____rownumber β nDonors β β --- β --- β β i16 β i8 β ββββββββββββββββββββββͺββββββββββ‘ β 4770 β 1 β β 6674 β 1 β β 9733 β 1 β ββββββββββββββββββββββ΄ββββββββββ
Imputation using hot deck
Matching on: ['var2', 'var3', 'var5', 'year', 'month']
Matches
obs = 123
share = 0.0969
Post-imputation statistics for ['var_hd2']
Where: col(var_hd1)
Where (impute): col(___imp_missing_var_hd2_2)
ββββββββββββ¬ββββββββββ¬βββββββ¬βββββββββββββββ¬βββββββββ¬βββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ β Variable β Imputed β n β n (not null) β mean β mean (not 0) β std (not 0) β q10 (not 0) β q25 (not 0) β q50 (not 0) β q75 (not 0) β q90 (not 0) β min (not 0) β max (not 0) β ββββββββββββͺββββββββββͺβββββββͺβββββββββββββββͺβββββββββͺβββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββ‘ β var_hd2 β β 4114 β 4114 β -4.359 β -4.466 β 13.85 β -24.53 β -12.58 β -2.059 β 5.885 β 11.35 β -51.05 β 26.73 β β var_hd2 β 0 β 3991 β 3991 β -4.381 β -4.489 β 13.9 β -24.7 β -12.66 β -2.078 β 5.891 β 11.36 β -51.05 β 26.73 β β var_hd2 β 1 β 123 β 123 β -3.647 β -3.707 β 12.27 β -21.59 β -10.13 β -1.894 β 5.632 β 9.193 β -42.77 β 19.35 β ββββββββββββ΄ββββββββββ΄βββββββ΄βββββββββββββββ΄βββββββββ΄βββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ
Most common matches:
shape: (5, 2) ββββββββββββββββββββββ¬ββββββββββ β donor____rownumber β nDonors β β --- β --- β β i16 β i8 β ββββββββββββββββββββββͺββββββββββ‘ β 41 β 1 β β 126 β 1 β β 216 β 1 β β 218 β 1 β β 379 β 1 β ββββββββββββββββββββββ΄ββββββββββ
Matching on: ['var2', 'var3', 'year', 'month']
Matches
obs = 32
share = 0.0252
Post-imputation statistics for ['var_hd2']
Where: col(var_hd1)
Where (impute): col(___imp_missing_var_hd2_2)
ββββββββββββ¬ββββββββββ¬βββββββ¬βββββββββββββββ¬βββββββββ¬βββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ β Variable β Imputed β n β n (not null) β mean β mean (not 0) β std (not 0) β q10 (not 0) β q25 (not 0) β q50 (not 0) β q75 (not 0) β q90 (not 0) β min (not 0) β max (not 0) β ββββββββββββͺββββββββββͺβββββββͺβββββββββββββββͺβββββββββͺβββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββ‘ β var_hd2 β β 4023 β 4023 β -4.393 β -4.5 β 13.94 β -24.77 β -12.75 β -2.065 β 5.942 β 11.37 β -51.05 β 26.73 β β var_hd2 β 0 β 3991 β 3991 β -4.381 β -4.489 β 13.9 β -24.7 β -12.66 β -2.078 β 5.891 β 11.36 β -51.05 β 26.73 β β var_hd2 β 1 β 32 β 32 β -5.842 β -5.842 β 18.36 β -32.72 β -22.19 β -0.5069 β 8.992 β 12.76 β -47.12 β 18.04 β ββββββββββββ΄ββββββββββ΄βββββββ΄βββββββββββββββ΄βββββββββ΄βββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ
Most common matches:
shape: (5, 2) ββββββββββββββββββββββ¬ββββββββββ β donor____rownumber β nDonors β β --- β --- β β i16 β i8 β ββββββββββββββββββββββͺββββββββββ‘ β 1270 β 1 β β 1275 β 1 β β 2073 β 1 β β 2076 β 1 β β 2309 β 1 β ββββββββββββββββββββββ΄ββββββββββ
Matching on: ['var2', 'year', 'month']
Matches
obs = 1,114
share = 0.8779
Post-imputation statistics for ['var_hd2']
Where: col(var_hd1)
Where (impute): col(___imp_missing_var_hd2_2)
ββββββββββββ¬ββββββββββ¬βββββββ¬βββββββββββββββ¬βββββββββ¬βββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ¬ββββββββββββββ β Variable β Imputed β n β n (not null) β mean β mean (not 0) β std (not 0) β q10 (not 0) β q25 (not 0) β q50 (not 0) β q75 (not 0) β q90 (not 0) β min (not 0) β max (not 0) β ββββββββββββͺββββββββββͺβββββββͺβββββββββββββββͺβββββββββͺβββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββͺββββββββββββββ‘ β var_hd2 β β 5105 β 5105 β -4.388 β -4.496 β 13.96 β -24.87 β -12.86 β -2.081 β 6.094 β 11.41 β -51.05 β 26.73 β β var_hd2 β 0 β 3991 β 3991 β -4.381 β -4.489 β 13.9 β -24.7 β -12.66 β -2.078 β 5.891 β 11.36 β -51.05 β 26.73 β β var_hd2 β 1 β 1114 β 1114 β -4.413 β -4.518 β 14.17 β -25.31 β -13.68 β -2.148 β 6.544 β 11.67 β -51.05 β 24.49 β ββββββββββββ΄ββββββββββ΄βββββββ΄βββββββββββββββ΄βββββββββ΄βββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ΄ββββββββββββββ
Most common matches:
shape: (5, 2) ββββββββββββββββββββββ¬ββββββββββ β donor____rownumber β nDonors β β --- β --- β β i16 β i8 β ββββββββββββββββββββββͺββββββββββ‘ β 6165 β 5 β β 8867 β 4 β β 1462 β 3 β β 1550 β 3 β β 2488 β 3 β ββββββββββββββββββββββ΄ββββββββββ
Updating data according to narwhals expression: when_then(all_horizontal(col(var_hd1), ignore_nulls=False), col(var_hd2), lit(value=0, dtype=None)).alias(name=var_hd2)
var_hd1
var_hd2
Final Estimates by Iteration
Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/py_srmi_test_hd.srmi/2.srmi.implicate
In [6]:
logger.info("Get the results")
# Collect the imputed implicates from the finished run
df_list = srmi.df_implicates
# Bind the throwaway name as well, as the chained assignment did
_ = df_list
Get the results
In [7]:
# Each entry pairs a log banner with a deferred summary call, so every
# summary is still computed right after its banner is logged:
#   1. the complete data before any values were set to missing
#   2. the imputed implicates
#   3. the implicates restricted to rows where var_hd1 is False
#   4. the implicates restricted to rows where var_hd1 is True
summary_steps = (
    ("\n\nLook at the original", lambda: summary(df_original)),
    ("\n\nLook at the imputes", lambda: df_list.pipe(summary)),
    (
        "\n\nLook at the imputes | var_hd1 == 0",
        lambda: df_list.filter(~nw.col("var_hd1")).pipe(summary),
    ),
    (
        "\n\nLook at the imputes | var_hd1 == 1",
        lambda: df_list.filter(nw.col("var_hd1")).pipe(summary),
    ),
)

for banner, show_summary in summary_steps:
    logger.info(banner)
    # Assign to the throwaway name so the return value is not displayed
    _ = show_summary()
Look at the original
βββββββββββββββ¬βββββββββ¬ββββββββββββββ¬βββββββββββββ¬ββββββββββββββ¬βββββββββββββ¬βββββββββββ β Variable β n β n (missing) β mean β std β min β max β βββββββββββββββͺβββββββββͺββββββββββββββͺβββββββββββββͺββββββββββββββͺβββββββββββββͺβββββββββββ‘ β _row_index_ β 10,000 β 0 β 4,999.5 β 2,886.89568 β 0.0 β 9,999.0 β β index β 10,000 β 0 β 4,999.5 β 2,886.89568 β 0.0 β 9,999.0 β β year β 10,000 β 0 β 2,017.9851 β 1.415937 β 2,016.0 β 2,020.0 β β month β 10,000 β 0 β 6.5137 β 3.432141 β 1.0 β 12.0 β β var2 β 10,000 β 0 β 4.9782 β 3.154508 β 0.0 β 10.0 β β var3 β 10,000 β 0 β 25.1084 β 14.752302 β 0.0 β 50.0 β β var4 β 10,000 β 0 β 0.505666 β 0.287861 β 0.000027 β 0.999997 β β var5 β 10,000 β 0 β 0.4999 β 0.500025 β 0.0 β 1.0 β β missing_hd1 β 10,000 β 0 β 0.499985 β 0.288921 β 0.000129 β 0.99994 β β missing_hd2 β 10,000 β 0 β 0.499133 β 0.288946 β 0.000133 β 0.999972 β β var_hd1 β 10,000 β 0 β 0.523 β 0.499496 β 0.0 β 1.0 β β var_hd2 β 10,000 β 0 β -2.416816 β 10.339524 β -53.647677 β 27.06033 β βββββββββββββββ΄βββββββββ΄ββββββββββββββ΄βββββββββββββ΄ββββββββββββββ΄βββββββββββββ΄βββββββββββ
Look at the imputes
ββββββββββββββββ¬βββββββββ¬ββββββββββββββ¬βββββββββββββ¬ββββββββββββββ¬βββββββββββββ¬ββββββββββββ β Variable β n β n (missing) β mean β std β min β max β ββββββββββββββββͺβββββββββͺββββββββββββββͺβββββββββββββͺββββββββββββββͺβββββββββββββͺββββββββββββ‘ β ___rownumber β 10,000 β 0 β 4,999.5 β 2,886.89568 β 0.0 β 9,999.0 β β _row_index_ β 10,000 β 0 β 4,999.5 β 2,886.89568 β 0.0 β 9,999.0 β β index β 10,000 β 0 β 4,999.5 β 2,886.89568 β 0.0 β 9,999.0 β β year β 10,000 β 0 β 2,017.9851 β 1.415937 β 2,016.0 β 2,020.0 β β month β 10,000 β 0 β 6.5137 β 3.432141 β 1.0 β 12.0 β β var2 β 10,000 β 0 β 4.9782 β 3.154508 β 0.0 β 10.0 β β var3 β 10,000 β 0 β 25.1084 β 14.752302 β 0.0 β 50.0 β β var4 β 10,000 β 0 β 0.505666 β 0.287861 β 0.000027 β 0.999997 β β var5 β 10,000 β 0 β 0.4999 β 0.500025 β 0.0 β 1.0 β β var_hd1 β 10,000 β 0 β 0.5246 β 0.499419 β 0.0 β 1.0 β β var_hd2 β 10,000 β 0 β -2.318508 β 10.219462 β -51.051306 β 26.725014 β ββββββββββββββββ΄βββββββββ΄ββββββββββββββ΄βββββββββββββ΄ββββββββββββββ΄βββββββββββββ΄ββββββββββββ
ββββββββββββββββ¬βββββββββ¬ββββββββββββββ¬βββββββββββββ¬ββββββββββββββ¬βββββββββββββ¬ββββββββββββ β Variable β n β n (missing) β mean β std β min β max β ββββββββββββββββͺβββββββββͺββββββββββββββͺβββββββββββββͺββββββββββββββͺβββββββββββββͺββββββββββββ‘ β ___rownumber β 10,000 β 0 β 4,999.5 β 2,886.89568 β 0.0 β 9,999.0 β β _row_index_ β 10,000 β 0 β 4,999.5 β 2,886.89568 β 0.0 β 9,999.0 β β index β 10,000 β 0 β 4,999.5 β 2,886.89568 β 0.0 β 9,999.0 β β year β 10,000 β 0 β 2,017.9851 β 1.415937 β 2,016.0 β 2,020.0 β β month β 10,000 β 0 β 6.5137 β 3.432141 β 1.0 β 12.0 β β var2 β 10,000 β 0 β 4.9782 β 3.154508 β 0.0 β 10.0 β β var3 β 10,000 β 0 β 25.1084 β 14.752302 β 0.0 β 50.0 β β var4 β 10,000 β 0 β 0.505666 β 0.287861 β 0.000027 β 0.999997 β β var5 β 10,000 β 0 β 0.4999 β 0.500025 β 0.0 β 1.0 β β var_hd1 β 10,000 β 0 β 0.526 β 0.499349 β 0.0 β 1.0 β β var_hd2 β 10,000 β 0 β -2.303672 β 10.243192 β -51.051306 β 26.725014 β ββββββββββββββββ΄βββββββββ΄ββββββββββββββ΄βββββββββββββ΄ββββββββββββββ΄βββββββββββββ΄ββββββββββββ
Look at the imputes | var_hd1 == 0
ββββββββββββββββ¬ββββββββ¬ββββββββββββββ¬βββββββββββββββ¬βββββββββββββββ¬βββββββββββ¬βββββββββββ β Variable β n β n (missing) β mean β std β min β max β ββββββββββββββββͺββββββββͺββββββββββββββͺβββββββββββββββͺβββββββββββββββͺβββββββββββͺβββββββββββ‘ β ___rownumber β 4,754 β 0 β 5,041.065839 β 2,874.309024 β 1.0 β 9,997.0 β β _row_index_ β 4,754 β 0 β 5,041.065839 β 2,874.309024 β 1.0 β 9,997.0 β β index β 4,754 β 0 β 5,041.065839 β 2,874.309024 β 1.0 β 9,997.0 β β year β 4,754 β 0 β 2,017.984013 β 1.420803 β 2,016.0 β 2,020.0 β β month β 4,754 β 0 β 6.564367 β 3.414207 β 1.0 β 12.0 β β var2 β 4,754 β 0 β 4.490745 β 3.246844 β 0.0 β 10.0 β β var3 β 4,754 β 0 β 24.912495 β 12.861361 β 0.0 β 50.0 β β var4 β 4,754 β 0 β 0.506516 β 0.289372 β 0.000027 β 0.999997 β β var5 β 4,754 β 0 β 0.877787 β 0.327566 β 0.0 β 1.0 β β var_hd1 β 4,754 β 0 β 0.0 β 0.0 β 0.0 β 0.0 β β var_hd2 β 4,754 β 0 β 0.0 β 0.0 β 0.0 β 0.0 β ββββββββββββββββ΄ββββββββ΄ββββββββββββββ΄βββββββββββββββ΄βββββββββββββββ΄βββββββββββ΄βββββββββββ
ββββββββββββββββ¬ββββββββ¬ββββββββββββββ¬βββββββββββββββ¬βββββββββββββββ¬βββββββββββ¬βββββββββββ β Variable β n β n (missing) β mean β std β min β max β ββββββββββββββββͺββββββββͺββββββββββββββͺβββββββββββββββͺβββββββββββββββͺβββββββββββͺβββββββββββ‘ β ___rownumber β 4,740 β 0 β 5,043.214557 β 2,878.670922 β 1.0 β 9,997.0 β β _row_index_ β 4,740 β 0 β 5,043.214557 β 2,878.670922 β 1.0 β 9,997.0 β β index β 4,740 β 0 β 5,043.214557 β 2,878.670922 β 1.0 β 9,997.0 β β year β 4,740 β 0 β 2,017.989241 β 1.418419 β 2,016.0 β 2,020.0 β β month β 4,740 β 0 β 6.540717 β 3.413945 β 1.0 β 12.0 β β var2 β 4,740 β 0 β 4.505696 β 3.244384 β 0.0 β 10.0 β β var3 β 4,740 β 0 β 24.888608 β 12.864506 β 0.0 β 50.0 β β var4 β 4,740 β 0 β 0.507112 β 0.289628 β 0.000027 β 0.999997 β β var5 β 4,740 β 0 β 0.882068 β 0.322562 β 0.0 β 1.0 β β var_hd1 β 4,740 β 0 β 0.0 β 0.0 β 0.0 β 0.0 β β var_hd2 β 4,740 β 0 β 0.0 β 0.0 β 0.0 β 0.0 β ββββββββββββββββ΄ββββββββ΄ββββββββββββββ΄βββββββββββββββ΄βββββββββββββββ΄βββββββββββ΄βββββββββββ
Look at the imputes | var_hd1 == 1
ββββββββββββββββ¬ββββββββ¬ββββββββββββββ¬βββββββββββββββ¬βββββββββββββββ¬βββββββββββββ¬ββββββββββββ β Variable β n β n (missing) β mean β std β min β max β ββββββββββββββββͺββββββββͺββββββββββββββͺβββββββββββββββͺβββββββββββββββͺβββββββββββββͺββββββββββββ‘ β ___rownumber β 5,246 β 0 β 4,961.832444 β 2,898.013585 β 0.0 β 9,999.0 β β _row_index_ β 5,246 β 0 β 4,961.832444 β 2,898.013585 β 0.0 β 9,999.0 β β index β 5,246 β 0 β 4,961.832444 β 2,898.013585 β 0.0 β 9,999.0 β β year β 5,246 β 0 β 2,017.986085 β 1.411649 β 2,016.0 β 2,020.0 β β month β 5,246 β 0 β 6.467785 β 3.447995 β 1.0 β 12.0 β β var2 β 5,246 β 0 β 5.419939 β 3.001108 β 0.0 β 10.0 β β var3 β 5,246 β 0 β 25.285932 β 16.276467 β 0.0 β 50.0 β β var4 β 5,246 β 0 β 0.504895 β 0.28651 β 0.000104 β 0.999885 β β var5 β 5,246 β 0 β 0.157453 β 0.364262 β 0.0 β 1.0 β β var_hd1 β 5,246 β 0 β 1.0 β 0.0 β 1.0 β 1.0 β β var_hd2 β 5,246 β 0 β -4.419574 β 13.777174 β -51.051306 β 26.725014 β ββββββββββββββββ΄ββββββββ΄ββββββββββββββ΄βββββββββββββββ΄βββββββββββββββ΄βββββββββββββ΄ββββββββββββ
ββββββββββββββββ¬ββββββββ¬ββββββββββββββ¬βββββββββββββββ¬βββββββββββββββ¬βββββββββββββ¬ββββββββββββ β Variable β n β n (missing) β mean β std β min β max β ββββββββββββββββͺββββββββͺββββββββββββββͺβββββββββββββββͺβββββββββββββββͺβββββββββββββͺββββββββββββ‘ β ___rownumber β 5,260 β 0 β 4,960.107034 β 2,893.995247 β 0.0 β 9,999.0 β β _row_index_ β 5,260 β 0 β 4,960.107034 β 2,893.995247 β 0.0 β 9,999.0 β β index β 5,260 β 0 β 4,960.107034 β 2,893.995247 β 0.0 β 9,999.0 β β year β 5,260 β 0 β 2,017.981369 β 1.413822 β 2,016.0 β 2,020.0 β β month β 5,260 β 0 β 6.489354 β 3.448599 β 1.0 β 12.0 β β var2 β 5,260 β 0 β 5.403992 β 3.008656 β 0.0 β 10.0 β β var3 β 5,260 β 0 β 25.306464 β 16.265575 β 0.0 β 50.0 β β var4 β 5,260 β 0 β 0.504362 β 0.286281 β 0.000104 β 0.999885 β β var5 β 5,260 β 0 β 0.155513 β 0.362428 β 0.0 β 1.0 β β var_hd1 β 5,260 β 0 β 1.0 β 0.0 β 1.0 β 1.0 β β var_hd2 β 5,260 β 0 β -4.379605 β 13.798472 β -51.051306 β 26.725014 β ββββββββββββββββ΄ββββββββ΄ββββββββββββββ΄βββββββββββββββ΄βββββββββββββββ΄βββββββββββββ΄ββββββββββββ