In [1]:
from __future__ import annotations
import polars as pl
from survey_kit.utilities.dataframe import summary, join_wrapper
from survey_kit.imputation.srmi import SRMI
from survey_kit import logger, config
from survey_kit.statistics.multiple_imputation import mi_ses_from_function
from survey_kit.statistics.calculator import StatCalculator
from survey_kit.statistics.replicates import Replicates
from survey_kit.utilities.dataframe import safe_height
from survey_kit.statistics.bootstrap import bayes_bootstrap
In [2]:
logger.info("Load the data from the SRMI tutorial")
path_model = f"{config.path_temp_files}/py_srmi_test_gbm"
srmi = SRMI.load(path_model)
df_implicates = srmi.df_implicates
Load the data from the SRMI tutorial
In [3]:
logger.info("Make random weights for the bootstrap")
n_rows = safe_height(df_implicates[0])
n_replicates = 10
df_weights = (
bayes_bootstrap(
n_rows=n_rows,
n_draws=n_replicates+1,
seed=8345,
prefix="weight_",
initial_weight_index=0,
)
.with_row_index("index")
)
Make random weights for the bootstrap
In [4]:
logger.info("What's the data look like")
logger.info(" The SRMI data - both implicates")
_ = df_implicates.pipe(summary)
logger.info(" The weights")
_ = summary(df_weights)
What's the data look like
The SRMI data - both implicates
┌─────────────┬────────┬─────────────┬────────────┬─────────────┬────────────┬──────────────┐ │ Variable ┆ n ┆ n (missing) ┆ mean ┆ std ┆ min ┆ max │ ╞═════════════╪════════╪═════════════╪════════════╪═════════════╪════════════╪══════════════╡ │ index ┆ 10,000 ┆ 0 ┆ 4,999.5 ┆ 2,886.89568 ┆ 0.0 ┆ 9,999.0 │ │ _row_index_ ┆ 10,000 ┆ 0 ┆ 4,999.5 ┆ 2,886.89568 ┆ 0.0 ┆ 9,999.0 │ │ year ┆ 10,000 ┆ 0 ┆ 2,017.9851 ┆ 1.415937 ┆ 2,016.0 ┆ 2,020.0 │ │ month ┆ 10,000 ┆ 0 ┆ 6.5137 ┆ 3.432141 ┆ 1.0 ┆ 12.0 │ │ var2 ┆ 10,000 ┆ 0 ┆ 4.9782 ┆ 3.154508 ┆ 0.0 ┆ 10.0 │ │ var3 ┆ 10,000 ┆ 0 ┆ 25.1084 ┆ 14.752302 ┆ 0.0 ┆ 50.0 │ │ var4 ┆ 10,000 ┆ 0 ┆ 0.505666 ┆ 0.287861 ┆ 0.000027 ┆ 0.999997 │ │ var5 ┆ 10,000 ┆ 0 ┆ 0.4999 ┆ 0.500025 ┆ 0.0 ┆ 1.0 │ │ unrelated_1 ┆ 10,000 ┆ 0 ┆ 0.502449 ┆ 0.288359 ┆ 0.000119 ┆ 0.999997 │ │ unrelated_2 ┆ 10,000 ┆ 0 ┆ 0.500105 ┆ 0.287638 ┆ 0.000049 ┆ 0.999539 │ │ unrelated_3 ┆ 10,000 ┆ 0 ┆ 0.499175 ┆ 0.28876 ┆ 0.000129 ┆ 0.99994 │ │ unrelated_4 ┆ 10,000 ┆ 0 ┆ 0.500655 ┆ 0.288698 ┆ 0.000133 ┆ 0.999972 │ │ unrelated_5 ┆ 10,000 ┆ 0 ┆ 0.49876 ┆ 0.288979 ┆ 0.000071 ┆ 0.999867 │ │ repeat_1 ┆ 10,000 ┆ 0 ┆ 0.502449 ┆ 0.288359 ┆ 0.000119 ┆ 0.999997 │ │ var_gbm3 ┆ 10,000 ┆ 1,212 ┆ -2.721713 ┆ 10.901329 ┆ -55.354108 ┆ 26.213084 │ │ var_gbm2 ┆ 10,000 ┆ 0 ┆ -2.361046 ┆ 10.164599 ┆ -55.354108 ┆ 26.213084 │ │ var_gbm1 ┆ 10,000 ┆ 0 ┆ 0.5278 ┆ 0.499252 ┆ 0.0 ┆ 1.0 │ │ var_gbm12 ┆ 10,000 ┆ 0 ┆ -2.361046 ┆ 10.164599 ┆ -55.354108 ┆ 26.213084 │ │ var_gbm2_sq ┆ 10,000 ┆ 0 ┆ 108.883286 ┆ 269.40042 ┆ 0.0 ┆ 3,064.077326 │ └─────────────┴────────┴─────────────┴────────────┴─────────────┴────────────┴──────────────┘
┌─────────────┬────────┬─────────────┬────────────┬─────────────┬────────────┬──────────────┐ │ Variable ┆ n ┆ n (missing) ┆ mean ┆ std ┆ min ┆ max │ ╞═════════════╪════════╪═════════════╪════════════╪═════════════╪════════════╪══════════════╡ │ index ┆ 10,000 ┆ 0 ┆ 4,999.5 ┆ 2,886.89568 ┆ 0.0 ┆ 9,999.0 │ │ _row_index_ ┆ 10,000 ┆ 0 ┆ 4,999.5 ┆ 2,886.89568 ┆ 0.0 ┆ 9,999.0 │ │ year ┆ 10,000 ┆ 0 ┆ 2,017.9851 ┆ 1.415937 ┆ 2,016.0 ┆ 2,020.0 │ │ month ┆ 10,000 ┆ 0 ┆ 6.5137 ┆ 3.432141 ┆ 1.0 ┆ 12.0 │ │ var2 ┆ 10,000 ┆ 0 ┆ 4.9782 ┆ 3.154508 ┆ 0.0 ┆ 10.0 │ │ var3 ┆ 10,000 ┆ 0 ┆ 25.1084 ┆ 14.752302 ┆ 0.0 ┆ 50.0 │ │ var4 ┆ 10,000 ┆ 0 ┆ 0.505666 ┆ 0.287861 ┆ 0.000027 ┆ 0.999997 │ │ var5 ┆ 10,000 ┆ 0 ┆ 0.4999 ┆ 0.500025 ┆ 0.0 ┆ 1.0 │ │ unrelated_1 ┆ 10,000 ┆ 0 ┆ 0.502449 ┆ 0.288359 ┆ 0.000119 ┆ 0.999997 │ │ unrelated_2 ┆ 10,000 ┆ 0 ┆ 0.500105 ┆ 0.287638 ┆ 0.000049 ┆ 0.999539 │ │ unrelated_3 ┆ 10,000 ┆ 0 ┆ 0.499175 ┆ 0.28876 ┆ 0.000129 ┆ 0.99994 │ │ unrelated_4 ┆ 10,000 ┆ 0 ┆ 0.500655 ┆ 0.288698 ┆ 0.000133 ┆ 0.999972 │ │ unrelated_5 ┆ 10,000 ┆ 0 ┆ 0.49876 ┆ 0.288979 ┆ 0.000071 ┆ 0.999867 │ │ repeat_1 ┆ 10,000 ┆ 0 ┆ 0.502449 ┆ 0.288359 ┆ 0.000119 ┆ 0.999997 │ │ var_gbm3 ┆ 10,000 ┆ 1,219 ┆ -2.692408 ┆ 10.886586 ┆ -55.354108 ┆ 26.213084 │ │ var_gbm2 ┆ 10,000 ┆ 0 ┆ -2.299835 ┆ 10.146886 ┆ -55.354108 ┆ 26.213084 │ │ var_gbm1 ┆ 10,000 ┆ 0 ┆ 0.5284 ┆ 0.499218 ┆ 0.0 ┆ 1.0 │ │ var_gbm12 ┆ 10,000 ┆ 0 ┆ -2.299835 ┆ 10.146886 ┆ -55.354108 ┆ 26.213084 │ │ var_gbm2_sq ┆ 10,000 ┆ 0 ┆ 108.238244 ┆ 271.231778 ┆ 0.0 ┆ 3,064.077326 │ └─────────────┴────────┴─────────────┴────────────┴─────────────┴────────────┴──────────────┘
The weights
┌───────────┬────────┬─────────────┬──────────┬─────────────┬──────────┬───────────┐ │ Variable ┆ n ┆ n (missing) ┆ mean ┆ std ┆ min ┆ max │ ╞═══════════╪════════╪═════════════╪══════════╪═════════════╪══════════╪═══════════╡ │ index ┆ 10,000 ┆ 0 ┆ 4,999.5 ┆ 2,886.89568 ┆ 0.0 ┆ 9,999.0 │ │ weight_0 ┆ 10,000 ┆ 0 ┆ 0.996552 ┆ 0.988544 ┆ 0.000027 ┆ 9.548572 │ │ weight_1 ┆ 10,000 ┆ 0 ┆ 0.994871 ┆ 1.013359 ┆ 0.000087 ┆ 9.40968 │ │ weight_2 ┆ 10,000 ┆ 0 ┆ 1.013333 ┆ 0.990934 ┆ 0.000017 ┆ 10.246139 │ │ weight_3 ┆ 10,000 ┆ 0 ┆ 1.015993 ┆ 1.017586 ┆ 0.000047 ┆ 10.149365 │ │ weight_4 ┆ 10,000 ┆ 0 ┆ 1.000038 ┆ 1.003878 ┆ 0.00001 ┆ 8.684298 │ │ weight_5 ┆ 10,000 ┆ 0 ┆ 1.015135 ┆ 1.023968 ┆ 0.000229 ┆ 9.941725 │ │ weight_6 ┆ 10,000 ┆ 0 ┆ 1.001943 ┆ 0.990688 ┆ 0.000108 ┆ 9.909087 │ │ weight_7 ┆ 10,000 ┆ 0 ┆ 1.003338 ┆ 0.992315 ┆ 0.00015 ┆ 10.318354 │ │ weight_8 ┆ 10,000 ┆ 0 ┆ 1.000687 ┆ 1.001554 ┆ 0.000318 ┆ 11.09299 │ │ weight_9 ┆ 10,000 ┆ 0 ┆ 0.991478 ┆ 1.005367 ┆ 0.000052 ┆ 8.802607 │ │ weight_10 ┆ 10,000 ┆ 0 ┆ 0.983393 ┆ 0.980691 ┆ 0.000005 ┆ 8.388088 │ └───────────┴────────┴─────────────┴──────────┴─────────────┴──────────┴───────────┘
In [5]:
logger.info("To run an arbitrary stat, you need")
logger.info(" any function that takes a dataframe and a weight and returns a dataframe")
logger.info("The function will be run for each boostrap/replicate weight")
logger.info(" Note: you may need to import packages within the function")
logger.info(" if executed in parallel (it will get pickled and reloaded)")
def run_regression(df:pl.LazyFrame | pl.DataFrame,
y:str,
X:list[str],
weight:str="") -> pl.LazyFrame | pl.DataFrame:
import polars as pl
from sklearn.linear_model import LinearRegression
df = df.lazy().collect()
model = LinearRegression()
d_weight = {}
if weight != "":
d_weight["sample_weight"] = df[weight]
model.fit(
X=df.select(X),
y=df.select(y),
**d_weight
)
df_betas = pl.DataFrame(
dict(
Variable=df.select(X).lazy().collect_schema().names() + ["_Intercept_"],
Beta=[
float(vali)
for vali in [float(coefi) for coefi in model.coef_[0]] + [float(model.intercept_[0])]
],
)
)
return df_betas
To run an arbitrary stat, you need
any function that takes a dataframe and a weight and returns a dataframe
The function will be run for each boostrap/replicate weight
Note: you may need to import packages within the function
if executed in parallel (it will get pickled and reloaded)
In [6]:
logger.info("The betas from the regression (no weights, implicate 0)")
y = "var_gbm2"
X = ["var2","var3","var4","var5"]
df_betas = run_regression(df=df_implicates[0],y=y,X=X)
logger.info(df_betas)
The betas from the regression (no weights, implicate 0)
shape: (5, 2) ┌─────────────┬────────────┐ │ Variable ┆ Beta │ │ --- ┆ --- │ │ str ┆ f64 │ ╞═════════════╪════════════╡ │ var2 ┆ 0.59185 │ │ var3 ┆ -0.328883 │ │ var4 ┆ -12.990215 │ │ var5 ┆ 2.933262 │ │ _Intercept_ ┆ 8.052709 │ └─────────────┴────────────┘
In [7]:
logger.info("Confirming how to run it on one implicate")
arguments_sc = dict(
y=y,
X=X,
)
replicates = Replicates(
weight_stub="weight_",
n_replicates=n_replicates,
bootstrap=True
)
sc_compare = StatCalculator.from_function(
run_regression,
df=join_wrapper(df_implicates[0],df_weights,on=["index"],how="left"),
estimate_ids=["Variable"],
arguments=arguments_sc,
replicates=replicates,
display=False
)
sc_compare.print()
Confirming how to run it on one implicate
Running run_regression
0
.
.
.
.
5
.
.
.
.
10
┌─────────────┬────────────┐ │ Variable ┆ Beta │ ╞═════════════╪════════════╡ │ var2 ┆ 0.588633 │ │ ┆ 0.07485 │ │ var3 ┆ -0.329186 │ │ ┆ 0.00812 │ │ var4 ┆ -13.066947 │ │ ┆ 0.646009 │ │ var5 ┆ 2.741127 │ │ ┆ 0.637431 │ │ _Intercept_ ┆ 8.178235 │ │ ┆ 0.660875 │ └─────────────┴────────────┘
In [8]:
logger.info("Now the full function")
logger.info(" Note the nested arguments...")
logger.info(" Maybe not the best API, but it does mean")
logger.info(" that you can run ANYTHING that takes in a")
logger.info(" dataframe and a weight and returns a dataframe of estiamtes")
logger.info(" and run it through the bootstrap + MI process")
arguments_sc = dict(
y=y,
X=X,
)
arguments_mi = dict(
delegate=run_regression,
arguments=arguments_sc,
estimate_ids=["Variable"],
replicates=replicates,
)
mi_results_seq = mi_ses_from_function(
delegate=StatCalculator.from_function,
df_implicates=df_implicates,
path_srmi=path_model,
df_noimputes=df_weights,
index=["index"],
arguments=arguments_mi,
join_on=["Variable"],
parallel=False,
)
mi_results_seq.print()
Now the full function
Note the nested arguments...
Maybe not the best API, but it does mean
that you can run ANYTHING that takes in a
dataframe and a weight and returns a dataframe of estiamtes
and run it through the bootstrap + MI process
Implicate #1
Running run_regression
0
.
.
.
.
5
.
.
.
.
10
┌─────────────┬────────────┐ │ Variable ┆ Beta │ ╞═════════════╪════════════╡ │ var2 ┆ 0.588633 │ │ ┆ 0.07485 │ │ var3 ┆ -0.329186 │ │ ┆ 0.00812 │ │ var4 ┆ -13.066947 │ │ ┆ 0.646009 │ │ var5 ┆ 2.741127 │ │ ┆ 0.637431 │ │ _Intercept_ ┆ 8.178235 │ │ ┆ 0.660875 │ └─────────────┴────────────┘
Implicate #2
Running run_regression
0
.
.
.
.
5
.
.
.
.
10
┌─────────────┬───────────┐ │ Variable ┆ Beta │ ╞═════════════╪═══════════╡ │ var2 ┆ 0.58445 │ │ ┆ 0.07622 │ │ var3 ┆ -0.328438 │ │ ┆ 0.013036 │ │ var4 ┆ -13.10739 │ │ ┆ 0.57152 │ │ var5 ┆ 2.565298 │ │ ┆ 0.87618 │ │ _Intercept_ ┆ 8.398606 │ │ ┆ 0.988309 │ └─────────────┴───────────┘
┌─────────────┬────────────┐ │ Variable ┆ Beta │ ╞═════════════╪════════════╡ │ var2 ┆ 0.586542 │ │ ┆ 0.075625 │ │ var3 ┆ -0.328812 │ │ ┆ 0.010879 │ │ var4 ┆ -13.087169 │ │ ┆ 0.610907 │ │ var5 ┆ 2.653212 │ │ ┆ 0.781148 │ │ _Intercept_ ┆ 8.28842 │ │ ┆ 0.862078 │ └─────────────┴────────────┘
In [9]:
logger.info("For parallel, it's better to pass the path of the model")
logger.info(" To avoid unnecessary I/O")
logger.info(" (For sequential it doesn't really matter)")
mi_results = mi_ses_from_function(
delegate=StatCalculator.from_function,
# df_implicates=df_implicates,
path_srmi=path_model,
df_noimputes=df_weights,
index=["index"],
arguments=arguments_mi,
join_on=["Variable"],
parallel=True,
)
For parallel, it's better to pass the path of the model
To avoid unnecessary I/O
(For sequential it doesn't really matter)
Defaulting to shell for parallel estimation
Starting call:
Starting call at 0 seconds
Because: RunBecause.SetToRun
Function: _mi_ses_from_function_one_implicate
language: Languages.Python
BEGIN CODE
from survey_kit.orchestration.config import Config
Config().cpus = 1
from survey_kit.utilities.logging import set_logging, PrintLogger
set_logging(path_log='.',force=True,to_console=False,append_to_file=True,name='.')
capture_prints_log = PrintLogger('.',to_console=False)
from survey_kit.statistics.multiple_imputation import _mi_ses_from_function_one_implicate
from survey_kit.serializable import SerializableDictionary
d_implicate = SerializableDictionary.load(path='C:/Users/jonro/OneDrive/Documents/Coding/survey_kit/.scratch/temp_files/pmcvruu8')
_mi_ses_from_function_one_implicate(implicate_number=0,path_save='C:/Users/jonro/OneDrive/Documents/Coding/survey_kit/.scratch/temp_files/pmcvruu8_implicate_0',**d_implicate)
END CODE
Starting call at 0 seconds
Because: RunBecause.SetToRun
Function: _mi_ses_from_function_one_implicate
language: Languages.Python
BEGIN CODE
from survey_kit.orchestration.config import Config
Config().cpus = 1
from survey_kit.utilities.logging import set_logging, PrintLogger
set_logging(path_log='.',force=True,to_console=False,append_to_file=True,name='.')
capture_prints_log = PrintLogger('.',to_console=False)
from survey_kit.statistics.multiple_imputation import _mi_ses_from_function_one_implicate
from survey_kit.serializable import SerializableDictionary
d_implicate = SerializableDictionary.load(path='C:/Users/jonro/OneDrive/Documents/Coding/survey_kit/.scratch/temp_files/pmcvruu8')
_mi_ses_from_function_one_implicate(implicate_number=1,path_save='C:/Users/jonro/OneDrive/Documents/Coding/survey_kit/.scratch/temp_files/pmcvruu8_implicate_1',**d_implicate)
END CODE
.
.
.
.
.
.
.
.
.
.
.
.
LOGGING INFORMATION FOR:
FILE: C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files\001__mi_ses_from_function_one_implicate_q5osm_1p.py
PRINT OUTPUT BEGIN
PRINT OUTPUT END
LOGGING INFORMATION FOR:
FILE: C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files\002__mi_ses_from_function_one_implicate_97o5gvm9.py
PRINT OUTPUT BEGIN
PRINT OUTPUT END
Complete=True
Removing existing directory C:/Users/jonro/OneDrive/Documents/Coding/survey_kit/.scratch/temp_files/pmcvruu8_implicate_0.replicate_stats
Removing existing directory C:/Users/jonro/OneDrive/Documents/Coding/survey_kit/.scratch/temp_files/pmcvruu8_implicate_1.replicate_stats
Removing existing directory C:/Users/jonro/OneDrive/Documents/Coding/survey_kit/.scratch/temp_files/pmcvruu8.dict
In [10]:
logger.info("The results")
mi_results.print()
mi_results_seq.print()
The results
┌─────────────┬────────────┐ │ Variable ┆ Beta │ ╞═════════════╪════════════╡ │ var2 ┆ 0.586542 │ │ ┆ 0.075625 │ │ var3 ┆ -0.328812 │ │ ┆ 0.010879 │ │ var4 ┆ -13.087169 │ │ ┆ 0.610907 │ │ var5 ┆ 2.653212 │ │ ┆ 0.781148 │ │ _Intercept_ ┆ 8.28842 │ │ ┆ 0.862078 │ └─────────────┴────────────┘
┌─────────────┬────────────┐ │ Variable ┆ Beta │ ╞═════════════╪════════════╡ │ var2 ┆ 0.586542 │ │ ┆ 0.075625 │ │ var3 ┆ -0.328812 │ │ ┆ 0.010879 │ │ var4 ┆ -13.087169 │ │ ┆ 0.610907 │ │ var5 ┆ 2.653212 │ │ ┆ 0.781148 │ │ _Intercept_ ┆ 8.28842 │ │ ┆ 0.862078 │ └─────────────┴────────────┘
In [11]:
logger.info("Compare two sets of results")
d_comparison = mi_results.compare(mi_results_seq)
d_comparison["difference"].print()
d_comparison["ratio"].print()
Compare two sets of results
┌─────────────┬──────┐ │ Variable ┆ Beta │ ╞═════════════╪══════╡ │ var2 ┆ 0.0 │ │ ┆ 0.0 │ │ var3 ┆ 0.0 │ │ ┆ 0.0 │ │ var4 ┆ 0.0 │ │ ┆ 0.0 │ │ var5 ┆ 0.0 │ │ ┆ 0.0 │ │ _Intercept_ ┆ 0.0 │ │ ┆ 0.0 │ └─────────────┴──────┘
┌─────────────┬──────┐ │ Variable ┆ Beta │ ╞═════════════╪══════╡ │ var2 ┆ 0.0 │ │ ┆ 0.0 │ │ var3 ┆ 0.0 │ │ ┆ 0.0 │ │ var4 ┆ 0.0 │ │ ┆ 0.0 │ │ var5 ┆ 0.0 │ │ ┆ 0.0 │ │ _Intercept_ ┆ 0.0 │ │ ┆ 0.0 │ └─────────────┴──────┘