In [1]:
from __future__ import annotations
import polars as pl
from sklearn.linear_model import LinearRegression
from survey_kit.utilities.random import RandomData
from survey_kit.utilities.dataframe import summary, columns_from_list
from survey_kit.statistics.calculator import StatCalculator
from survey_kit.statistics.replicates import Replicates
from survey_kit.statistics.bootstrap import bayes_bootstrap
from survey_kit.utilities.random import set_seed, generate_seed
from survey_kit import logger
In [2]:
# Draw some random data
def gen_random_table(n_rows: int, n_replicates: int, seed: int):
set_seed(seed)
df = (
RandomData(n_rows=n_rows, seed=generate_seed())
.index("index")
.integer("v_1", 0, 10)
.float("v_2", 0, 100)
.float("v_3", 0, 100)
.float("v_4", 0, 100)
.np_distribution("e", "normal", loc=0, scale=100)
)
df = df.to_df()
c_v1 = pl.col("v_1")
c_v2 = pl.col("v_2")
c_v3 = pl.col("v_3")
c_v4 = pl.col("v_4")
e = pl.col("e")
df = df.with_columns(
y=10*c_v1-5*c_v2+100*c_v3+c_v4 + e
)
df = (
pl.concat(
[
df,
bayes_bootstrap(
n_rows=n_rows,
n_draws=n_replicates+1,
seed=generate_seed(),
initial_weight_index=0,
prefix="weight_"
)
],
how="horizontal"
)
.lazy()
)
return df
In [3]:
# Simulation settings: sample size and number of bootstrap replicates
n_rows = 1_000
n_replicates = 10
logger.info("Create two datasets to compare, with replicate weights (or bootstrap weights)")
# Two independent draws from the same data-generating process
df = gen_random_table(n_rows, n_replicates, seed=1230)
df_compare = gen_random_table(n_rows, n_replicates, seed=3254)
logger.info("The data")
# Quick variable-level summary table (assigned to _ to suppress cell output)
_ = summary(df)
Create two datasets to compare, with replicate weights (or bootstrap weights)
The data
┌───────────┬───────┬─────────────┬──────────────┬──────────────┬─────────────┬───────────────┐ │ Variable ┆ n ┆ n (missing) ┆ mean ┆ std ┆ min ┆ max │ ╞═══════════╪═══════╪═════════════╪══════════════╪══════════════╪═════════════╪═══════════════╡ │ index ┆ 1,000 ┆ 0 ┆ 499.5 ┆ 288.819436 ┆ 0.0 ┆ 999.0 │ │ v_1 ┆ 1,000 ┆ 0 ┆ 5.128 ┆ 3.17832 ┆ 0.0 ┆ 10.0 │ │ v_2 ┆ 1,000 ┆ 0 ┆ 52.096715 ┆ 29.123653 ┆ 0.0944 ┆ 99.769011 │ │ v_3 ┆ 1,000 ┆ 0 ┆ 50.852027 ┆ 28.367909 ┆ 0.060494 ┆ 99.948326 │ │ v_4 ┆ 1,000 ┆ 0 ┆ 51.103943 ┆ 28.838153 ┆ 0.101189 ┆ 99.570176 │ │ e ┆ 1,000 ┆ 0 ┆ 2.416973 ┆ 99.344919 ┆ -297.765179 ┆ 353.500711 │ │ y ┆ 1,000 ┆ 0 ┆ 4,929.520027 ┆ 2,846.548527 ┆ -388.726059 ┆ 10,027.774643 │ │ weight_0 ┆ 1,000 ┆ 0 ┆ 0.990942 ┆ 0.959858 ┆ 0.000929 ┆ 5.832272 │ │ weight_1 ┆ 1,000 ┆ 0 ┆ 0.979466 ┆ 1.031472 ┆ 0.000465 ┆ 8.610054 │ │ weight_2 ┆ 1,000 ┆ 0 ┆ 1.000521 ┆ 0.982858 ┆ 0.000443 ┆ 6.123822 │ │ weight_3 ┆ 1,000 ┆ 0 ┆ 0.97896 ┆ 0.993243 ┆ 0.001089 ┆ 8.561238 │ │ weight_4 ┆ 1,000 ┆ 0 ┆ 0.999087 ┆ 0.954473 ┆ 0.00275 ┆ 7.249914 │ │ weight_5 ┆ 1,000 ┆ 0 ┆ 1.005718 ┆ 0.977519 ┆ 0.00222 ┆ 6.85744 │ │ weight_6 ┆ 1,000 ┆ 0 ┆ 1.002752 ┆ 1.014994 ┆ 0.001331 ┆ 7.676241 │ │ weight_7 ┆ 1,000 ┆ 0 ┆ 1.004118 ┆ 0.96822 ┆ 0.000878 ┆ 7.03514 │ │ weight_8 ┆ 1,000 ┆ 0 ┆ 1.006493 ┆ 0.943742 ┆ 0.000563 ┆ 5.516431 │ │ weight_9 ┆ 1,000 ┆ 0 ┆ 0.996047 ┆ 0.997422 ┆ 0.001608 ┆ 6.8256 │ │ weight_10 ┆ 1,000 ┆ 0 ┆ 1.059505 ┆ 1.071869 ┆ 0.000018 ┆ 7.749995 │ └───────────┴───────┴─────────────┴──────────────┴──────────────┴─────────────┴───────────────┘
In [4]:
# Explain the contract required by StatCalculator.from_function
logger.info("To run an arbitrary stat, you need")
logger.info(" any function that takes a dataframe and a weight and returns a dataframe")
# Fix: "boostrap" -> "bootstrap" in the log message
logger.info("The function will be run for each bootstrap/replicate weight")
def run_regression(df: pl.LazyFrame | pl.DataFrame,
                   y: str,
                   X: list[str],
                   weight: str) -> pl.LazyFrame | pl.DataFrame:
    """Weighted OLS of ``y`` on ``X``; returns one row per coefficient.

    Parameters
    ----------
    df : input data (lazy or eager); collected internally
    y : name of the outcome column
    X : names of the regressor columns
    weight : name of the observation-weight column

    Returns
    -------
    pl.DataFrame with columns ``Variable`` (regressor names plus
    ``"_Intercept_"``) and ``Beta`` (fitted coefficients).
    """
    df = df.lazy().collect()
    model = LinearRegression()
    model.fit(
        X=df.select(X),
        y=df.select(y),
        sample_weight=df[weight]
    )
    # coef_ has shape (1, n_features) because y was passed as a one-column
    # frame; intercept_ is a length-1 array.
    # Fix: drop the redundant double float() conversion of each coefficient,
    # and read the regressor names directly from the collected frame instead
    # of the roundabout .lazy().collect_schema().names().
    betas = [float(coefi) for coefi in model.coef_[0]]
    betas.append(float(model.intercept_[0]))
    df_betas = pl.DataFrame(
        dict(
            Variable=df.select(X).columns + ["_Intercept_"],
            Beta=betas,
        )
    )
    return df_betas
To run an arbitrary stat, you need
any function that takes a dataframe and a weight and returns a dataframe
The function will be run for each bootstrap/replicate weight
In [5]:
logger.info("The betas from the regression")
# Point estimates: a single regression using the full-sample weight (weight_0)
df_betas = run_regression(df=df,y="y",X=columns_from_list(df,["v_*"]),weight="weight_0")
logger.info(df_betas)
logger.info("Call StatCalculator.from_function")
logger.info(" Pass in the data")
logger.info(" 'estimate_ids': the name of the column(s) that index the estimates")
logger.info(" 'arguments': stable (non df and non-weight) arguments")
logger.info(" 'replicates': weights to loop over")
sc = StatCalculator.from_function(
    run_regression,
    df=df,
    # Column(s) that uniquely identify each estimate row in the output
    estimate_ids=["Variable"],
    # Arguments held fixed across replicate runs (everything except
    # the dataframe and the weight column)
    arguments=dict(
        y="y",
        X=columns_from_list(df,["v_*"])
    ),
    # Weight columns to loop over: weight_0 .. weight_{n_replicates}
    replicates=Replicates(
        weight_stub="weight_",
        n_replicates=n_replicates,
        bootstrap=True
    ),
    display=False
)
logger.info("The result")
# Prints each point estimate with its bootstrap standard error beneath it
sc.print()
The betas from the regression
shape: (5, 2) ┌─────────────┬────────────┐ │ Variable ┆ Beta │ │ --- ┆ --- │ │ str ┆ f64 │ ╞═════════════╪════════════╡ │ v_1 ┆ 10.673025 │ │ v_2 ┆ -5.061185 │ │ v_3 ┆ 100.09039 │ │ v_4 ┆ 1.081332 │ │ _Intercept_ ┆ -14.280409 │ └─────────────┴────────────┘
Call StatCalculator.from_function
Pass in the data
'estimate_ids': the name of the column(s) that index the estimates
'arguments': stable (non df and non-weight) arguments
'replicates': weights to loop over
Running run_regression
0
.
.
.
.
5
.
.
.
.
10
The result
┌─────────────┬────────────┐ │ Variable ┆ Beta │ ╞═════════════╪════════════╡ │ v_1 ┆ 10.673025 │ │ ┆ 1.639466 │ │ v_2 ┆ -5.061185 │ │ ┆ 0.193846 │ │ v_3 ┆ 100.09039 │ │ ┆ 0.263585 │ │ v_4 ┆ 1.081332 │ │ ┆ 0.186253 │ │ _Intercept_ ┆ -14.280409 │ │ ┆ 35.065297 │ └─────────────┴────────────┘
In [6]:
logger.info("You can compare it like any other stat, run the same regression on another sample")
logger.info(" Run the regression on a different random draw")
# Same estimator, run on the second independent draw (df_compare)
sc_compare = StatCalculator.from_function(
    run_regression,
    df=df_compare,
    estimate_ids=["Variable"],
    arguments=dict(
        y="y",
        # Fix: derive the regressor list from df_compare (the data this
        # calculator actually runs on), not from df. Column names coincide
        # here so results are unchanged, but the original silently coupled
        # this cell to the other dataset.
        X=columns_from_list(df_compare,["v_*"])
    ),
    replicates=Replicates(
        weight_stub="weight_",
        n_replicates=n_replicates,
        bootstrap=True
    ),
    display=False
)
You can compare it like any other stat, run the same regression on another sample
Run the regression on a different random draw
Running run_regression
0
.
.
.
.
5
.
.
.
.
10
In [7]:
logger.info(" And compare the results")
# Compare the two calculators; per the output below, both a Difference and a
# Ratio table are produced, each with bootstrap standard errors
d_comp = sc.compare(sc_compare)
And compare the results
Comparing estimates using bootstrap weights
Difference
┌─────────────┬────────────┐ │ Variable ┆ Beta │ ╞═════════════╪════════════╡ │ v_1 ┆ 0.834108 │ │ ┆ 1.395571 │ │ v_2 ┆ 0.123172 │ │ ┆ 0.122854 │ │ v_3 ┆ -0.016832 │ │ ┆ 0.167508 │ │ v_4 ┆ 0.05095 │ │ ┆ 0.114858 │ │ _Intercept_ ┆ -10.521408 │ │ ┆ 19.993532 │ └─────────────┴────────────┘
Ratio
┌─────────────┬───────────┐ │ Variable ┆ Beta │ ╞═════════════╪═══════════╡ │ v_1 ┆ 0.078151 │ │ ┆ 0.133234 │ │ v_2 ┆ -0.024337 │ │ ┆ 0.024386 │ │ v_3 ┆ -0.000168 │ │ ┆ 0.001675 │ │ v_4 ┆ 0.047118 │ │ ┆ 0.111966 │ │ _Intercept_ ┆ 0.736772 │ │ ┆ 9.945254 │ └─────────────┴───────────┘