In [1]:
from __future__ import annotations

import polars as pl
from sklearn.linear_model import LinearRegression

from survey_kit.utilities.random import RandomData, set_seed, generate_seed
from survey_kit.utilities.dataframe import summary, columns_from_list
from survey_kit.statistics.calculator import StatCalculator
from survey_kit.statistics.replicates import Replicates
from survey_kit.statistics.bootstrap import bayes_bootstrap

from survey_kit import logger
In [2]:
# Draw some random data
def gen_random_table(n_rows: int, n_replicates: int, seed: int):
    set_seed(seed)
    df = (
        RandomData(n_rows=n_rows, seed=generate_seed())
        .index("index")
        .integer("v_1", 0, 10)
        .float("v_2", 0, 100)
        .float("v_3", 0, 100)
        .float("v_4", 0, 100)
        
        .np_distribution("e", "normal", loc=0, scale=100)
    )

    df = df.to_df()

    c_v1 = pl.col("v_1")
    c_v2 = pl.col("v_2")
    c_v3 = pl.col("v_3")
    c_v4 = pl.col("v_4")
    e = pl.col("e")
    # y is a linear combination of the v's plus normal noise
    df = df.with_columns(
        y=10 * c_v1 - 5 * c_v2 + 100 * c_v3 + c_v4 + e
    )

    # Attach the replicate weights (weight_0 is the main weight) and return lazily
    df = (
        pl.concat(
            [
                df,
                bayes_bootstrap(
                    n_rows=n_rows,
                    n_draws=n_replicates + 1,
                    seed=generate_seed(),
                    initial_weight_index=0,
                    prefix="weight_",
                ),
            ],
            how="horizontal",
        )
        .lazy()
    )
    
    return df
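For intuition: a Bayesian bootstrap replicate (Rubin, 1981) reweights the sample with Dirichlet(1, ..., 1) draws, typically rescaled so the weights average to 1. The sketch below is plain NumPy, not survey_kit's bayes_bootstrap (whose exact construction may differ); it only illustrates where replicate-weight columns like weight_0 ... weight_10 could come from.

import numpy as np

def sketch_bayes_bootstrap(n_rows: int, n_draws: int, seed: int, prefix: str = "weight_") -> pl.DataFrame:
    # Each draw is a Dirichlet(1, ..., 1) vector over the observations,
    # rescaled to sum to n_rows so the weights have mean 1.
    rng = np.random.default_rng(seed)
    weights = rng.dirichlet(np.ones(n_rows), size=n_draws) * n_rows
    return pl.DataFrame({f"{prefix}{i}": weights[i] for i in range(n_draws)})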
In [3]:
n_rows = 1_000
n_replicates = 10

logger.info("Create two datasets to compare, with replicate weights (or bootstrap weights)")
df = gen_random_table(n_rows, n_replicates, seed=1230)
df_compare = gen_random_table(n_rows, n_replicates, seed=3254)

logger.info("The data")
_ = summary(df)
Create two datasets to compare, with replicate weights (or bootstrap weights)
The data
┌───────────┬───────┬─────────────┬──────────────┬──────────────┬─────────────┬───────────────┐
│  Variable ┆     n ┆ n (missing) ┆         mean ┆          std ┆         min ┆           max │
╞═══════════╪═══════╪═════════════╪══════════════╪══════════════╪═════════════╪═══════════════╡
│     index ┆ 1,000 ┆           0 ┆        499.5 ┆   288.819436 ┆         0.0 ┆         999.0 │
│       v_1 ┆ 1,000 ┆           0 ┆        5.128 ┆      3.17832 ┆         0.0 ┆          10.0 │
│       v_2 ┆ 1,000 ┆           0 ┆    52.096715 ┆    29.123653 ┆      0.0944 ┆     99.769011 │
│       v_3 ┆ 1,000 ┆           0 ┆    50.852027 ┆    28.367909 ┆    0.060494 ┆     99.948326 │
│       v_4 ┆ 1,000 ┆           0 ┆    51.103943 ┆    28.838153 ┆    0.101189 ┆     99.570176 │
│         e ┆ 1,000 ┆           0 ┆     2.416973 ┆    99.344919 ┆ -297.765179 ┆    353.500711 │
│         y ┆ 1,000 ┆           0 ┆ 4,929.520027 ┆ 2,846.548527 ┆ -388.726059 ┆ 10,027.774643 │
│  weight_0 ┆ 1,000 ┆           0 ┆     0.990942 ┆     0.959858 ┆    0.000929 ┆      5.832272 │
│  weight_1 ┆ 1,000 ┆           0 ┆     0.979466 ┆     1.031472 ┆    0.000465 ┆      8.610054 │
│  weight_2 ┆ 1,000 ┆           0 ┆     1.000521 ┆     0.982858 ┆    0.000443 ┆      6.123822 │
│  weight_3 ┆ 1,000 ┆           0 ┆      0.97896 ┆     0.993243 ┆    0.001089 ┆      8.561238 │
│  weight_4 ┆ 1,000 ┆           0 ┆     0.999087 ┆     0.954473 ┆     0.00275 ┆      7.249914 │
│  weight_5 ┆ 1,000 ┆           0 ┆     1.005718 ┆     0.977519 ┆     0.00222 ┆       6.85744 │
│  weight_6 ┆ 1,000 ┆           0 ┆     1.002752 ┆     1.014994 ┆    0.001331 ┆      7.676241 │
│  weight_7 ┆ 1,000 ┆           0 ┆     1.004118 ┆      0.96822 ┆    0.000878 ┆       7.03514 │
│  weight_8 ┆ 1,000 ┆           0 ┆     1.006493 ┆     0.943742 ┆    0.000563 ┆      5.516431 │
│  weight_9 ┆ 1,000 ┆           0 ┆     0.996047 ┆     0.997422 ┆    0.001608 ┆        6.8256 │
│ weight_10 ┆ 1,000 ┆           0 ┆     1.059505 ┆     1.071869 ┆    0.000018 ┆      7.749995 │
└───────────┴───────┴─────────────┴──────────────┴──────────────┴─────────────┴───────────────┘
In [4]:
logger.info("To run an arbitrary stat, you need")
logger.info("   any function that takes a dataframe and a weight and returns a dataframe")
logger.info("The function will be run for each boostrap/replicate weight")
def run_regression(df: pl.LazyFrame | pl.DataFrame,
                   y: str,
                   X: list[str],
                   weight: str) -> pl.DataFrame:
    # Materialize the (possibly lazy) frame and fit a weighted OLS regression
    df = df.lazy().collect()
    model = LinearRegression()
    model.fit(
        X=df.select(X),
        y=df.select(y),
        sample_weight=df[weight],
    )

    # One row per coefficient (plus the intercept), indexed by "Variable"
    df_betas = pl.DataFrame(
        dict(
            Variable=df.select(X).columns + ["_Intercept_"],
            Beta=[float(coefi) for coefi in model.coef_[0]] + [float(model.intercept_[0])],
        )
    )

    return df_betas
To run an arbitrary stat, you need
   any function that takes a dataframe and a weight column and returns a dataframe of estimates
The function will be run once for each bootstrap/replicate weight
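Any statistic with that contract works, not just a regression. As a second, smaller illustration (hypothetical, not part of the example above), a weighted-mean function with the same dataframe-plus-weight signature could look like this:

def run_weighted_means(df: pl.LazyFrame | pl.DataFrame,
                       columns: list[str],
                       weight: str) -> pl.DataFrame:
    # Same contract as run_regression: take a dataframe and a weight column,
    # return a dataframe of estimates indexed by "Variable".
    df = df.lazy().collect()
    w = pl.col(weight)
    return pl.DataFrame(
        dict(
            Variable=columns,
            Mean=[df.select((pl.col(c) * w).sum() / w.sum()).item() for c in columns],
        )
    )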
In [5]:
logger.info("The betas from the regression")
df_betas = run_regression(df=df, y="y", X=columns_from_list(df, ["v_*"]), weight="weight_0")
logger.info(df_betas)



logger.info("Call StatCalculator.from_function")
logger.info("   Pass in the data")
logger.info("   'estimate_ids': the name of the column(s) that index the estimates")
logger.info("   'arguments':    stable (non df and non-weight) arguments")
logger.info("   'replicates':   weights to loop over")
sc = StatCalculator.from_function(
    run_regression,
    df=df,
    estimate_ids=["Variable"],
    arguments=dict(
        y="y",
        X=columns_from_list(df,["v_*"])
    ),
    replicates=Replicates(
        weight_stub="weight_",
        n_replicates=n_replicates,
        bootstrap=True
    ),
    display=False
)


logger.info("The result")
sc.print()
The betas from the regression
shape: (5, 2)
┌─────────────┬────────────┐
│ Variable    ┆ Beta       │
│ ---         ┆ ---        │
│ str         ┆ f64        │
╞═════════════╪════════════╡
│ v_1         ┆ 10.673025  │
│ v_2         ┆ -5.061185  │
│ v_3         ┆ 100.09039  │
│ v_4         ┆ 1.081332   │
│ _Intercept_ ┆ -14.280409 │
└─────────────┴────────────┘
Call StatCalculator.from_function
   Pass in the data
   'estimate_ids': the name of the column(s) that index the estimates
   'arguments':    the fixed arguments (everything except the dataframe and the weight)
   'replicates':   the replicate weights to loop over
Running run_regression
0
.
.
.
.
5
.
.
.
.
10
The result
┌─────────────┬────────────┐
│    Variable ┆       Beta │
╞═════════════╪════════════╡
│         v_1 ┆  10.673025 │
│             ┆   1.639466 │
│         v_2 ┆  -5.061185 │
│             ┆   0.193846 │
│         v_3 ┆  100.09039 │
│             ┆   0.263585 │
│         v_4 ┆   1.081332 │
│             ┆   0.186253 │
│ _Intercept_ ┆ -14.280409 │
│             ┆  35.065297 │
└─────────────┴────────────┘
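The second number printed under each beta is its uncertainty across the replicate weights. Roughly (and only roughly: StatCalculator may centre on the full-sample estimate or use a different convention), a bootstrap standard error can be sketched as the spread of the replicate estimates:

import numpy as np

def sketch_bootstrap_se(replicate_estimates: list[float]) -> float:
    # Spread of one coefficient's estimates across the replicate fits
    # (weight_1 ... weight_10), used as its bootstrap standard error.
    theta = np.asarray(replicate_estimates)
    return float(theta.std(ddof=1))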

In [6]:
logger.info("You can compare it like any other stat, run the same regression on another sample")
logger.info("   Run the regression on a different random draw")
sc_compare = StatCalculator.from_function(
    run_regression,
    df=df_compare,
    estimate_ids=["Variable"],
    arguments=dict(
        y="y",
        X=columns_from_list(df,["v_*"])
    ),
    replicates=Replicates(
        weight_stub="weight_",
        n_replicates=n_replicates,
        bootstrap=True
    ),
    display=False
)
You can compare it like any other stat: run the same regression on another sample
   Run the regression on a different random draw
Running run_regression
0
.
.
.
.
5
.
.
.
.
10

In [7]:
logger.info("   And compare the results")
d_comp = sc.compare(sc_compare)
   And compare the results
Comparing estimates using bootstrap weights
  Difference
┌─────────────┬────────────┐
│    Variable ┆       Beta │
╞═════════════╪════════════╡
│         v_1 ┆   0.834108 │
│             ┆   1.395571 │
│         v_2 ┆   0.123172 │
│             ┆   0.122854 │
│         v_3 ┆  -0.016832 │
│             ┆   0.167508 │
│         v_4 ┆    0.05095 │
│             ┆   0.114858 │
│ _Intercept_ ┆ -10.521408 │
│             ┆  19.993532 │
└─────────────┴────────────┘

  Ratio
┌─────────────┬───────────┐
│    Variable ┆      Beta │
╞═════════════╪═══════════╡
│         v_1 ┆  0.078151 │
│             ┆  0.133234 │
│         v_2 ┆ -0.024337 │
│             ┆  0.024386 │
│         v_3 ┆ -0.000168 │
│             ┆  0.001675 │
│         v_4 ┆  0.047118 │
│             ┆  0.111966 │
│ _Intercept_ ┆  0.736772 │
│             ┆  9.945254 │
└─────────────┴───────────┘
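Under the hood, the comparison amounts to differencing (and taking ratios of) the two sets of estimates, with uncertainty taken from the replicate-by-replicate differences. A rough sketch, assuming the ratio is the difference relative to the first estimate (which matches the printed numbers above) and leaving the exact variance convention to StatCalculator:

import numpy as np

def sketch_compare(a: np.ndarray, b: np.ndarray) -> dict[str, float]:
    # a, b: one coefficient's estimates from the two samples;
    # element 0 uses the main weight, elements 1..n_replicates the replicate weights.
    diff = a - b
    ratio = (a - b) / a
    return dict(
        difference=float(diff[0]),
        difference_se=float(diff[1:].std(ddof=1)),
        ratio=float(ratio[0]),
        ratio_se=float(ratio[1:].std(ddof=1)),
    )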