In [1]:
from __future__ import annotations

import polars as pl
from survey_kit.utilities.random import RandomData
from survey_kit.utilities.dataframe import summary
from survey_kit.statistics.calculator import StatCalculator,ComparisonItem
from survey_kit.statistics.statistics import Statistics
from survey_kit.statistics.replicates import Replicates
from survey_kit.statistics.bootstrap import bayes_bootstrap
from survey_kit.utilities.random import set_seed, generate_seed

from survey_kit import logger, config
In [2]:
# Draw some random data
def gen_random_table(n_rows: int, n_replicates: int, seed: int):
    set_seed(seed)
    df = (
        RandomData(n_rows=n_rows, seed=generate_seed())
        .index("index")
        .integer("v_1", 0, 10)
        .integer("year", 2016, 2021)
        .integer("month", 1, 12)
        .integer("income", 0, 100_000)
        .integer("income_2", 0, 120_000)
    ).to_df()


    #   Attach bootstrap weights to the data
    df = (
            pl.concat(
                [
                    df,
                    bayes_bootstrap(
                        n_rows=n_rows,
                        n_draws=n_replicates+1,
                        seed=generate_seed(),
                        initial_weight_index=0,
                        prefix="weight_"
                    )
                ],
                how="horizontal"
            )
            .lazy()
    )

    df = df.with_columns(
        pl.when(pl.col("year").ne(2016)).then(pl.col("income")).otherwise(pl.lit(0))
    )

    return df

n_rows = 1_000
n_replicates = 10

logger.info("Create two datasets to compare, with bootstrap weights (or repliate weights)")
df = gen_random_table(n_rows, n_replicates, seed=1230)
df_compare = gen_random_table(n_rows, n_replicates, seed=9324)
Create two datasets to compare, with bootstrap weights (or repliate weights)
In [3]:
logger.info("What's the data look like")
logger.info(df.head(10).collect())
logger.info(df_compare.head(10).collect())
What's the data look like
shape: (10, 17)
┌───────┬─────┬──────┬───────┬───┬──────────┬──────────┬──────────┬───────────┐
│ index ┆ v_1 ┆ year ┆ month ┆ … ┆ weight_7 ┆ weight_8 ┆ weight_9 ┆ weight_10 │
│ ---   ┆ --- ┆ ---  ┆ ---   ┆   ┆ ---      ┆ ---      ┆ ---      ┆ ---       │
│ i16   ┆ i8  ┆ i16  ┆ i8    ┆   ┆ f64      ┆ f64      ┆ f64      ┆ f64       │
╞═══════╪═════╪══════╪═══════╪═══╪══════════╪══════════╪══════════╪═══════════╡
│ 0     ┆ 6   ┆ 2020 ┆ 1     ┆ … ┆ 0.570284 ┆ 0.119116 ┆ 0.55898  ┆ 1.004299  │
│ 1     ┆ 3   ┆ 2020 ┆ 10    ┆ … ┆ 0.643672 ┆ 1.702258 ┆ 1.289928 ┆ 0.611723  │
│ 2     ┆ 10  ┆ 2018 ┆ 2     ┆ … ┆ 0.162629 ┆ 0.201664 ┆ 0.397503 ┆ 2.026226  │
│ 3     ┆ 8   ┆ 2019 ┆ 6     ┆ … ┆ 1.359847 ┆ 0.57938  ┆ 0.396003 ┆ 0.585894  │
│ 4     ┆ 8   ┆ 2020 ┆ 4     ┆ … ┆ 0.824662 ┆ 3.389581 ┆ 0.033046 ┆ 0.329273  │
│ 5     ┆ 1   ┆ 2021 ┆ 12    ┆ … ┆ 0.894158 ┆ 0.631069 ┆ 1.798796 ┆ 0.189096  │
│ 6     ┆ 1   ┆ 2020 ┆ 9     ┆ … ┆ 0.734065 ┆ 0.451423 ┆ 0.723049 ┆ 3.800384  │
│ 7     ┆ 3   ┆ 2016 ┆ 4     ┆ … ┆ 0.224369 ┆ 1.102906 ┆ 3.079108 ┆ 2.1932    │
│ 8     ┆ 8   ┆ 2019 ┆ 8     ┆ … ┆ 0.145038 ┆ 0.000563 ┆ 0.950629 ┆ 1.396732  │
│ 9     ┆ 8   ┆ 2019 ┆ 4     ┆ … ┆ 0.009974 ┆ 0.320043 ┆ 2.697634 ┆ 1.513618  │
└───────┴─────┴──────┴───────┴───┴──────────┴──────────┴──────────┴───────────┘
shape: (10, 17)
┌───────┬─────┬──────┬───────┬───┬──────────┬──────────┬──────────┬───────────┐
│ index ┆ v_1 ┆ year ┆ month ┆ … ┆ weight_7 ┆ weight_8 ┆ weight_9 ┆ weight_10 │
│ ---   ┆ --- ┆ ---  ┆ ---   ┆   ┆ ---      ┆ ---      ┆ ---      ┆ ---       │
│ i16   ┆ i8  ┆ i16  ┆ i8    ┆   ┆ f64      ┆ f64      ┆ f64      ┆ f64       │
╞═══════╪═════╪══════╪═══════╪═══╪══════════╪══════════╪══════════╪═══════════╡
│ 0     ┆ 2   ┆ 2020 ┆ 1     ┆ … ┆ 0.69209  ┆ 0.103228 ┆ 0.937928 ┆ 0.167401  │
│ 1     ┆ 1   ┆ 2021 ┆ 2     ┆ … ┆ 0.036011 ┆ 1.71169  ┆ 4.192822 ┆ 1.058857  │
│ 2     ┆ 10  ┆ 2016 ┆ 11    ┆ … ┆ 0.058946 ┆ 0.708083 ┆ 1.675276 ┆ 0.070896  │
│ 3     ┆ 3   ┆ 2017 ┆ 12    ┆ … ┆ 0.113939 ┆ 4.96422  ┆ 0.630725 ┆ 0.949141  │
│ 4     ┆ 4   ┆ 2020 ┆ 1     ┆ … ┆ 2.313458 ┆ 4.076666 ┆ 1.078212 ┆ 0.227196  │
│ 5     ┆ 3   ┆ 2016 ┆ 8     ┆ … ┆ 0.177681 ┆ 0.064829 ┆ 1.844922 ┆ 0.182302  │
│ 6     ┆ 1   ┆ 2021 ┆ 1     ┆ … ┆ 0.095817 ┆ 2.067933 ┆ 0.057999 ┆ 0.283257  │
│ 7     ┆ 8   ┆ 2016 ┆ 5     ┆ … ┆ 0.211089 ┆ 0.997663 ┆ 0.985377 ┆ 1.674629  │
│ 8     ┆ 8   ┆ 2016 ┆ 9     ┆ … ┆ 0.023346 ┆ 1.191023 ┆ 1.347318 ┆ 1.71358   │
│ 9     ┆ 2   ┆ 2018 ┆ 11    ┆ … ┆ 0.005274 ┆ 0.359494 ┆ 2.237734 ┆ 1.160059  │
└───────┴─────┴──────┴───────┴───┴──────────┴──────────┴──────────┴───────────┘
In [4]:
logger.info("Use the stat calculator class to get stats")
logger.info("   The basic option, just get some stats")
stats = statistics=Statistics(
    stats=["mean", "median|not0"], 
    columns=["v_1", "income", "income_2"]
)

logger.info("What is available:")
logger.info(Statistics.available_stats())
Use the stat calculator class to get stats
   The basic option, just get some stats
What is available:

Some examples: ['mean', 'sum', 'median', 'q10', 'q97.5', 'std', 'var', 'max', 'min', 'weight', 'n', 'gini']

Stats can also have 'modifiers' appended to them separated by a pipe ('|'), including
['not0', 'missing', 'notmissing', 'is0', 'share']

For quantiles, pass q{number} where number in (0,100)

n is the unweighted count and weight is the weighted count
     for n/weight: ['not0', 'missing', 'notmissing', 'is0', 'share']
     for all other stats: ['not0']

Some examples: ['mean|not0', 'sum|not0', 'median', 'min|not0', 'count|missing', 'n|notmissing', 'n|share']
None
In [5]:
logger.info("Estimate them")
sc = StatCalculator(
    df,
    statistics=stats,
    weight="weight_0",
)
Estimate them
┌──────────┬───────────────┬────────────────┐
│ Variable ┆          mean ┆ median (not 0) │
╞══════════╪═══════════════╪════════════════╡
│      v_1 ┆      5.149512 ┆              6 │
│   income ┆ 44,315.163945 ┆         49,134 │
│ income_2 ┆ 60,360.890519 ┆         60,477 │
└──────────┴───────────────┴────────────────┘
In [6]:
logger.info("   Want standard errors?")
logger.info("   Set up 'Replicates'")
logger.info("       from data (with df)")
logger.info("       NOTE:   if pass 'bootstrap=True', it will calculate ")
logger.info("               regular bootstrap SEs rather than balanced repeated replication SEs")
logger.info("               (if you don't know what that means, pass bootstrap=True)")
replicates_from_df = Replicates(
    df=df,
    weight_stub="weight_",
    bootstrap=True
)

logger.info("       or just tell it the number (n_replicates)")
replicates = Replicates(
    weight_stub="weight_",
    n_replicates=n_replicates,
    bootstrap=True
)
logger.info("       Same either way")
logger.info(replicates_from_df.rep_list)
logger.info(replicates.rep_list)
del replicates_from_df
   Want standard errors?
   Set up 'Replicates'
       from data (with df)
       NOTE:   if pass 'bootstrap=True', it will calculate 
               regular bootstrap SEs rather than balanced repeated replication SEs
               (if you don't know what that means, pass bootstrap=True)
       or just tell it the number (n_replicates)
       Same either way
['weight_0', 'weight_1', 'weight_2', 'weight_3', 'weight_4', 'weight_5', 'weight_6', 'weight_7', 'weight_8', 'weight_9', 'weight_10']
['weight_0', 'weight_1', 'weight_2', 'weight_3', 'weight_4', 'weight_5', 'weight_6', 'weight_7', 'weight_8', 'weight_9', 'weight_10']
In [7]:
logger.info("\n\nNow let's calculate the stats with replicate-weighted standard errors")
logger.info("   for df")
logger.info("   (where each estimate has the SE below the estimate)")
sc = StatCalculator(
    df,
    statistics=stats,
    weight="weight_0",
    replicates=replicates
)

logger.info("   for df_compare")
sc_compare = StatCalculator(
    df_compare,
    statistics=stats,
    weight="weight_0",
    replicates=replicates
)


logger.info("Save them for later")
sc.save(f"{config.path_temp_files}/sc")
sc_compare.save(f"{config.path_temp_files}/sc_compare")

del sc
del sc_compare

Now let's calculate the stats with replicate-weighted standard errors
   for df
   (where each estimate has the SE below the estimate)
0
.
.
.
.
5
.
.
.
.
10
┌──────────┬───────────────┬────────────────┐
│ Variable ┆          mean ┆ median (not 0) │
╞══════════╪═══════════════╪════════════════╡
│      v_1 ┆      5.149512 ┆            6.0 │
│          ┆      0.061678 ┆            0.4 │
│   income ┆ 44,315.163945 ┆       49,134.0 │
│          ┆  1,059.552194 ┆   2,135.895325 │
│ income_2 ┆ 60,360.890519 ┆       60,477.0 │
│          ┆    776.314542 ┆   1,745.663625 │
└──────────┴───────────────┴────────────────┘
   for df_compare
0
.
.
.
.
5
.
.
.
.
10
┌──────────┬───────────────┬────────────────┐
│ Variable ┆          mean ┆ median (not 0) │
╞══════════╪═══════════════╪════════════════╡
│      v_1 ┆      4.788895 ┆            5.0 │
│          ┆      0.090114 ┆            0.4 │
│   income ┆ 42,382.317047 ┆       49,699.0 │
│          ┆   1,165.78045 ┆   2,146.750251 │
│ income_2 ┆ 61,894.696943 ┆       64,877.0 │
│          ┆  1,004.498361 ┆   2,008.185161 │
└──────────┴───────────────┴────────────────┘
Save them for later
Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/sc.stats_calc
Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/sc_compare.stats_calc

In [8]:
logger.info("Load them back up")
sc = StatCalculator.load(f"{config.path_temp_files}/sc")
sc_compare = StatCalculator.load(f"{config.path_temp_files}/sc_compare")
Load them back up
In [9]:
logger.info("\n\nWe can compare them easily")
d_comparisons = sc.compare(sc_compare)

logger.info("Which gives us a dictionary of comparisons, with keys=['difference','ratio']")
logger.info(d_comparisons['difference'])
logger.info(d_comparisons['ratio'])

We can compare them easily
Comparing estimates using bootstrap weights
  Difference
┌──────────┬───────────────┬────────────────┐
│ Variable ┆          mean ┆ median (not 0) │
╞══════════╪═══════════════╪════════════════╡
│      v_1 ┆     -0.360617 ┆           -1.0 │
│          ┆      0.097138 ┆       0.489898 │
│   income ┆ -1,932.846898 ┆          565.0 │
│          ┆  1,300.122127 ┆   2,985.945706 │
│ income_2 ┆  1,533.806424 ┆        4,400.0 │
│          ┆  1,527.354628 ┆   3,275.316047 │
└──────────┴───────────────┴────────────────┘

  Ratio
┌──────────┬───────────┬────────────────┐
│ Variable ┆      mean ┆ median (not 0) │
╞══════════╪═══════════╪════════════════╡
│      v_1 ┆ -0.070029 ┆      -0.166667 │
│          ┆  0.018631 ┆        0.08165 │
│   income ┆ -0.043616 ┆       0.011499 │
│          ┆  0.030181 ┆       0.056944 │
│ income_2 ┆  0.025411 ┆       0.072755 │
│          ┆  0.025124 ┆       0.053336 │
└──────────┴───────────┴────────────────┘

Which gives us a dictionary of comparisons, with keys=['difference','ratio']
<survey_kit.statistics.calculator.StatCalculator object at 0x0000017865FC28D0>
<survey_kit.statistics.calculator.StatCalculator object at 0x0000017865FE5480>
In [10]:
logger.info("\n\nLet's just get the difference between the means to medians, within the same data")
logger.info("   For df")
sc_mean_median = sc.compare(
    sc,
    ratio=False,
    compare_list_columns=[
        ComparisonItem.Column(
            "mean",
            "median (not 0)",
            name="median_mean"
        )
    ])["difference"]

logger.info("   For df_compare")
sc_compare_mean_median = sc_compare.compare(
    sc_compare,
    ratio=False,
    compare_list_columns=[
        ComparisonItem.Column(
            "mean",
            "median (not 0)",
            name="median_mean"
        )
    ])["difference"]

Let's just get the difference between the means to medians, within the same data
   For df
Comparing estimates using bootstrap weights
  Difference
┌──────────┬──────────────┐
│ Variable ┆  median_mean │
╞══════════╪══════════════╡
│      v_1 ┆     0.850488 │
│          ┆     0.362602 │
│   income ┆ 4,818.836055 │
│          ┆ 2,440.054996 │
│ income_2 ┆   116.109481 │
│          ┆ 1,138.679381 │
└──────────┴──────────────┘

   For df_compare
Comparing estimates using bootstrap weights
  Difference
┌──────────┬──────────────┐
│ Variable ┆  median_mean │
╞══════════╪══════════════╡
│      v_1 ┆     0.211105 │
│          ┆      0.35478 │
│   income ┆ 7,316.682953 │
│          ┆  1,329.96631 │
│ income_2 ┆ 2,982.303057 │
│          ┆ 1,409.876125 │
└──────────┴──────────────┘

In [11]:
logger.info("\n\nNow, we can use the same tool to get a difference in difference comparison")
logger.info("   i.e. [df(median|not0) - df(mean)] - [df_compare(median|not0) - df_compare(mean)]")
d_diff_in_diffs = sc_mean_median.compare(sc_compare_mean_median,ratio=False)

Now, we can use the same tool to get a difference in difference comparison
   i.e. [df(median|not0) - df(mean)] - [df_compare(median|not0) - df_compare(mean)]
Comparing estimates using bootstrap weights
  Difference
┌──────────┬──────────────┐
│ Variable ┆  median_mean │
╞══════════╪══════════════╡
│      v_1 ┆    -0.639383 │
│          ┆     0.433448 │
│   income ┆ 2,497.846898 │
│          ┆ 3,137.994513 │
│ income_2 ┆ 2,866.193576 │
│          ┆  2,125.69276 │
└──────────┴──────────────┘

In [12]:
logger.info("\n\nLet's get the ratio two variables")
logger.info("   rather than the difference between two statistics for the same variable)")

logger.info("   For df")
sc_inc_inc_2 = sc.compare(
    sc,
    difference=False,
    compare_list_variables=[
        ComparisonItem.Variable(
            value1="income",
            value2="income_2",
            name="income_comp"
        )
    ])["ratio"]

logger.info("   For df_compare")
sc_inc_inc_2_compare = sc_compare.compare(
    sc_compare,
    difference=False,
    compare_list_variables=[
        ComparisonItem.Variable(
            value1="income",
            value2="income_2",
            name="income_comp"
        )
    ])["ratio"]

Let's get the ratio two variables
   rather than the difference between two statistics for the same variable)
   For df
Comparing estimates using bootstrap weights
  Ratio
┌─────────────┬──────────┬────────────────┐
│    Variable ┆     mean ┆ median (not 0) │
╞═════════════╪══════════╪════════════════╡
│ income_comp ┆ 0.362082 ┆       0.230858 │
│             ┆ 0.027344 ┆       0.055648 │
└─────────────┴──────────┴────────────────┘

   For df_compare
Comparing estimates using bootstrap weights
  Ratio
┌─────────────┬──────────┬────────────────┐
│    Variable ┆     mean ┆ median (not 0) │
╞═════════════╪══════════╪════════════════╡
│ income_comp ┆  0.46039 ┆       0.305398 │
│             ┆ 0.045438 ┆       0.065943 │
└─────────────┴──────────┴────────────────┘

In [13]:
logger.info("\n\nAnd for fun, the diff-in-diff")
logger.info("   i.e. [df(income_2)/df(income)] - [df_compare(income_2)/df_compare(income)]")

d_diff_in_diffs_2 = sc_inc_inc_2.compare(sc_inc_inc_2_compare,ratio=False)

And for fun, the diff-in-diff
   i.e. [df(income_2)/df(income)] - [df_compare(income_2)/df_compare(income)]
Comparing estimates using bootstrap weights
  Difference
┌─────────────┬──────────┬────────────────┐
│    Variable ┆     mean ┆ median (not 0) │
╞═════════════╪══════════╪════════════════╡
│ income_comp ┆ 0.098308 ┆        0.07454 │
│             ┆ 0.047934 ┆       0.081155 │
└─────────────┴──────────┴────────────────┘