In [1]:
from __future__ import annotations
import polars as pl
from survey_kit.utilities.random import RandomData
from survey_kit.utilities.dataframe import summary
from survey_kit.statistics.calculator import StatCalculator,ComparisonItem
from survey_kit.statistics.statistics import Statistics
from survey_kit.statistics.replicates import Replicates
from survey_kit.statistics.bootstrap import bayes_bootstrap
from survey_kit.utilities.random import set_seed, generate_seed
from survey_kit import logger, config
In [2]:
# Draw some random data
def gen_random_table(n_rows: int, n_replicates: int, seed: int):
set_seed(seed)
df = (
RandomData(n_rows=n_rows, seed=generate_seed())
.index("index")
.integer("v_1", 0, 10)
.integer("year", 2016, 2021)
.integer("month", 1, 12)
.integer("income", 0, 100_000)
.integer("income_2", 0, 120_000)
).to_df()
# Attach bootstrap weights to the data
df = (
pl.concat(
[
df,
bayes_bootstrap(
n_rows=n_rows,
n_draws=n_replicates+1,
seed=generate_seed(),
initial_weight_index=0,
prefix="weight_"
)
],
how="horizontal"
)
.lazy()
)
df = df.with_columns(
pl.when(pl.col("year").ne(2016)).then(pl.col("income")).otherwise(pl.lit(0))
)
return df
n_rows = 1_000
n_replicates = 10
logger.info("Create two datasets to compare, with bootstrap weights (or repliate weights)")
df = gen_random_table(n_rows, n_replicates, seed=1230)
df_compare = gen_random_table(n_rows, n_replicates, seed=9324)
Create two datasets to compare, with bootstrap weights (or repliate weights)
In [3]:
logger.info("What's the data look like")
logger.info(df.head(10).collect())
logger.info(df_compare.head(10).collect())
What's the data look like
shape: (10, 17) ┌───────┬─────┬──────┬───────┬───┬──────────┬──────────┬──────────┬───────────┐ │ index ┆ v_1 ┆ year ┆ month ┆ … ┆ weight_7 ┆ weight_8 ┆ weight_9 ┆ weight_10 │ │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │ │ i16 ┆ i8 ┆ i16 ┆ i8 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │ ╞═══════╪═════╪══════╪═══════╪═══╪══════════╪══════════╪══════════╪═══════════╡ │ 0 ┆ 6 ┆ 2020 ┆ 1 ┆ … ┆ 0.570284 ┆ 0.119116 ┆ 0.55898 ┆ 1.004299 │ │ 1 ┆ 3 ┆ 2020 ┆ 10 ┆ … ┆ 0.643672 ┆ 1.702258 ┆ 1.289928 ┆ 0.611723 │ │ 2 ┆ 10 ┆ 2018 ┆ 2 ┆ … ┆ 0.162629 ┆ 0.201664 ┆ 0.397503 ┆ 2.026226 │ │ 3 ┆ 8 ┆ 2019 ┆ 6 ┆ … ┆ 1.359847 ┆ 0.57938 ┆ 0.396003 ┆ 0.585894 │ │ 4 ┆ 8 ┆ 2020 ┆ 4 ┆ … ┆ 0.824662 ┆ 3.389581 ┆ 0.033046 ┆ 0.329273 │ │ 5 ┆ 1 ┆ 2021 ┆ 12 ┆ … ┆ 0.894158 ┆ 0.631069 ┆ 1.798796 ┆ 0.189096 │ │ 6 ┆ 1 ┆ 2020 ┆ 9 ┆ … ┆ 0.734065 ┆ 0.451423 ┆ 0.723049 ┆ 3.800384 │ │ 7 ┆ 3 ┆ 2016 ┆ 4 ┆ … ┆ 0.224369 ┆ 1.102906 ┆ 3.079108 ┆ 2.1932 │ │ 8 ┆ 8 ┆ 2019 ┆ 8 ┆ … ┆ 0.145038 ┆ 0.000563 ┆ 0.950629 ┆ 1.396732 │ │ 9 ┆ 8 ┆ 2019 ┆ 4 ┆ … ┆ 0.009974 ┆ 0.320043 ┆ 2.697634 ┆ 1.513618 │ └───────┴─────┴──────┴───────┴───┴──────────┴──────────┴──────────┴───────────┘
shape: (10, 17) ┌───────┬─────┬──────┬───────┬───┬──────────┬──────────┬──────────┬───────────┐ │ index ┆ v_1 ┆ year ┆ month ┆ … ┆ weight_7 ┆ weight_8 ┆ weight_9 ┆ weight_10 │ │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │ │ i16 ┆ i8 ┆ i16 ┆ i8 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │ ╞═══════╪═════╪══════╪═══════╪═══╪══════════╪══════════╪══════════╪═══════════╡ │ 0 ┆ 2 ┆ 2020 ┆ 1 ┆ … ┆ 0.69209 ┆ 0.103228 ┆ 0.937928 ┆ 0.167401 │ │ 1 ┆ 1 ┆ 2021 ┆ 2 ┆ … ┆ 0.036011 ┆ 1.71169 ┆ 4.192822 ┆ 1.058857 │ │ 2 ┆ 10 ┆ 2016 ┆ 11 ┆ … ┆ 0.058946 ┆ 0.708083 ┆ 1.675276 ┆ 0.070896 │ │ 3 ┆ 3 ┆ 2017 ┆ 12 ┆ … ┆ 0.113939 ┆ 4.96422 ┆ 0.630725 ┆ 0.949141 │ │ 4 ┆ 4 ┆ 2020 ┆ 1 ┆ … ┆ 2.313458 ┆ 4.076666 ┆ 1.078212 ┆ 0.227196 │ │ 5 ┆ 3 ┆ 2016 ┆ 8 ┆ … ┆ 0.177681 ┆ 0.064829 ┆ 1.844922 ┆ 0.182302 │ │ 6 ┆ 1 ┆ 2021 ┆ 1 ┆ … ┆ 0.095817 ┆ 2.067933 ┆ 0.057999 ┆ 0.283257 │ │ 7 ┆ 8 ┆ 2016 ┆ 5 ┆ … ┆ 0.211089 ┆ 0.997663 ┆ 0.985377 ┆ 1.674629 │ │ 8 ┆ 8 ┆ 2016 ┆ 9 ┆ … ┆ 0.023346 ┆ 1.191023 ┆ 1.347318 ┆ 1.71358 │ │ 9 ┆ 2 ┆ 2018 ┆ 11 ┆ … ┆ 0.005274 ┆ 0.359494 ┆ 2.237734 ┆ 1.160059 │ └───────┴─────┴──────┴───────┴───┴──────────┴──────────┴──────────┴───────────┘
In [4]:
logger.info("Use the stat calculator class to get stats")
logger.info(" The basic option, just get some stats")
stats = statistics=Statistics(
stats=["mean", "median|not0"],
columns=["v_1", "income", "income_2"]
)
logger.info("What is available:")
logger.info(Statistics.available_stats())
Use the stat calculator class to get stats
The basic option, just get some stats
What is available:
Some examples: ['mean', 'sum', 'median', 'q10', 'q97.5', 'std', 'var', 'max', 'min', 'weight', 'n', 'gini']
Stats can also have 'modifiers' appended to them separated by a pipe ('|'), including
['not0', 'missing', 'notmissing', 'is0', 'share']
For quantiles, pass q{number} where number in (0,100)
n is the unweighted count and weight is the weighted count
for n/weight: ['not0', 'missing', 'notmissing', 'is0', 'share']
for all other stats: ['not0']
Some examples: ['mean|not0', 'sum|not0', 'median', 'min|not0', 'count|missing', 'n|notmissing', 'n|share']
None
In [5]:
logger.info("Estimate them")
sc = StatCalculator(
df,
statistics=stats,
weight="weight_0",
)
Estimate them
┌──────────┬───────────────┬────────────────┐ │ Variable ┆ mean ┆ median (not 0) │ ╞══════════╪═══════════════╪════════════════╡ │ v_1 ┆ 5.149512 ┆ 6 │ │ income ┆ 44,315.163945 ┆ 49,134 │ │ income_2 ┆ 60,360.890519 ┆ 60,477 │ └──────────┴───────────────┴────────────────┘
In [6]:
logger.info(" Want standard errors?")
logger.info(" Set up 'Replicates'")
logger.info(" from data (with df)")
logger.info(" NOTE: if pass 'bootstrap=True', it will calculate ")
logger.info(" regular bootstrap SEs rather than balanced repeated replication SEs")
logger.info(" (if you don't know what that means, pass bootstrap=True)")
replicates_from_df = Replicates(
df=df,
weight_stub="weight_",
bootstrap=True
)
logger.info(" or just tell it the number (n_replicates)")
replicates = Replicates(
weight_stub="weight_",
n_replicates=n_replicates,
bootstrap=True
)
logger.info(" Same either way")
logger.info(replicates_from_df.rep_list)
logger.info(replicates.rep_list)
del replicates_from_df
Want standard errors?
Set up 'Replicates'
from data (with df)
NOTE: if pass 'bootstrap=True', it will calculate
regular bootstrap SEs rather than balanced repeated replication SEs
(if you don't know what that means, pass bootstrap=True)
or just tell it the number (n_replicates)
Same either way
['weight_0', 'weight_1', 'weight_2', 'weight_3', 'weight_4', 'weight_5', 'weight_6', 'weight_7', 'weight_8', 'weight_9', 'weight_10']
['weight_0', 'weight_1', 'weight_2', 'weight_3', 'weight_4', 'weight_5', 'weight_6', 'weight_7', 'weight_8', 'weight_9', 'weight_10']
In [7]:
logger.info("\n\nNow let's calculate the stats with replicate-weighted standard errors")
logger.info(" for df")
logger.info(" (where each estimate has the SE below the estimate)")
sc = StatCalculator(
df,
statistics=stats,
weight="weight_0",
replicates=replicates
)
logger.info(" for df_compare")
sc_compare = StatCalculator(
df_compare,
statistics=stats,
weight="weight_0",
replicates=replicates
)
logger.info("Save them for later")
sc.save(f"{config.path_temp_files}/sc")
sc_compare.save(f"{config.path_temp_files}/sc_compare")
del sc
del sc_compare
Now let's calculate the stats with replicate-weighted standard errors
for df
(where each estimate has the SE below the estimate)
0
.
.
.
.
5
.
.
.
.
10
┌──────────┬───────────────┬────────────────┐ │ Variable ┆ mean ┆ median (not 0) │ ╞══════════╪═══════════════╪════════════════╡ │ v_1 ┆ 5.149512 ┆ 6.0 │ │ ┆ 0.061678 ┆ 0.4 │ │ income ┆ 44,315.163945 ┆ 49,134.0 │ │ ┆ 1,059.552194 ┆ 2,135.895325 │ │ income_2 ┆ 60,360.890519 ┆ 60,477.0 │ │ ┆ 776.314542 ┆ 1,745.663625 │ └──────────┴───────────────┴────────────────┘
for df_compare
0
.
.
.
.
5
.
.
.
.
10
┌──────────┬───────────────┬────────────────┐ │ Variable ┆ mean ┆ median (not 0) │ ╞══════════╪═══════════════╪════════════════╡ │ v_1 ┆ 4.788895 ┆ 5.0 │ │ ┆ 0.090114 ┆ 0.4 │ │ income ┆ 42,382.317047 ┆ 49,699.0 │ │ ┆ 1,165.78045 ┆ 2,146.750251 │ │ income_2 ┆ 61,894.696943 ┆ 64,877.0 │ │ ┆ 1,004.498361 ┆ 2,008.185161 │ └──────────┴───────────────┴────────────────┘
Save them for later
Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/sc.stats_calc
Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/sc_compare.stats_calc
In [8]:
logger.info("Load them back up")
sc = StatCalculator.load(f"{config.path_temp_files}/sc")
sc_compare = StatCalculator.load(f"{config.path_temp_files}/sc_compare")
Load them back up
In [9]:
logger.info("\n\nWe can compare them easily")
d_comparisons = sc.compare(sc_compare)
logger.info("Which gives us a dictionary of comparisons, with keys=['difference','ratio']")
logger.info(d_comparisons['difference'])
logger.info(d_comparisons['ratio'])
We can compare them easily
Comparing estimates using bootstrap weights
Difference
┌──────────┬───────────────┬────────────────┐ │ Variable ┆ mean ┆ median (not 0) │ ╞══════════╪═══════════════╪════════════════╡ │ v_1 ┆ -0.360617 ┆ -1.0 │ │ ┆ 0.097138 ┆ 0.489898 │ │ income ┆ -1,932.846898 ┆ 565.0 │ │ ┆ 1,300.122127 ┆ 2,985.945706 │ │ income_2 ┆ 1,533.806424 ┆ 4,400.0 │ │ ┆ 1,527.354628 ┆ 3,275.316047 │ └──────────┴───────────────┴────────────────┘
Ratio
┌──────────┬───────────┬────────────────┐ │ Variable ┆ mean ┆ median (not 0) │ ╞══════════╪═══════════╪════════════════╡ │ v_1 ┆ -0.070029 ┆ -0.166667 │ │ ┆ 0.018631 ┆ 0.08165 │ │ income ┆ -0.043616 ┆ 0.011499 │ │ ┆ 0.030181 ┆ 0.056944 │ │ income_2 ┆ 0.025411 ┆ 0.072755 │ │ ┆ 0.025124 ┆ 0.053336 │ └──────────┴───────────┴────────────────┘
Which gives us a dictionary of comparisons, with keys=['difference','ratio']
<survey_kit.statistics.calculator.StatCalculator object at 0x0000017865FC28D0>
<survey_kit.statistics.calculator.StatCalculator object at 0x0000017865FE5480>
In [10]:
logger.info("\n\nLet's just get the difference between the means to medians, within the same data")
logger.info(" For df")
sc_mean_median = sc.compare(
sc,
ratio=False,
compare_list_columns=[
ComparisonItem.Column(
"mean",
"median (not 0)",
name="median_mean"
)
])["difference"]
logger.info(" For df_compare")
sc_compare_mean_median = sc_compare.compare(
sc_compare,
ratio=False,
compare_list_columns=[
ComparisonItem.Column(
"mean",
"median (not 0)",
name="median_mean"
)
])["difference"]
Let's just get the difference between the means to medians, within the same data
For df
Comparing estimates using bootstrap weights
Difference
┌──────────┬──────────────┐ │ Variable ┆ median_mean │ ╞══════════╪══════════════╡ │ v_1 ┆ 0.850488 │ │ ┆ 0.362602 │ │ income ┆ 4,818.836055 │ │ ┆ 2,440.054996 │ │ income_2 ┆ 116.109481 │ │ ┆ 1,138.679381 │ └──────────┴──────────────┘
For df_compare
Comparing estimates using bootstrap weights
Difference
┌──────────┬──────────────┐ │ Variable ┆ median_mean │ ╞══════════╪══════════════╡ │ v_1 ┆ 0.211105 │ │ ┆ 0.35478 │ │ income ┆ 7,316.682953 │ │ ┆ 1,329.96631 │ │ income_2 ┆ 2,982.303057 │ │ ┆ 1,409.876125 │ └──────────┴──────────────┘
In [11]:
logger.info("\n\nNow, we can use the same tool to get a difference in difference comparison")
logger.info(" i.e. [df(median|not0) - df(mean)] - [df_compare(median|not0) - df_compare(mean)]")
d_diff_in_diffs = sc_mean_median.compare(sc_compare_mean_median,ratio=False)
Now, we can use the same tool to get a difference in difference comparison
i.e. [df(median|not0) - df(mean)] - [df_compare(median|not0) - df_compare(mean)]
Comparing estimates using bootstrap weights
Difference
┌──────────┬──────────────┐ │ Variable ┆ median_mean │ ╞══════════╪══════════════╡ │ v_1 ┆ -0.639383 │ │ ┆ 0.433448 │ │ income ┆ 2,497.846898 │ │ ┆ 3,137.994513 │ │ income_2 ┆ 2,866.193576 │ │ ┆ 2,125.69276 │ └──────────┴──────────────┘
In [12]:
logger.info("\n\nLet's get the ratio two variables")
logger.info(" rather than the difference between two statistics for the same variable)")
logger.info(" For df")
sc_inc_inc_2 = sc.compare(
sc,
difference=False,
compare_list_variables=[
ComparisonItem.Variable(
value1="income",
value2="income_2",
name="income_comp"
)
])["ratio"]
logger.info(" For df_compare")
sc_inc_inc_2_compare = sc_compare.compare(
sc_compare,
difference=False,
compare_list_variables=[
ComparisonItem.Variable(
value1="income",
value2="income_2",
name="income_comp"
)
])["ratio"]
Let's get the ratio two variables
rather than the difference between two statistics for the same variable)
For df
Comparing estimates using bootstrap weights
Ratio
┌─────────────┬──────────┬────────────────┐ │ Variable ┆ mean ┆ median (not 0) │ ╞═════════════╪══════════╪════════════════╡ │ income_comp ┆ 0.362082 ┆ 0.230858 │ │ ┆ 0.027344 ┆ 0.055648 │ └─────────────┴──────────┴────────────────┘
For df_compare
Comparing estimates using bootstrap weights
Ratio
┌─────────────┬──────────┬────────────────┐ │ Variable ┆ mean ┆ median (not 0) │ ╞═════════════╪══════════╪════════════════╡ │ income_comp ┆ 0.46039 ┆ 0.305398 │ │ ┆ 0.045438 ┆ 0.065943 │ └─────────────┴──────────┴────────────────┘
In [13]:
logger.info("\n\nAnd for fun, the diff-in-diff")
logger.info(" i.e. [df(income_2)/df(income)] - [df_compare(income_2)/df_compare(income)]")
d_diff_in_diffs_2 = sc_inc_inc_2.compare(sc_inc_inc_2_compare,ratio=False)
And for fun, the diff-in-diff
i.e. [df(income_2)/df(income)] - [df_compare(income_2)/df_compare(income)]
Comparing estimates using bootstrap weights
Difference
┌─────────────┬──────────┬────────────────┐ │ Variable ┆ mean ┆ median (not 0) │ ╞═════════════╪══════════╪════════════════╡ │ income_comp ┆ 0.098308 ┆ 0.07454 │ │ ┆ 0.047934 ┆ 0.081155 │ └─────────────┴──────────┴────────────────┘