from __future__ import annotations

import polars as pl
from survey_kit.utilities.random import RandomData
from survey_kit.utilities.dataframe import summary
from survey_kit.statistics.calculator import StatCalculator, ComparisonItem
from survey_kit.statistics.statistics import Statistics
from survey_kit.statistics.replicates import Replicates
from survey_kit.statistics.bootstrap import bayes_bootstrap
from survey_kit.utilities.random import set_seed, generate_seed

from survey_kit import logger, config

# Draw some random data
def gen_random_table(n_rows: int, n_replicates: int, seed: int):
    set_seed(seed)
    df = (
        RandomData(n_rows=n_rows, seed=generate_seed())
        .index("index")
        .integer("v_1", 0, 10)
        .integer("year", 2016, 2021)
        .integer("month", 1, 12)
        .integer("income", 0, 100_000)
        .integer("income_2", 0, 120_000)
    ).to_df()

    #   Attach bootstrap weights to the data
    df = pl.concat(
        [
            df,
            bayes_bootstrap(
                n_rows=n_rows,
                n_draws=n_replicates + 1,
                seed=generate_seed(),
                initial_weight_index=0,
                prefix="weight_",
            ),
        ],
        how="horizontal",
    ).lazy()

    df = df.with_columns(
        pl.when(pl.col("year").ne(2016)).then(pl.col("income")).otherwise(pl.lit(0))
    )

    return df


n_rows = 1_000
n_replicates = 10

logger.info(
    "Create two datasets to compare, with bootstrap weights (or repliate weights)"
)
df = gen_random_table(n_rows, n_replicates, seed=1230)
df_compare = gen_random_table(n_rows, n_replicates, seed=9324)

Create two datasets to compare, with bootstrap weights (or repliate weights)

logger.info("What's the data look like")
logger.info(df.head(10).collect())
logger.info(df_compare.head(10).collect())

What's the data look like

shape: (10, 17)
┌───────┬─────┬──────┬───────┬───┬──────────┬──────────┬──────────┬───────────┐
│ index ┆ v_1 ┆ year ┆ month ┆ … ┆ weight_7 ┆ weight_8 ┆ weight_9 ┆ weight_10 │
│ ---   ┆ --- ┆ ---  ┆ ---   ┆   ┆ ---      ┆ ---      ┆ ---      ┆ ---       │
│ i16   ┆ i8  ┆ i16  ┆ i8    ┆   ┆ f64      ┆ f64      ┆ f64      ┆ f64       │
╞═══════╪═════╪══════╪═══════╪═══╪══════════╪══════════╪══════════╪═══════════╡
│ 0     ┆ 6   ┆ 2020 ┆ 1     ┆ … ┆ 0.570284 ┆ 0.119116 ┆ 0.55898  ┆ 1.004299  │
│ 1     ┆ 3   ┆ 2020 ┆ 10    ┆ … ┆ 0.643672 ┆ 1.702258 ┆ 1.289928 ┆ 0.611723  │
│ 2     ┆ 10  ┆ 2018 ┆ 2     ┆ … ┆ 0.162629 ┆ 0.201664 ┆ 0.397503 ┆ 2.026226  │
│ 3     ┆ 8   ┆ 2019 ┆ 6     ┆ … ┆ 1.359847 ┆ 0.57938  ┆ 0.396003 ┆ 0.585894  │
│ 4     ┆ 8   ┆ 2020 ┆ 4     ┆ … ┆ 0.824662 ┆ 3.389581 ┆ 0.033046 ┆ 0.329273  │
│ 5     ┆ 1   ┆ 2021 ┆ 12    ┆ … ┆ 0.894158 ┆ 0.631069 ┆ 1.798796 ┆ 0.189096  │
│ 6     ┆ 1   ┆ 2020 ┆ 9     ┆ … ┆ 0.734065 ┆ 0.451423 ┆ 0.723049 ┆ 3.800384  │
│ 7     ┆ 3   ┆ 2016 ┆ 4     ┆ … ┆ 0.224369 ┆ 1.102906 ┆ 3.079108 ┆ 2.1932    │
│ 8     ┆ 8   ┆ 2019 ┆ 8     ┆ … ┆ 0.145038 ┆ 0.000563 ┆ 0.950629 ┆ 1.396732  │
│ 9     ┆ 8   ┆ 2019 ┆ 4     ┆ … ┆ 0.009974 ┆ 0.320043 ┆ 2.697634 ┆ 1.513618  │
└───────┴─────┴──────┴───────┴───┴──────────┴──────────┴──────────┴───────────┘

shape: (10, 17)
┌───────┬─────┬──────┬───────┬───┬──────────┬──────────┬──────────┬───────────┐
│ index ┆ v_1 ┆ year ┆ month ┆ … ┆ weight_7 ┆ weight_8 ┆ weight_9 ┆ weight_10 │
│ ---   ┆ --- ┆ ---  ┆ ---   ┆   ┆ ---      ┆ ---      ┆ ---      ┆ ---       │
│ i16   ┆ i8  ┆ i16  ┆ i8    ┆   ┆ f64      ┆ f64      ┆ f64      ┆ f64       │
╞═══════╪═════╪══════╪═══════╪═══╪══════════╪══════════╪══════════╪═══════════╡
│ 0     ┆ 2   ┆ 2020 ┆ 1     ┆ … ┆ 0.69209  ┆ 0.103228 ┆ 0.937928 ┆ 0.167401  │
│ 1     ┆ 1   ┆ 2021 ┆ 2     ┆ … ┆ 0.036011 ┆ 1.71169  ┆ 4.192822 ┆ 1.058857  │
│ 2     ┆ 10  ┆ 2016 ┆ 11    ┆ … ┆ 0.058946 ┆ 0.708083 ┆ 1.675276 ┆ 0.070896  │
│ 3     ┆ 3   ┆ 2017 ┆ 12    ┆ … ┆ 0.113939 ┆ 4.96422  ┆ 0.630725 ┆ 0.949141  │
│ 4     ┆ 4   ┆ 2020 ┆ 1     ┆ … ┆ 2.313458 ┆ 4.076666 ┆ 1.078212 ┆ 0.227196  │
│ 5     ┆ 3   ┆ 2016 ┆ 8     ┆ … ┆ 0.177681 ┆ 0.064829 ┆ 1.844922 ┆ 0.182302  │
│ 6     ┆ 1   ┆ 2021 ┆ 1     ┆ … ┆ 0.095817 ┆ 2.067933 ┆ 0.057999 ┆ 0.283257  │
│ 7     ┆ 8   ┆ 2016 ┆ 5     ┆ … ┆ 0.211089 ┆ 0.997663 ┆ 0.985377 ┆ 1.674629  │
│ 8     ┆ 8   ┆ 2016 ┆ 9     ┆ … ┆ 0.023346 ┆ 1.191023 ┆ 1.347318 ┆ 1.71358   │
│ 9     ┆ 2   ┆ 2018 ┆ 11    ┆ … ┆ 0.005274 ┆ 0.359494 ┆ 2.237734 ┆ 1.160059  │
└───────┴─────┴──────┴───────┴───┴──────────┴──────────┴──────────┴───────────┘

logger.info("Use the stat calculator class to get stats")
logger.info("   The basic option, just get some stats")
stats = statistics = Statistics(
    stats=["mean", "median|not0"], columns=["v_1", "income", "income_2"]
)

logger.info("What is available:")
logger.info(Statistics.available_stats())

Use the stat calculator class to get stats

   The basic option, just get some stats

What is available:

Some examples: ['mean', 'sum', 'median', 'q10', 'q97.5', 'std', 'var', 'max', 'min', 'weight', 'n', 'gini']

Stats can also have 'modifiers' appended to them separated by a pipe ('|'), including

['not0', 'missing', 'notmissing', 'is0', 'share']

For quantiles, pass q{number} where number in (0,100)

logger.info("Estimate them")
sc = StatCalculator(
    df,
    statistics=stats,
    weight="weight_0",
)

Estimate them

┌──────────┬───────────────┬────────────────┐
│ Variable ┆          mean ┆ median (not 0) │
╞══════════╪═══════════════╪════════════════╡
│      v_1 ┆      5.149512 ┆              6 │
│   income ┆ 44,315.163945 ┆         49,134 │
│ income_2 ┆ 60,360.890519 ┆         60,477 │
└──────────┴───────────────┴────────────────┘

logger.info("   Want standard errors?")
logger.info("   Set up 'Replicates'")
logger.info("       from data (with df)")
logger.info("       NOTE:   if pass 'bootstrap=True', it will calculate ")
logger.info(
    "               regular bootstrap SEs rather than balanced repeated replication SEs"
)
logger.info("               (if you don't know what that means, pass bootstrap=True)")
replicates_from_df = Replicates(df=df, weight_stub="weight_", bootstrap=True)

logger.info("       or just tell it the number (n_replicates)")
replicates = Replicates(
    weight_stub="weight_", n_replicates=n_replicates, bootstrap=True
)
logger.info("       Same either way")
logger.info(replicates_from_df.rep_list)
logger.info(replicates.rep_list)
del replicates_from_df

   Want standard errors?

   Set up 'Replicates'

       from data (with df)

       NOTE:   if pass 'bootstrap=True', it will calculate

               regular bootstrap SEs rather than balanced repeated replication SEs

               (if you don't know what that means, pass bootstrap=True)

       or just tell it the number (n_replicates)

       Same either way

['weight_0', 'weight_1', 'weight_2', 'weight_3', 'weight_4', 'weight_5', 'weight_6', 'weight_7', 'weight_8', 'weight_9', 'weight_10']

['weight_0', 'weight_1', 'weight_2', 'weight_3', 'weight_4', 'weight_5', 'weight_6', 'weight_7', 'weight_8', 'weight_9', 'weight_10']

logger.info("\n\nNow let's calculate the stats with replicate-weighted standard errors")
logger.info("   for df")
logger.info("   (where each estimate has the SE below the estimate)")
sc = StatCalculator(df, statistics=stats, weight="weight_0", replicates=replicates)

logger.info("   for df_compare")
sc_compare = StatCalculator(
    df_compare, statistics=stats, weight="weight_0", replicates=replicates
)


logger.info("Save them for later")
sc.save(f"{config.path_temp_files}/sc")
sc_compare.save(f"{config.path_temp_files}/sc_compare")

del sc
del sc_compare


Now let's calculate the stats with replicate-weighted standard errors

   for df

   (where each estimate has the SE below the estimate)

0

.

.

.

.

5

.

logger.info("Load them back up")
sc = StatCalculator.load(f"{config.path_temp_files}/sc")
sc_compare = StatCalculator.load(f"{config.path_temp_files}/sc_compare")

Load them back up

logger.info("\n\nWe can compare them easily")
d_comparisons = sc.compare(sc_compare)

logger.info(
    "Which gives us a dictionary of comparisons, with keys=['difference','ratio']"
)
logger.info(d_comparisons["difference"])
logger.info(d_comparisons["ratio"])


We can compare them easily

Comparing estimates using bootstrap weights

  Difference

┌──────────┬───────────────┬────────────────┐
│ Variable ┆          mean ┆ median (not 0) │
╞══════════╪═══════════════╪════════════════╡
│      v_1 ┆     -0.360617 ┆           -1.0 │
│          ┆      0.097138 ┆       0.489898 │
│   income ┆ -1,932.846898 ┆          565.0 │
│          ┆  1,300.122127 ┆   2,985.945706 │
│ income_2 ┆  1,533.806424 ┆        4,400.0 │
│          ┆  1,527.354628 ┆   3,275.316047 │
└──────────┴───────────────┴────────────────┘

  Ratio

┌──────────┬───────────┬────────────────┐
│ Variable ┆      mean ┆ median (not 0) │
╞══════════╪═══════════╪════════════════╡
│      v_1 ┆ -0.070029 ┆      -0.166667 │
│          ┆  0.018631 ┆        0.08165 │
│   income ┆ -0.043616 ┆       0.011499 │
│          ┆  0.030181 ┆       0.056944 │
│ income_2 ┆  0.025411 ┆       0.072755 │
│          ┆  0.025124 ┆       0.053336 │
└──────────┴───────────┴────────────────┘

Which gives us a dictionary of comparisons, with keys=['difference','ratio']

<survey_kit.statistics.calculator.StatCalculator object at 0x00000274B1112F90>

logger.info(
    "\n\nLet's just get the difference between the means to medians, within the same data"
)
logger.info("   For df")
sc_mean_median = sc.compare(
    sc,
    ratio=False,
    compare_list_columns=[
        ComparisonItem.Column("mean", "median (not 0)", name="median_mean")
    ],
)["difference"]

logger.info("   For df_compare")
sc_compare_mean_median = sc_compare.compare(
    sc_compare,
    ratio=False,
    compare_list_columns=[
        ComparisonItem.Column("mean", "median (not 0)", name="median_mean")
    ],
)["difference"]


Let's just get the difference between the means to medians, within the same data

   For df

Comparing estimates using bootstrap weights

  Difference

┌──────────┬──────────────┐
│ Variable ┆  median_mean │
╞══════════╪══════════════╡
│      v_1 ┆     0.850488 │
│          ┆     0.362602 │
│   income ┆ 4,818.836055 │
│          ┆ 2,440.054996 │
│ income_2 ┆   116.109481 │
│          ┆ 1,138.679381 │
└──────────┴──────────────┘

   For df_compare

Comparing estimates using bootstrap weights

  Difference

┌──────────┬──────────────┐
│ Variable ┆  median_mean │
╞══════════╪══════════════╡
│      v_1 ┆     0.211105 │
│          ┆      0.35478 │
│   income ┆ 7,316.682953 │
│          ┆  1,329.96631 │
│ income_2 ┆ 2,982.303057 │
│          ┆ 1,409.876125 │
└──────────┴──────────────┘

logger.info(
    "\n\nNow, we can use the same tool to get a difference in difference comparison"
)
logger.info(
    "   i.e. [df(median|not0) - df(mean)] - [df_compare(median|not0) - df_compare(mean)]"
)
d_diff_in_diffs = sc_mean_median.compare(sc_compare_mean_median, ratio=False)


Now, we can use the same tool to get a difference in difference comparison

   i.e. [df(median|not0) - df(mean)] - [df_compare(median|not0) - df_compare(mean)]

Comparing estimates using bootstrap weights

  Difference

┌──────────┬──────────────┐
│ Variable ┆  median_mean │
╞══════════╪══════════════╡
│      v_1 ┆    -0.639383 │
│          ┆     0.433448 │
│   income ┆ 2,497.846898 │
│          ┆ 3,137.994513 │
│ income_2 ┆ 2,866.193576 │
│          ┆  2,125.69276 │
└──────────┴──────────────┘

logger.info("\n\nLet's get the ratio two variables")
logger.info(
    "   rather than the difference between two statistics for the same variable)"
)

logger.info("   For df")
sc_inc_inc_2 = sc.compare(
    sc,
    difference=False,
    compare_list_variables=[
        ComparisonItem.Variable(value1="income", value2="income_2", name="income_comp")
    ],
)["ratio"]

logger.info("   For df_compare")
sc_inc_inc_2_compare = sc_compare.compare(
    sc_compare,
    difference=False,
    compare_list_variables=[
        ComparisonItem.Variable(value1="income", value2="income_2", name="income_comp")
    ],
)["ratio"]


Let's get the ratio two variables

   rather than the difference between two statistics for the same variable)

   For df

Comparing estimates using bootstrap weights

  Ratio

┌─────────────┬──────────┬────────────────┐
│    Variable ┆     mean ┆ median (not 0) │
╞═════════════╪══════════╪════════════════╡
│ income_comp ┆ 0.362082 ┆       0.230858 │
│             ┆ 0.027344 ┆       0.055648 │
└─────────────┴──────────┴────────────────┘

   For df_compare

Comparing estimates using bootstrap weights

  Ratio

logger.info("\n\nAnd for fun, the diff-in-diff")
logger.info(
    "   i.e. [df(income_2)/df(income)] - [df_compare(income_2)/df_compare(income)]"
)

d_diff_in_diffs_2 = sc_inc_inc_2.compare(sc_inc_inc_2_compare, ratio=False)


And for fun, the diff-in-diff

   i.e. [df(income_2)/df(income)] - [df_compare(income_2)/df_compare(income)]

Comparing estimates using bootstrap weights

  Difference

┌─────────────┬──────────┬────────────────┐
│    Variable ┆     mean ┆ median (not 0) │
╞═════════════╪══════════╪════════════════╡
│ income_comp ┆ 0.098308 ┆        0.07454 │
│             ┆ 0.047934 ┆       0.081155 │
└─────────────┴──────────┴────────────────┘