from pathlib import Path
from survey_kit.utilities.random import RandomData
from survey_kit.utilities.formula_builder import FormulaBuilder
from survey_kit.calibration.moment import Moment
from survey_kit.calibration.calibration import Calibration
from survey_kit.utilities.dataframe import summary
import narwhals as nw
from survey_kit import logger

logger.info("Generating data for weighting")
n_rows = 100_000


def _simulate_frame(seed, loc, scale):
    """Build one lazy frame of simulated data.

    The frame has ``n_rows`` rows and the columns ``index``, ``v_1``,
    ``v_f_continuous_0`` .. ``v_f_continuous_2``, ``v_extra``, ``weight_0``,
    ``weight_1``, ``year`` and ``month``, added in that order.

    Args:
        seed: Seed handed to ``RandomData``.
        loc: ``loc`` of the "normal" draws for the ``v_f_continuous_*`` columns.
        scale: ``scale`` of the "normal" draws for the ``v_f_continuous_*`` columns.

    Returns:
        The result of ``RandomData(...).to_df().lazy()``.
    """
    builder = RandomData(n_rows=n_rows, seed=seed)
    builder = builder.index("index")
    builder = builder.integer("v_1", 1, 10)

    #   Three continuous variables sharing the same loc/scale
    for i in range(3):
        builder = builder.np_distribution(
            f"v_f_continuous_{i}", "normal", loc=loc, scale=scale
        )

    builder = builder.float("v_extra", -1, 2)

    #   Both weight columns use the same "normal" settings in every frame
    for weight_name in ("weight_0", "weight_1"):
        builder = builder.np_distribution(weight_name, "normal", loc=10, scale=1)

    builder = builder.integer("year", 2016, 2021)
    builder = builder.integer("month", 1, 12)
    return builder.to_df().lazy()


df_population = _simulate_frame(seed=12332151, loc=10, scale=2)

#   Intentionally set the loc/scale as different than the population above
df_treatment = _simulate_frame(seed=894654, loc=11, scale=4)

# print(df.describe())

Generating data for weighting

logger.info("Weighting 'function'")
#   Columns entering the formula as continuous terms (entries ending in "*"
#   look like wildcard patterns)
#   NOTE(review): "v_f_p2_*" matches none of the columns generated in this
#   script - confirm whether it is intentional
continuous_columns = ["v_1", "v_f_continuous_*", "v_f_p2_*"]
f = FormulaBuilder(df=df_population, constant=False)
f.continuous(columns=continuous_columns)
#   Optional extra term, left disabled:
#   f.simple_interaction(columns=["v_1","v_f_continuous_0"])

logger.info("Define the target moments that the weighting will match")
logger.info("   This can be a dataset or a single row of pop controls")
#   Targets come from the population frame, weighted by "weight_0"
#   (a by=["year"] argument is available but left disabled here)
target_formula = f.formula
m = Moment(
    df=df_population, formula=target_formula, weight="weight_0", index="index", rescale=True
)

logger.info("You can save/reload moments if you want")
#   Example (paths are placeholders):
# m.save("/my/path/moment")
# m_loaded = Moment.load("/my/path/moment")

Weighting 'function'

Define the target moments that the weighting will match

   This can be a dataset or a single row of pop controls

You can save/reload moments if you want

#   Calibrate the data in df_treatment to the moment above.
#   weight= is the existing weight column, final_weight= names the new one
c = Calibration(
    df=df_treatment,
    moments=m,
    weight="weight_1",
    final_weight="weight_final",
)

#   Bounds apply to the ratio, where final weights = (base*ratio).
#   If it fails to converge, the bounds give the "best possible" weights
ratio_bounds = (0.001, 1000)

#   min_obs: drop a moment if there are too few observations
c.run(min_obs=5, bounds=ratio_bounds)

#   Merge the final weights back on the treatment data
df_treatment = c.get_final_weights(df_treatment)

Aggregating any sub_moments

Calibrating weights using aebw

      min obs = 5

     Calibration using combined moments

Entropy Balance Reweighting, Sanders (2024)

Input matrix is sparse? False

Problem Size: 100000 rows, 5 moments

  #    Criterion     ||Eq. Const.||  ||FOC Lagr.|| PrimalStepSize  DualStepSize  Opt. Violation.

  0     0.000000      17465.610380       0.0000          inf            inf      17465.61037995

  1   1945.423027     7092.371751       46.6445     123.27371263    2.06592760   7092.52513345

#   Summarize the population frame using "weight_0", the same weight the
#   target moments were built with. The return value is discarded; the table
#   in the captured output below appears to be emitted by the call itself.
logger.info("'Population' estimates")
_ = summary(df_population, weight="weight_0")

'Population' estimates

┌──────────────────┬─────────┬─────────────┬───────────────┬───────────────┬───────────┬───────────┐
│         Variable ┆       n ┆ n (missing) ┆          mean ┆           std ┆       min ┆       max │
╞══════════════════╪═════════╪═════════════╪═══════════════╪═══════════════╪═══════════╪═══════════╡
│            index ┆ 100,000 ┆           0 ┆ 49,987.162827 ┆ 28,870.556017 ┆       0.0 ┆  99,999.0 │
│              v_1 ┆ 100,000 ┆           0 ┆      5.484124 ┆      2.875588 ┆       1.0 ┆      10.0 │
│ v_f_continuous_0 ┆ 100,000 ┆           0 ┆      9.989362 ┆      1.996633 ┆  1.491748 ┆ 18.835062 │
│ v_f_continuous_1 ┆ 100,000 ┆           0 ┆     10.002072 ┆      2.006889 ┆   1.37638 ┆  18.82769 │
│ v_f_continuous_2 ┆ 100,000 ┆           0 ┆      9.998039 ┆      2.004505 ┆  1.166252 ┆ 19.254231 │
│          v_extra ┆ 100,000 ┆           0 ┆      0.504297 ┆      0.867154 ┆ -0.999978 ┆  1.999996 │
│         weight_1 ┆ 100,000 ┆           0 ┆     10.004412 ┆      1.002786 ┆   5.35507 ┆ 14.023032 │
│             year ┆ 100,000 ┆           0 ┆  2,018.493399 ┆      1.706467 ┆   2,016.0 ┆   2,021.0 │
│            month ┆ 100,000 ┆           0 ┆      6.504686 ┆       3.44999 ┆       1.0 ┆      12.0 │
└──────────────────┴─────────┴─────────────┴───────────────┴───────────────┴───────────┴───────────┘

#   Summarize the treatment frame using "weight_1", the weight column that was
#   passed to Calibration as its starting weight. df_treatment already carries
#   "weight_final" at this point, so it shows up as an ordinary row.
logger.info("\n\n'Treatment', original weights")
_ = summary(df_treatment, weight="weight_1")


'Treatment', original weights

┌──────────────────┬─────────┬─────────────┬───────────────┬───────────────┬───────────┬───────────┐
│         Variable ┆       n ┆ n (missing) ┆          mean ┆           std ┆       min ┆       max │
╞══════════════════╪═════════╪═════════════╪═══════════════╪═══════════════╪═══════════╪═══════════╡
│            index ┆ 100,000 ┆           0 ┆ 49,997.423141 ┆ 28,873.734839 ┆       0.0 ┆  99,999.0 │
│              v_1 ┆ 100,000 ┆           0 ┆       5.51228 ┆      2.868293 ┆       1.0 ┆      10.0 │
│ v_f_continuous_0 ┆ 100,000 ┆           0 ┆      10.99576 ┆      4.013016 ┆ -5.459849 ┆ 30.287625 │
│ v_f_continuous_1 ┆ 100,000 ┆           0 ┆     11.014155 ┆      3.999313 ┆ -7.360353 ┆ 27.415263 │
│ v_f_continuous_2 ┆ 100,000 ┆           0 ┆      11.00231 ┆      4.005346 ┆ -8.478953 ┆ 27.102181 │
│          v_extra ┆ 100,000 ┆           0 ┆      0.500786 ┆      0.867307 ┆ -0.999996 ┆  1.999988 │
│         weight_0 ┆ 100,000 ┆           0 ┆      9.997927 ┆      1.001265 ┆   5.26969 ┆ 14.090873 │
│             year ┆ 100,000 ┆           0 ┆  2,018.500692 ┆      1.707684 ┆   2,016.0 ┆   2,021.0 │
│            month ┆ 100,000 ┆           0 ┆      6.489706 ┆      3.447587 ┆       1.0 ┆      12.0 │
│     weight_final ┆ 100,000 ┆           0 ┆      1.010022 ┆      0.475068 ┆  0.149575 ┆   7.45808 │
└──────────────────┴─────────┴─────────────┴───────────────┴───────────────┴───────────┴───────────┘

#   Summarize the treatment frame again, now using the calibrated weight
#   "weight_final". In the captured output the weighted means of v_1 and
#   v_f_continuous_* equal the population means (e.g. v_1 = 5.484124).
logger.info("\n\n'Treatment', calibrated")
_ = summary(df_treatment, weight="weight_final")


'Treatment', calibrated

┌──────────────────┬─────────┬─────────────┬───────────────┬───────────────┬───────────┬───────────┐
│         Variable ┆       n ┆ n (missing) ┆          mean ┆           std ┆       min ┆       max │
╞══════════════════╪═════════╪═════════════╪═══════════════╪═══════════════╪═══════════╪═══════════╡
│            index ┆ 100,000 ┆           0 ┆ 49,953.704786 ┆ 28,879.488768 ┆       0.0 ┆  99,999.0 │
│              v_1 ┆ 100,000 ┆           0 ┆      5.484124 ┆      2.868654 ┆       1.0 ┆      10.0 │
│ v_f_continuous_0 ┆ 100,000 ┆           0 ┆      9.989362 ┆      4.007166 ┆ -5.459849 ┆ 30.287625 │
│ v_f_continuous_1 ┆ 100,000 ┆           0 ┆     10.002072 ┆      4.006294 ┆ -7.360353 ┆ 27.415263 │
│ v_f_continuous_2 ┆ 100,000 ┆           0 ┆      9.998039 ┆      4.007545 ┆ -8.478953 ┆ 27.102181 │
│          v_extra ┆ 100,000 ┆           0 ┆      0.502229 ┆      0.867483 ┆ -0.999996 ┆  1.999988 │
│         weight_0 ┆ 100,000 ┆           0 ┆      9.999153 ┆      1.000994 ┆   5.26969 ┆ 14.090873 │
│         weight_1 ┆ 100,000 ┆           0 ┆     10.102295 ┆      0.998432 ┆  5.539069 ┆ 14.227871 │
│             year ┆ 100,000 ┆           0 ┆  2,018.501971 ┆      1.708979 ┆   2,016.0 ┆   2,021.0 │
│            month ┆ 100,000 ┆           0 ┆       6.49219 ┆      3.448097 ┆       1.0 ┆      12.0 │
└──────────────────┴─────────┴─────────────┴───────────────┴───────────────┴───────────┴───────────┘