In [1]:
from pathlib import Path
from survey_kit.utilities.random import RandomData
from survey_kit.utilities.formula_builder import FormulaBuilder
from survey_kit.calibration.moment import Moment
from survey_kit.calibration.calibration import Calibration
from survey_kit.utilities.dataframe import summary
import narwhals as nw
from survey_kit import logger
In [2]:
logger.info("Generating data for weighting")
n_rows = 100_000
df_population = (
RandomData(n_rows=n_rows, seed=12332151)
.index("index")
.integer("v_1", 1, 10)
.np_distribution("v_f_continuous_0", "normal", loc=10, scale=2)
.np_distribution("v_f_continuous_1", "normal", loc=10, scale=2)
.np_distribution("v_f_continuous_2", "normal", loc=10, scale=2)
.float("v_extra", -1, 2)
.np_distribution("weight_0", "normal", loc=10, scale=1)
.np_distribution("weight_1", "normal", loc=10, scale=1)
.integer("year", 2016, 2021)
.integer("month", 1, 12)
.to_df()
.lazy()
)
df_treatment = (
RandomData(n_rows=n_rows, seed=894654)
.index("index")
.integer("v_1", 1, 10)
# Intentionally set the loc/scale as different than above
.np_distribution("v_f_continuous_0", "normal", loc=11, scale=4)
.np_distribution("v_f_continuous_1", "normal", loc=11, scale=4)
.np_distribution("v_f_continuous_2", "normal", loc=11, scale=4)
.float("v_extra", -1, 2)
.np_distribution("weight_0", "normal", loc=10, scale=1)
.np_distribution("weight_1", "normal", loc=10, scale=1)
.integer("year", 2016, 2021)
.integer("month", 1, 12)
.to_df()
.lazy()
)
# print(df.describe())
Generating data for weighting
In [3]:
logger.info("Weighting 'function'")
f = FormulaBuilder(df=df_population, constant=False)
f.continuous(columns=["v_1", "v_f_continuous_*", "v_f_p2_*"])
# f.simple_interaction(columns=["v_1","v_f_continuous_0"])
logger.info("Define the target moments that the weighting will match")
logger.info(" This can be a dataset or a single row of pop controls")
m = Moment(
df=df_population,
formula=f.formula,
weight="weight_0",
index="index",
# by=["year"],
rescale=True,
)
logger.info("You can save/reload moments if you want")
# m.save("/my/path/moment")
# m_loaded = Moment.load("/my/path/moment")
Weighting 'function'
Define the target moments that the weighting will match
This can be a dataset or a single row of pop controls
You can save/reload moments if you want
In [4]:
# Calibrate the data in df_treatment to the moment above
c = Calibration(
df=df_treatment,
moments=m,
weight="weight_1",
final_weight="weight_final"
)
c.run(
# Drop a moment if there are too few observations
min_obs=5,
# If it fails to converge, set bounds on the weights
# final weights = (base*ratio) where the bounds are on the ratio
# for "best possible" weights
bounds=(0.001, 1000)
)
# Merge the final weights back on the treatment data
df_treatment = c.get_final_weights(df_treatment)
Aggregating any sub_moments
Calibrating weights using aebw
min obs = 5
Calibration using combined moments
Entropy Balance Rewighting, Sanders (2024)
Input matrix is sparse? False
Problem Size: 100000 rows, 5 moments
# Criterion ||Eq. Const.|| ||FOC Lagr.|| PrimalStepSize DualStepSize Opt. Violation.
0 0.000000 17465.610380 0.0000 inf inf 17465.61037995
1 1945.423027 7092.371751 46.6445 123.27371263 2.06592760 7092.52513345
2 9247.126302 68.148229 10.2235 70.89880835 0.25970726 68.91081991
3 9477.094839 1.179172 0.9164 12.66666907 0.05747635 1.49338233
4 9472.014246 0.006919 0.2327 0.94332784 0.00277490 0.23283020
5 9471.976595 0.000009 0.0234 0.03116427 0.00000998 0.02340470
6 9471.976505 0.000000 0.0003 0.00342021 0.00000076 0.00026967
7 9471.976506 0.000000 0.0000 0.00004033 0.00000001 0.00000004
Optimality converged.
Time elapsed: 0.11071370000718161
Optimization completed, success?: True
Converged: True
Maximum Difference: 3.6637359812630166e-15
shape: (4, 8) ┌─────────────────────┬──────────┬─────────┬───────────┬────────────┬─────────┬─────────────┬─────────────┐ │ Variable ┆ Initial ┆ Targets ┆ Estimates ┆ Calibrated ┆ NonZero ┆ diff ┆ percent │ ╞═════════════════════╪══════════╪═════════╪═══════════╪════════════╪═════════╪═════════════╪═════════════╡ │ m0_v_f_continuous_0 ┆ 1.100747 ┆ 1 ┆ 1.0 ┆ 1 ┆ 100000 ┆ -3.6637e-15 ┆ -3.6637e-13 │ │ m0_v_f_continuous_1 ┆ 1.101187 ┆ 1 ┆ 1.0 ┆ 1 ┆ 100000 ┆ 2.8866e-15 ┆ 2.8866e-13 │ │ m0_v_f_continuous_2 ┆ 1.100447 ┆ 1 ┆ 1.0 ┆ 1 ┆ 100000 ┆ -2.6645e-15 ┆ -2.6645e-13 │ │ m0_v_1 ┆ 1.005134 ┆ 1 ┆ 1.0 ┆ 1 ┆ 100000 ┆ -3.3307e-16 ┆ -3.3307e-14 │ └─────────────────────┴──────────┴─────────┴───────────┴────────────┴─────────┴─────────────┴─────────────┘
In [5]:
logger.info("'Population' estimates")
_ = summary(df_population,
weight="weight_0")
'Population' estimates
┌──────────────────┬─────────┬─────────────┬───────────────┬───────────────┬───────────┬───────────┐ │ Variable ┆ n ┆ n (missing) ┆ mean ┆ std ┆ min ┆ max │ ╞══════════════════╪═════════╪═════════════╪═══════════════╪═══════════════╪═══════════╪═══════════╡ │ index ┆ 100,000 ┆ 0 ┆ 49,987.162827 ┆ 28,870.556017 ┆ 0.0 ┆ 99,999.0 │ │ v_1 ┆ 100,000 ┆ 0 ┆ 5.484124 ┆ 2.875588 ┆ 1.0 ┆ 10.0 │ │ v_f_continuous_0 ┆ 100,000 ┆ 0 ┆ 9.989362 ┆ 1.996633 ┆ 1.491748 ┆ 18.835062 │ │ v_f_continuous_1 ┆ 100,000 ┆ 0 ┆ 10.002072 ┆ 2.006889 ┆ 1.37638 ┆ 18.82769 │ │ v_f_continuous_2 ┆ 100,000 ┆ 0 ┆ 9.998039 ┆ 2.004505 ┆ 1.166252 ┆ 19.254231 │ │ v_extra ┆ 100,000 ┆ 0 ┆ 0.504297 ┆ 0.867154 ┆ -0.999978 ┆ 1.999996 │ │ weight_1 ┆ 100,000 ┆ 0 ┆ 10.004412 ┆ 1.002786 ┆ 5.35507 ┆ 14.023032 │ │ year ┆ 100,000 ┆ 0 ┆ 2,018.493399 ┆ 1.706467 ┆ 2,016.0 ┆ 2,021.0 │ │ month ┆ 100,000 ┆ 0 ┆ 6.504686 ┆ 3.44999 ┆ 1.0 ┆ 12.0 │ └──────────────────┴─────────┴─────────────┴───────────────┴───────────────┴───────────┴───────────┘
In [6]:
logger.info("\n\n'Treatment', original weights")
_ = summary(df_treatment,
weight="weight_1")
'Treatment', original weights
┌──────────────────┬─────────┬─────────────┬───────────────┬───────────────┬───────────┬───────────┐ │ Variable ┆ n ┆ n (missing) ┆ mean ┆ std ┆ min ┆ max │ ╞══════════════════╪═════════╪═════════════╪═══════════════╪═══════════════╪═══════════╪═══════════╡ │ index ┆ 100,000 ┆ 0 ┆ 49,997.423141 ┆ 28,873.734839 ┆ 0.0 ┆ 99,999.0 │ │ v_1 ┆ 100,000 ┆ 0 ┆ 5.51228 ┆ 2.868293 ┆ 1.0 ┆ 10.0 │ │ v_f_continuous_0 ┆ 100,000 ┆ 0 ┆ 10.99576 ┆ 4.013016 ┆ -5.459849 ┆ 30.287625 │ │ v_f_continuous_1 ┆ 100,000 ┆ 0 ┆ 11.014155 ┆ 3.999313 ┆ -7.360353 ┆ 27.415263 │ │ v_f_continuous_2 ┆ 100,000 ┆ 0 ┆ 11.00231 ┆ 4.005346 ┆ -8.478953 ┆ 27.102181 │ │ v_extra ┆ 100,000 ┆ 0 ┆ 0.500786 ┆ 0.867307 ┆ -0.999996 ┆ 1.999988 │ │ weight_0 ┆ 100,000 ┆ 0 ┆ 9.997927 ┆ 1.001265 ┆ 5.26969 ┆ 14.090873 │ │ year ┆ 100,000 ┆ 0 ┆ 2,018.500692 ┆ 1.707684 ┆ 2,016.0 ┆ 2,021.0 │ │ month ┆ 100,000 ┆ 0 ┆ 6.489706 ┆ 3.447587 ┆ 1.0 ┆ 12.0 │ │ weight_final ┆ 100,000 ┆ 0 ┆ 1.010022 ┆ 0.475068 ┆ 0.149575 ┆ 7.45808 │ └──────────────────┴─────────┴─────────────┴───────────────┴───────────────┴───────────┴───────────┘
In [7]:
logger.info("\n\n'Treatment', calibrated")
_ = summary(df_treatment,
weight="weight_final")
'Treatment', calibrated
┌──────────────────┬─────────┬─────────────┬───────────────┬───────────────┬───────────┬───────────┐ │ Variable ┆ n ┆ n (missing) ┆ mean ┆ std ┆ min ┆ max │ ╞══════════════════╪═════════╪═════════════╪═══════════════╪═══════════════╪═══════════╪═══════════╡ │ index ┆ 100,000 ┆ 0 ┆ 49,953.704786 ┆ 28,879.488768 ┆ 0.0 ┆ 99,999.0 │ │ v_1 ┆ 100,000 ┆ 0 ┆ 5.484124 ┆ 2.868654 ┆ 1.0 ┆ 10.0 │ │ v_f_continuous_0 ┆ 100,000 ┆ 0 ┆ 9.989362 ┆ 4.007166 ┆ -5.459849 ┆ 30.287625 │ │ v_f_continuous_1 ┆ 100,000 ┆ 0 ┆ 10.002072 ┆ 4.006294 ┆ -7.360353 ┆ 27.415263 │ │ v_f_continuous_2 ┆ 100,000 ┆ 0 ┆ 9.998039 ┆ 4.007545 ┆ -8.478953 ┆ 27.102181 │ │ v_extra ┆ 100,000 ┆ 0 ┆ 0.502229 ┆ 0.867483 ┆ -0.999996 ┆ 1.999988 │ │ weight_0 ┆ 100,000 ┆ 0 ┆ 9.999153 ┆ 1.000994 ┆ 5.26969 ┆ 14.090873 │ │ weight_1 ┆ 100,000 ┆ 0 ┆ 10.102295 ┆ 0.998432 ┆ 5.539069 ┆ 14.227871 │ │ year ┆ 100,000 ┆ 0 ┆ 2,018.501971 ┆ 1.708979 ┆ 2,016.0 ┆ 2,021.0 │ │ month ┆ 100,000 ┆ 0 ┆ 6.49219 ┆ 3.448097 ┆ 1.0 ┆ 12.0 │ └──────────────────┴─────────┴─────────────┴───────────────┴───────────────┴───────────┴───────────┘