In [1]:
from pathlib import Path
from survey_kit.utilities.random import RandomData
from survey_kit.utilities.formula_builder import FormulaBuilder
from survey_kit.calibration.moment import Moment
from survey_kit.calibration.calibration import Calibration
from survey_kit.utilities.dataframe import summary
import narwhals as nw
from survey_kit import logger
In [2]:
logger.info("Generating data for weighting")
n_rows = 100_000
df_population = (
    RandomData(n_rows=n_rows, seed=12332151)
    .index("index")
    .integer("v_1", 1, 10)
    .np_distribution("v_f_continuous_0", "normal", loc=10, scale=2)
    .np_distribution("v_f_continuous_1", "normal", loc=10, scale=2)
    .np_distribution("v_f_continuous_2", "normal", loc=10, scale=2)
    .float("v_extra", -1, 2)
    .np_distribution("weight_0", "normal", loc=10, scale=1)
    .np_distribution("weight_1", "normal", loc=10, scale=1)
    .integer("year", 2016, 2021)
    .integer("month", 1, 12)
    .to_df()
    .lazy()
)

df_treatment = (
    RandomData(n_rows=n_rows, seed=894654)
    .index("index")
    .integer("v_1", 1, 10)
    #   Intentionally set the loc/scale as different than above
    .np_distribution("v_f_continuous_0", "normal", loc=11, scale=4)
    .np_distribution("v_f_continuous_1", "normal", loc=11, scale=4)
    .np_distribution("v_f_continuous_2", "normal", loc=11, scale=4)
    .float("v_extra", -1, 2)
    .np_distribution("weight_0", "normal", loc=10, scale=1)
    .np_distribution("weight_1", "normal", loc=10, scale=1)
    .integer("year", 2016, 2021)
    .integer("month", 1, 12)
    .to_df()
    .lazy()
)

# print(df.describe())
Generating data for weighting
In [3]:
logger.info("Weighting 'function'")
f = FormulaBuilder(df=df_population, constant=False)
f.continuous(columns=["v_1", "v_f_continuous_*", "v_f_p2_*"])
#   f.simple_interaction(columns=["v_1","v_f_continuous_0"])

logger.info("Define the target moments that the weighting will match")
logger.info("   This can be a dataset or a single row of pop controls")
m = Moment(
    df=df_population,
    formula=f.formula,
    weight="weight_0",
    index="index",
    #    by=["year"],
    rescale=True,
)

logger.info("You can save/reload moments if you want")
# m.save("/my/path/moment")
# m_loaded = Moment.load("/my/path/moment")
Weighting 'function'
Define the target moments that the weighting will match
   This can be a dataset or a single row of pop controls
You can save/reload moments if you want
In [4]:
#   Calibrate the data in df_treatment to the moment above
c = Calibration(
    df=df_treatment,
    moments=m,
    weight="weight_1",
    final_weight="weight_final"
)

c.run(
    #   Drop a moment if there are too few observations
    min_obs=5,
    # If it fails to converge, set bounds on the weights 
    #   final weights = (base*ratio) where the bounds are on the ratio
    #   for "best possible" weights   
    bounds=(0.001, 1000)
)

#   Merge the final weights back on the treatment data
df_treatment = c.get_final_weights(df_treatment)
Aggregating any sub_moments
Calibrating weights using aebw
      min obs = 5
     Calibration using combined moments
Entropy Balance Rewighting, Sanders (2024)
Input matrix is sparse? False
Problem Size: 100000 rows, 5 moments
  #    Criterion     ||Eq. Const.||  ||FOC Lagr.|| PrimalStepSize  DualStepSize  Opt. Violation.
  0     0.000000      17465.610380       0.0000          inf            inf      17465.61037995
  1   1945.423027     7092.371751       46.6445     123.27371263    2.06592760   7092.52513345 
  2   9247.126302      68.148229        10.2235      70.89880835    0.25970726    68.91081991  
  3   9477.094839       1.179172         0.9164      12.66666907    0.05747635     1.49338233  
  4   9472.014246       0.006919         0.2327      0.94332784     0.00277490     0.23283020  
  5   9471.976595       0.000009         0.0234      0.03116427     0.00000998     0.02340470  
  6   9471.976505       0.000000         0.0003      0.00342021     0.00000076     0.00026967  
  7   9471.976506       0.000000         0.0000      0.00004033     0.00000001     0.00000004  
Optimality converged.
Time elapsed: 0.11071370000718161
Optimization completed, success?: True

Converged:          True
Maximum Difference: 3.6637359812630166e-15
shape: (4, 8)
┌─────────────────────┬──────────┬─────────┬───────────┬────────────┬─────────┬─────────────┬─────────────┐
│            Variable ┆  Initial ┆ Targets ┆ Estimates ┆ Calibrated ┆ NonZero ┆        diff ┆     percent │
╞═════════════════════╪══════════╪═════════╪═══════════╪════════════╪═════════╪═════════════╪═════════════╡
│ m0_v_f_continuous_0 ┆ 1.100747 ┆       1 ┆       1.0 ┆          1 ┆  100000 ┆ -3.6637e-15 ┆ -3.6637e-13 │
│ m0_v_f_continuous_1 ┆ 1.101187 ┆       1 ┆       1.0 ┆          1 ┆  100000 ┆  2.8866e-15 ┆  2.8866e-13 │
│ m0_v_f_continuous_2 ┆ 1.100447 ┆       1 ┆       1.0 ┆          1 ┆  100000 ┆ -2.6645e-15 ┆ -2.6645e-13 │
│              m0_v_1 ┆ 1.005134 ┆       1 ┆       1.0 ┆          1 ┆  100000 ┆ -3.3307e-16 ┆ -3.3307e-14 │
└─────────────────────┴──────────┴─────────┴───────────┴────────────┴─────────┴─────────────┴─────────────┘
In [5]:
logger.info("'Population' estimates")
_ = summary(df_population,
        weight="weight_0")
'Population' estimates
┌──────────────────┬─────────┬─────────────┬───────────────┬───────────────┬───────────┬───────────┐
│         Variable ┆       n ┆ n (missing) ┆          mean ┆           std ┆       min ┆       max │
╞══════════════════╪═════════╪═════════════╪═══════════════╪═══════════════╪═══════════╪═══════════╡
│            index ┆ 100,000 ┆           0 ┆ 49,987.162827 ┆ 28,870.556017 ┆       0.0 ┆  99,999.0 │
│              v_1 ┆ 100,000 ┆           0 ┆      5.484124 ┆      2.875588 ┆       1.0 ┆      10.0 │
│ v_f_continuous_0 ┆ 100,000 ┆           0 ┆      9.989362 ┆      1.996633 ┆  1.491748 ┆ 18.835062 │
│ v_f_continuous_1 ┆ 100,000 ┆           0 ┆     10.002072 ┆      2.006889 ┆   1.37638 ┆  18.82769 │
│ v_f_continuous_2 ┆ 100,000 ┆           0 ┆      9.998039 ┆      2.004505 ┆  1.166252 ┆ 19.254231 │
│          v_extra ┆ 100,000 ┆           0 ┆      0.504297 ┆      0.867154 ┆ -0.999978 ┆  1.999996 │
│         weight_1 ┆ 100,000 ┆           0 ┆     10.004412 ┆      1.002786 ┆   5.35507 ┆ 14.023032 │
│             year ┆ 100,000 ┆           0 ┆  2,018.493399 ┆      1.706467 ┆   2,016.0 ┆   2,021.0 │
│            month ┆ 100,000 ┆           0 ┆      6.504686 ┆       3.44999 ┆       1.0 ┆      12.0 │
└──────────────────┴─────────┴─────────────┴───────────────┴───────────────┴───────────┴───────────┘
In [6]:
logger.info("\n\n'Treatment', original weights")
_ = summary(df_treatment,
        weight="weight_1")

'Treatment', original weights
┌──────────────────┬─────────┬─────────────┬───────────────┬───────────────┬───────────┬───────────┐
│         Variable ┆       n ┆ n (missing) ┆          mean ┆           std ┆       min ┆       max │
╞══════════════════╪═════════╪═════════════╪═══════════════╪═══════════════╪═══════════╪═══════════╡
│            index ┆ 100,000 ┆           0 ┆ 49,997.423141 ┆ 28,873.734839 ┆       0.0 ┆  99,999.0 │
│              v_1 ┆ 100,000 ┆           0 ┆       5.51228 ┆      2.868293 ┆       1.0 ┆      10.0 │
│ v_f_continuous_0 ┆ 100,000 ┆           0 ┆      10.99576 ┆      4.013016 ┆ -5.459849 ┆ 30.287625 │
│ v_f_continuous_1 ┆ 100,000 ┆           0 ┆     11.014155 ┆      3.999313 ┆ -7.360353 ┆ 27.415263 │
│ v_f_continuous_2 ┆ 100,000 ┆           0 ┆      11.00231 ┆      4.005346 ┆ -8.478953 ┆ 27.102181 │
│          v_extra ┆ 100,000 ┆           0 ┆      0.500786 ┆      0.867307 ┆ -0.999996 ┆  1.999988 │
│         weight_0 ┆ 100,000 ┆           0 ┆      9.997927 ┆      1.001265 ┆   5.26969 ┆ 14.090873 │
│             year ┆ 100,000 ┆           0 ┆  2,018.500692 ┆      1.707684 ┆   2,016.0 ┆   2,021.0 │
│            month ┆ 100,000 ┆           0 ┆      6.489706 ┆      3.447587 ┆       1.0 ┆      12.0 │
│     weight_final ┆ 100,000 ┆           0 ┆      1.010022 ┆      0.475068 ┆  0.149575 ┆   7.45808 │
└──────────────────┴─────────┴─────────────┴───────────────┴───────────────┴───────────┴───────────┘
In [7]:
logger.info("\n\n'Treatment', calibrated")
_ = summary(df_treatment,
        weight="weight_final")

'Treatment', calibrated
┌──────────────────┬─────────┬─────────────┬───────────────┬───────────────┬───────────┬───────────┐
│         Variable ┆       n ┆ n (missing) ┆          mean ┆           std ┆       min ┆       max │
╞══════════════════╪═════════╪═════════════╪═══════════════╪═══════════════╪═══════════╪═══════════╡
│            index ┆ 100,000 ┆           0 ┆ 49,953.704786 ┆ 28,879.488768 ┆       0.0 ┆  99,999.0 │
│              v_1 ┆ 100,000 ┆           0 ┆      5.484124 ┆      2.868654 ┆       1.0 ┆      10.0 │
│ v_f_continuous_0 ┆ 100,000 ┆           0 ┆      9.989362 ┆      4.007166 ┆ -5.459849 ┆ 30.287625 │
│ v_f_continuous_1 ┆ 100,000 ┆           0 ┆     10.002072 ┆      4.006294 ┆ -7.360353 ┆ 27.415263 │
│ v_f_continuous_2 ┆ 100,000 ┆           0 ┆      9.998039 ┆      4.007545 ┆ -8.478953 ┆ 27.102181 │
│          v_extra ┆ 100,000 ┆           0 ┆      0.502229 ┆      0.867483 ┆ -0.999996 ┆  1.999988 │
│         weight_0 ┆ 100,000 ┆           0 ┆      9.999153 ┆      1.000994 ┆   5.26969 ┆ 14.090873 │
│         weight_1 ┆ 100,000 ┆           0 ┆     10.102295 ┆      0.998432 ┆  5.539069 ┆ 14.227871 │
│             year ┆ 100,000 ┆           0 ┆  2,018.501971 ┆      1.708979 ┆   2,016.0 ┆   2,021.0 │
│            month ┆ 100,000 ┆           0 ┆       6.49219 ┆      3.448097 ┆       1.0 ┆      12.0 │
└──────────────────┴─────────┴─────────────┴───────────────┴───────────────┴───────────┴───────────┘