In [1]:
import polars as pl
from survey_kit_data.bls.cex import cex
from survey_kit.utilities.dataframe import summary, columns_from_list
from survey_kit.utilities.dataframe_list import DataFrameList

from survey_kit.statistics.multiple_imputation import mi_ses_from_function
from survey_kit.statistics.calculator import StatCalculator
from survey_kit.statistics.statistics import Statistics
from survey_kit.statistics.replicates import Replicates


from survey_kit import logger
In [2]:
logger.info("Download the CEX (using in-development survey-kit-data package)")
# The object returned for 2023 is indexed by table name below.
d_cex = cex(2023)

logger.info("Work with the family file from Q2 2023")
# Pick the family table and lower-case every column name in one chained expression.
df_fml = d_cex["interview_fmli232"].select(pl.all().name.to_lowercase())
Download the CEX (using in-development survey-kit-data package)
Loading from cached data
Loading from cached data
Loading from cached data
Work with the family file from Q2 2023
In [3]:
logger.info("Rename the weights so they're easier to work with")
# Drop the zero padding on the single-digit suffixes: wtrep00..wtrep09 -> wtrep0..wtrep9.
unpadded_names = {f"wtrep0{digit}": f"wtrep{digit}" for digit in range(10)}
# Copy finlwt21 into a "wtrep00" column first so the rename below turns it into "wtrep0".
# NOTE(review): finlwt21 is presumably the full-sample weight; confirm against the CEX docs.
main_weight = pl.col("finlwt21").alias("wtrep00")
df_fml = df_fml.with_columns(main_weight).rename(unpadded_names)
Rename the weights so they're easier to work with
In [4]:
logger.info("Put the weights in their own file")
# Presumably expands the "wtrep*" wildcard into concrete column names; verify in survey_kit.
weight_columns = columns_from_list(df_fml, ["newid", "wtrep*"])
# fill_null(0) applies to every selected column (including newid); collect().lazy()
# evaluates the query once and hands the result back as a lazy frame.
df_weights = df_fml.select(weight_columns).fill_null(0).collect().lazy()
Put the weights in their own file
In [5]:
logger.info("Create 5 implicate files with the fsalary variable")
# One frame per implicate column fsalary1..fsalary5, each exposing the value under
# the common name "fsalary" next to the newid key.
salary_implicates = [
    df_fml.select(["newid", pl.col(f"fsalary{n}").alias("fsalary")]).collect().lazy()
    for n in range(1, 6)
]
df_salary = DataFrameList(salary_implicates)

# Assign to `_` so only the printed summaries show up, not the return value's repr.
_ = df_salary.pipe(summary)
Create 5 implicate files with the fsalary variable
┌──────────┬───────┬─────────────┬──────────────────┬───────────────┬───────────┬───────────┐
│ Variable ┆     n ┆ n (missing) ┆             mean ┆           std ┆       min ┆       max │
╞══════════╪═══════╪═════════════╪══════════════════╪═══════════════╪═══════════╪═══════════╡
│    newid ┆ 4,751 ┆           0 ┆ 5,237,971.630394 ┆ 83,970.849517 ┆ 5,090,604 ┆ 5,366,911 │
│  fsalary ┆ 4,751 ┆           0 ┆     74,624.49358 ┆ 92,729.366885 ┆         0 ┆   596,941 │
└──────────┴───────┴─────────────┴──────────────────┴───────────────┴───────────┴───────────┘
┌──────────┬───────┬─────────────┬──────────────────┬───────────────┬───────────┬───────────┐
│ Variable ┆     n ┆ n (missing) ┆             mean ┆           std ┆       min ┆       max │
╞══════════╪═══════╪═════════════╪══════════════════╪═══════════════╪═══════════╪═══════════╡
│    newid ┆ 4,751 ┆           0 ┆ 5,237,971.630394 ┆ 83,970.849517 ┆ 5,090,604 ┆ 5,366,911 │
│  fsalary ┆ 4,751 ┆           0 ┆    74,330.296148 ┆ 92,538.918118 ┆         0 ┆   596,941 │
└──────────┴───────┴─────────────┴──────────────────┴───────────────┴───────────┴───────────┘
┌──────────┬───────┬─────────────┬──────────────────┬───────────────┬───────────┬───────────┐
│ Variable ┆     n ┆ n (missing) ┆             mean ┆           std ┆       min ┆       max │
╞══════════╪═══════╪═════════════╪══════════════════╪═══════════════╪═══════════╪═══════════╡
│    newid ┆ 4,751 ┆           0 ┆ 5,237,971.630394 ┆ 83,970.849517 ┆ 5,090,604 ┆ 5,366,911 │
│  fsalary ┆ 4,751 ┆           0 ┆    74,650.094717 ┆ 92,585.289144 ┆         0 ┆   596,941 │
└──────────┴───────┴─────────────┴──────────────────┴───────────────┴───────────┴───────────┘
┌──────────┬───────┬─────────────┬──────────────────┬───────────────┬───────────┬───────────┐
│ Variable ┆     n ┆ n (missing) ┆             mean ┆           std ┆       min ┆       max │
╞══════════╪═══════╪═════════════╪══════════════════╪═══════════════╪═══════════╪═══════════╡
│    newid ┆ 4,751 ┆           0 ┆ 5,237,971.630394 ┆ 83,970.849517 ┆ 5,090,604 ┆ 5,366,911 │
│  fsalary ┆ 4,751 ┆           0 ┆    73,996.612924 ┆ 92,211.313746 ┆         0 ┆   596,941 │
└──────────┴───────┴─────────────┴──────────────────┴───────────────┴───────────┴───────────┘
┌──────────┬───────┬─────────────┬──────────────────┬───────────────┬───────────┬───────────┐
│ Variable ┆     n ┆ n (missing) ┆             mean ┆           std ┆       min ┆       max │
╞══════════╪═══════╪═════════════╪══════════════════╪═══════════════╪═══════════╪═══════════╡
│    newid ┆ 4,751 ┆           0 ┆ 5,237,971.630394 ┆ 83,970.849517 ┆ 5,090,604 ┆ 5,366,911 │
│  fsalary ┆ 4,751 ┆           0 ┆    74,656.227952 ┆ 92,728.316341 ┆         0 ┆   596,941 │
└──────────┴───────┴─────────────┴──────────────────┴───────────────┴───────────┴───────────┘
In [6]:
logger.info("What statistics do I want:")
logger.info("   In this case, mean and some percentiles of fsalary")
# The mean plus the 25th, 50th and 75th percentiles, all computed on fsalary.
requested_stats = ["mean", "p25", "p50", "p75"]
stats = Statistics(stats=requested_stats, columns="fsalary")
What statistics do I want:
   In this case, mean and some percentiles of fsalary
In [7]:
logger.info(
    "Define the 'replicate' object, which tells us what the weight variables are"
)
# The weight columns in df_weights share the "wtrep" prefix (wtrep0, wtrep1, ...).
# NOTE(review): bootstrap=False presumably selects a non-bootstrap replicate variance
# formula; confirm against the survey_kit Replicates documentation.
replicates = Replicates(weight_stub="wtrep", df=df_weights, bootstrap=False)
Define the 'replicate' object, which tells us what the weight variables are
In [8]:
logger.info("Arguments that are getting passed to StatCalculator at each run")
# Keyword arguments shared by every StatCalculator call made in the MI step.
arguments = {"statistics": stats, "replicates": replicates}
Arguments that are getting passed to StatCalculator at each run
In [9]:
logger.info("Get the multiple imputation standard errors by calling StatCalculator")
logger.info("   for each implicate and each replicate factor")
# StatCalculator is the delegate invoked with `arguments` for each implicate frame in
# df_salary; df_weights is passed as the non-imputed data, keyed by "newid".
# NOTE(review): join_on=["Variable"] presumably aligns the per-implicate result
# tables on their "Variable" column before combining them; confirm in survey_kit.
mi_results_seq = mi_ses_from_function(
    delegate=StatCalculator,
    df_implicates=df_salary,
    df_noimputes=df_weights,
    index=["newid"],
    arguments=arguments,
    join_on=["Variable"],
    parallel=False,  # run the implicates one after another
)

logger.info("\n\nMI Salary Statistics")
mi_results_seq.print()
Get the multiple imputation standard errors by calling StatCalculator
   for each implicate and each replicate factor
Implicate #1
0
.
.
.
.
5
.
.
.
.
10
.
.
.
.
15
.
.
.
.
20
.
.
.
.
25
.
.
.
.
30
.
.
.
.
35
.
.
.
.
40
.
.
.
.
┌──────────┬───────────────┬────────────┬─────────────┬──────────────┐
│ Variable ┆          mean ┆        p25 ┆         p50 ┆          p75 │
╞══════════╪═══════════════╪════════════╪═════════════╪══════════════╡
│  fsalary ┆ 76,730.534733 ┆      605.0 ┆    51,000.0 ┆    112,000.0 │
│          ┆  2,179.422314 ┆ 749.429116 ┆ 2,347.10487 ┆ 3,675.637718 │
└──────────┴───────────────┴────────────┴─────────────┴──────────────┘
Implicate #2
0
.
.
.
.
5
.
.
.
.
10
.
.
.
.
15
.
.
.
.
20
.
.
.
.
25
.
.
.
.

30
.
.
.
.
35
.
.
.
.
40
.
.
.
.
┌──────────┬───────────────┬────────────┬──────────────┬──────────────┐
│ Variable ┆          mean ┆        p25 ┆          p50 ┆          p75 │
╞══════════╪═══════════════╪════════════╪══════════════╪══════════════╡
│  fsalary ┆ 76,357.198723 ┆      589.0 ┆     50,513.0 ┆    110,000.0 │
│          ┆  2,127.624822 ┆ 665.066709 ┆ 2,432.255556 ┆ 4,362.443852 │
└──────────┴───────────────┴────────────┴──────────────┴──────────────┘
Implicate #3
0
.
.
.
.
5
.
.
.
.
10
.
.
.
.
15
.
.
.
.
20
.
.
.
.
25
.
.
.

.
30
.
.
.
.
35
.
.
.
.
40
.
.
.
.
┌──────────┬───────────────┬────────────┬─────────────┬──────────────┐
│ Variable ┆          mean ┆        p25 ┆         p50 ┆          p75 │
╞══════════╪═══════════════╪════════════╪═════════════╪══════════════╡
│  fsalary ┆ 76,665.233977 ┆      700.0 ┆    51,147.0 ┆    112,000.0 │
│          ┆  2,091.233478 ┆ 730.400513 ┆ 1,931.75556 ┆ 3,742.759349 │
└──────────┴───────────────┴────────────┴─────────────┴──────────────┘
Implicate #4
0
.
.
.
.
5
.
.
.
.
10
.
.
.
.
15
.
.
.
.
20
.
.
.
.
25

.
.
.
.
30
.
.
.
.
35
.
.
.
.
40
.
.
.
.
┌──────────┬───────────────┬────────────┬──────────────┬──────────────┐
│ Variable ┆          mean ┆        p25 ┆          p50 ┆          p75 │
╞══════════╪═══════════════╪════════════╪══════════════╪══════════════╡
│  fsalary ┆ 76,049.387968 ┆      672.0 ┆     51,000.0 ┆    110,000.0 │
│          ┆  2,045.904634 ┆ 759.715317 ┆ 2,308.704261 ┆ 3,839.347257 │
└──────────┴───────────────┴────────────┴──────────────┴──────────────┘
Implicate #5
0
.
.
.
.
5
.
.
.
.
10
.
.
.
.
15
.
.
.
.
20
.
.
.
.
25
.
.

.
.
30
.
.
.
.
35
.
.
.
.
40
.
.
.
.
┌──────────┬───────────────┬────────────┬─────────────┬──────────────┐
│ Variable ┆          mean ┆        p25 ┆         p50 ┆          p75 │
╞══════════╪═══════════════╪════════════╪═════════════╪══════════════╡
│  fsalary ┆ 76,815.872639 ┆      500.0 ┆    51,000.0 ┆    112,140.0 │
│          ┆  1,937.968821 ┆ 630.407732 ┆ 1,932.57245 ┆ 3,894.589079 │
└──────────┴───────────────┴────────────┴─────────────┴──────────────┘

MI Salary Statistics
┌──────────┬───────────────┬────────────┬─────────────┬──────────────┐
│ Variable ┆          mean ┆        p25 ┆         p50 ┆          p75 │
╞══════════╪═══════════════╪════════════╪═════════════╪══════════════╡
│  fsalary ┆ 76,523.645608 ┆      613.2 ┆    50,932.0 ┆    111,228.0 │
│          ┆  2,106.783483 ┆ 713.962479 ┆ 2,216.97481 ┆ 4,099.204996 │
└──────────┴───────────────┴────────────┴─────────────┴──────────────┘