In [1]:
import polars as pl
from survey_kit_data.bls.cex import cex
from survey_kit.utilities.dataframe import summary, columns_from_list
from survey_kit.utilities.dataframe_list import DataFrameList
from survey_kit.statistics.multiple_imputation import mi_ses_from_function
from survey_kit.statistics.calculator import StatCalculator
from survey_kit.statistics.statistics import Statistics
from survey_kit.statistics.replicates import Replicates
from survey_kit import logger
In [2]:
# Fetch the 2023 CEX release; `cex` returns a mapping of named tables.
logger.info("Download the CEX (using in-development survey-kit-data package)")
d_cex = cex(2023)

# Pull the "interview_fmli232" table and lower-case every column name in one
# pass so later cells can refer to columns without worrying about case.
logger.info("Work with the family file from Q2 2023")
df_fml = d_cex["interview_fmli232"].select(pl.all().name.to_lowercase())
Download the CEX (using in-development survey-kit-data package)
Loading from cached data
Loading from cached data
Loading from cached data
Work with the family file from Q2 2023
In [3]:
# Copy finlwt21 in as "wtrep00", then drop the leading zero from the
# single-digit suffixes so wtrep00..wtrep09 become wtrep0..wtrep9.
# NOTE(review): this cell rebinds df_fml, so it is probably not safe to re-run
# on its own (the zero-padded names no longer exist afterwards) — re-run the
# load cell first.
logger.info("Rename the weights so they're easier to work with")
df_fml = (
    df_fml
    .with_columns(wtrep00=pl.col("finlwt21"))
    .rename({f"wtrep{i:02d}": f"wtrep{i}" for i in range(10)})
)
Rename the weights so they're easier to work with
In [4]:
# Split the id plus every "wtrep*" column into a separate frame.
logger.info("Put the weights in their own file")
weight_columns = columns_from_list(df_fml, ["newid", "wtrep*"])

# Nulls become 0; collect().lazy() materialises the result once and hands back
# a lazy frame again for the downstream calls.
df_weights = df_fml.select(weight_columns).fill_null(0).collect().lazy()
Put the weights in their own file
In [5]:
# Build one frame per implicate: fsalary1..fsalary5 are each exposed under the
# common name "fsalary" next to the "newid" key.
logger.info("Create 5 implicate files with the fsalary variable")
implicate_frames = []
for implicate in range(1, 6):
    one_implicate = df_fml.select(
        ["newid", pl.col(f"fsalary{implicate}").alias("fsalary")]
    )
    # Materialise now, then go back to a lazy frame.
    implicate_frames.append(one_implicate.collect().lazy())

df_salary = DataFrameList(implicate_frames)

# Print a summary table for every implicate; the return value is discarded.
_ = df_salary.pipe(summary)
Create 5 implicate files with the fsalary variable
┌──────────┬───────┬─────────────┬──────────────────┬───────────────┬───────────┬───────────┐ │ Variable ┆ n ┆ n (missing) ┆ mean ┆ std ┆ min ┆ max │ ╞══════════╪═══════╪═════════════╪══════════════════╪═══════════════╪═══════════╪═══════════╡ │ newid ┆ 4,751 ┆ 0 ┆ 5,237,971.630394 ┆ 83,970.849517 ┆ 5,090,604 ┆ 5,366,911 │ │ fsalary ┆ 4,751 ┆ 0 ┆ 74,624.49358 ┆ 92,729.366885 ┆ 0 ┆ 596,941 │ └──────────┴───────┴─────────────┴──────────────────┴───────────────┴───────────┴───────────┘
┌──────────┬───────┬─────────────┬──────────────────┬───────────────┬───────────┬───────────┐ │ Variable ┆ n ┆ n (missing) ┆ mean ┆ std ┆ min ┆ max │ ╞══════════╪═══════╪═════════════╪══════════════════╪═══════════════╪═══════════╪═══════════╡ │ newid ┆ 4,751 ┆ 0 ┆ 5,237,971.630394 ┆ 83,970.849517 ┆ 5,090,604 ┆ 5,366,911 │ │ fsalary ┆ 4,751 ┆ 0 ┆ 74,330.296148 ┆ 92,538.918118 ┆ 0 ┆ 596,941 │ └──────────┴───────┴─────────────┴──────────────────┴───────────────┴───────────┴───────────┘
┌──────────┬───────┬─────────────┬──────────────────┬───────────────┬───────────┬───────────┐ │ Variable ┆ n ┆ n (missing) ┆ mean ┆ std ┆ min ┆ max │ ╞══════════╪═══════╪═════════════╪══════════════════╪═══════════════╪═══════════╪═══════════╡ │ newid ┆ 4,751 ┆ 0 ┆ 5,237,971.630394 ┆ 83,970.849517 ┆ 5,090,604 ┆ 5,366,911 │ │ fsalary ┆ 4,751 ┆ 0 ┆ 74,650.094717 ┆ 92,585.289144 ┆ 0 ┆ 596,941 │ └──────────┴───────┴─────────────┴──────────────────┴───────────────┴───────────┴───────────┘
┌──────────┬───────┬─────────────┬──────────────────┬───────────────┬───────────┬───────────┐ │ Variable ┆ n ┆ n (missing) ┆ mean ┆ std ┆ min ┆ max │ ╞══════════╪═══════╪═════════════╪══════════════════╪═══════════════╪═══════════╪═══════════╡ │ newid ┆ 4,751 ┆ 0 ┆ 5,237,971.630394 ┆ 83,970.849517 ┆ 5,090,604 ┆ 5,366,911 │ │ fsalary ┆ 4,751 ┆ 0 ┆ 73,996.612924 ┆ 92,211.313746 ┆ 0 ┆ 596,941 │ └──────────┴───────┴─────────────┴──────────────────┴───────────────┴───────────┴───────────┘
┌──────────┬───────┬─────────────┬──────────────────┬───────────────┬───────────┬───────────┐ │ Variable ┆ n ┆ n (missing) ┆ mean ┆ std ┆ min ┆ max │ ╞══════════╪═══════╪═════════════╪══════════════════╪═══════════════╪═══════════╪═══════════╡ │ newid ┆ 4,751 ┆ 0 ┆ 5,237,971.630394 ┆ 83,970.849517 ┆ 5,090,604 ┆ 5,366,911 │ │ fsalary ┆ 4,751 ┆ 0 ┆ 74,656.227952 ┆ 92,728.316341 ┆ 0 ┆ 596,941 │ └──────────┴───────┴─────────────┴──────────────────┴───────────────┴───────────┴───────────┘
In [6]:
# Describe what to estimate: the mean plus the 25th/50th/75th percentile
# statistics ("p25", "p50", "p75") of the fsalary column.
logger.info("What statistics do I want:")
logger.info(" In this case, mean and some percentiles of fsalary")
stats = Statistics(
    columns="fsalary",
    stats=["mean"] + [f"p{pct}" for pct in (25, 50, 75)],
)
What statistics do I want:
In this case, mean and some percentiles of fsalary
In [7]:
# Tie the "wtrep"-prefixed columns of df_weights together as the weight set.
# NOTE(review): the log text below contains a typo ("tell is"); it is kept
# verbatim so the recorded output still matches.
# NOTE(review): bootstrap=False presumably marks these as non-bootstrap
# replicate weights — confirm against the Replicates documentation.
logger.info("Define the 'replicate' object, which tell is what the weight variables are")
replicates = Replicates(
    df=df_weights,
    weight_stub="wtrep",
    bootstrap=False,
)
Define the 'replicate' object, which tell is what the weight variables are
In [8]:
# Keyword arguments that are passed on to StatCalculator via
# mi_ses_from_function(arguments=...).
logger.info("Arguments that are getting passed to StatCalculator at each run")
arguments = {"statistics": stats, "replicates": replicates}
Arguments that are getting passed to StatCalculator at each run
In [9]:
# Run StatCalculator over the implicates and combine the results into
# multiple-imputation estimates and standard errors.
# NOTE(review): the first log text contains a typo ("errofs"); it is kept
# verbatim so the recorded output still matches.
logger.info("Get the multiple imputation standard errofs by calling StatCalculator")
logger.info(" for each implicate and each replicate factor")
mi_results_seq = mi_ses_from_function(
    delegate=StatCalculator,
    arguments=arguments,
    # The implicate frames and the weights frame both carry "newid".
    df_implicates=df_salary,
    df_noimputes=df_weights,
    index=["newid"],
    # The result tables are keyed by their "Variable" column.
    join_on=["Variable"],
    # presumably runs the implicates one after another — TODO confirm
    parallel=False,
)

logger.info("\n\nMI Salary Statistics")
mi_results_seq.print()
Get the multiple imputation standard errofs by calling StatCalculator
for each implicate and each replicate factor
Implicate #1
0
.
.
.
.
5
.
.
.
.
10
.
.
.
.
15
.
.
.
.
20
.
.
.
.
25
.
.
.
.
30
.
.
.
.
35
.
.
.
.
40
.
.
.
.
┌──────────┬───────────────┬────────────┬─────────────┬──────────────┐ │ Variable ┆ mean ┆ p25 ┆ p50 ┆ p75 │ ╞══════════╪═══════════════╪════════════╪═════════════╪══════════════╡ │ fsalary ┆ 76,730.534733 ┆ 605.0 ┆ 51,000.0 ┆ 112,000.0 │ │ ┆ 2,179.422314 ┆ 749.429116 ┆ 2,347.10487 ┆ 3,675.637718 │ └──────────┴───────────────┴────────────┴─────────────┴──────────────┘
Implicate #2
0
.
.
.
.
5
.
.
.
.
10
.
.
.
.
15
.
.
.
.
20
.
.
.
.
25
.
.
.
.
30
.
.
.
.
35
.
.
.
.
40
.
.
.
.
┌──────────┬───────────────┬────────────┬──────────────┬──────────────┐ │ Variable ┆ mean ┆ p25 ┆ p50 ┆ p75 │ ╞══════════╪═══════════════╪════════════╪══════════════╪══════════════╡ │ fsalary ┆ 76,357.198723 ┆ 589.0 ┆ 50,513.0 ┆ 110,000.0 │ │ ┆ 2,127.624822 ┆ 665.066709 ┆ 2,432.255556 ┆ 4,362.443852 │ └──────────┴───────────────┴────────────┴──────────────┴──────────────┘
Implicate #3
0
.
.
.
.
5
.
.
.
.
10
.
.
.
.
15
.
.
.
.
20
.
.
.
.
25
.
.
.
.
30
.
.
.
.
35
.
.
.
.
40
.
.
.
.
┌──────────┬───────────────┬────────────┬─────────────┬──────────────┐ │ Variable ┆ mean ┆ p25 ┆ p50 ┆ p75 │ ╞══════════╪═══════════════╪════════════╪═════════════╪══════════════╡ │ fsalary ┆ 76,665.233977 ┆ 700.0 ┆ 51,147.0 ┆ 112,000.0 │ │ ┆ 2,091.233478 ┆ 730.400513 ┆ 1,931.75556 ┆ 3,742.759349 │ └──────────┴───────────────┴────────────┴─────────────┴──────────────┘
Implicate #4
0
.
.
.
.
5
.
.
.
.
10
.
.
.
.
15
.
.
.
.
20
.
.
.
.
25
.
.
.
.
30
.
.
.
.
35
.
.
.
.
40
.
.
.
.
┌──────────┬───────────────┬────────────┬──────────────┬──────────────┐ │ Variable ┆ mean ┆ p25 ┆ p50 ┆ p75 │ ╞══════════╪═══════════════╪════════════╪══════════════╪══════════════╡ │ fsalary ┆ 76,049.387968 ┆ 672.0 ┆ 51,000.0 ┆ 110,000.0 │ │ ┆ 2,045.904634 ┆ 759.715317 ┆ 2,308.704261 ┆ 3,839.347257 │ └──────────┴───────────────┴────────────┴──────────────┴──────────────┘
Implicate #5
0
.
.
.
.
5
.
.
.
.
10
.
.
.
.
15
.
.
.
.
20
.
.
.
.
25
.
.
.
.
30
.
.
.
.
35
.
.
.
.
40
.
.
.
.
┌──────────┬───────────────┬────────────┬─────────────┬──────────────┐ │ Variable ┆ mean ┆ p25 ┆ p50 ┆ p75 │ ╞══════════╪═══════════════╪════════════╪═════════════╪══════════════╡ │ fsalary ┆ 76,815.872639 ┆ 500.0 ┆ 51,000.0 ┆ 112,140.0 │ │ ┆ 1,937.968821 ┆ 630.407732 ┆ 1,932.57245 ┆ 3,894.589079 │ └──────────┴───────────────┴────────────┴─────────────┴──────────────┘
MI Salary Statistics
┌──────────┬───────────────┬────────────┬─────────────┬──────────────┐ │ Variable ┆ mean ┆ p25 ┆ p50 ┆ p75 │ ╞══════════╪═══════════════╪════════════╪═════════════╪══════════════╡ │ fsalary ┆ 76,523.645608 ┆ 613.2 ┆ 50,932.0 ┆ 111,228.0 │ │ ┆ 2,106.783483 ┆ 713.962479 ┆ 2,216.97481 ┆ 4,099.204996 │ └──────────┴───────────────┴────────────┴─────────────┴──────────────┘