import sys
import os
from pathlib import Path

import narwhals as nw
import polars as pl
import polars.selectors as cs

from survey_kit.utilities.random import RandomData
from survey_kit.utilities.dataframe import summary

from survey_kit.imputation.variable import Variable
from survey_kit.imputation.parameters import Parameters
from survey_kit.imputation.srmi import SRMI
from survey_kit.imputation.selection import Selection

from survey_kit import logger, config
from survey_kit.utilities.dataframe import summary, columns_from_list
from survey_kit.utilities.formula_builder import FormulaBuilder


#   Put the "tests" directory (two levels above config.code_root) on sys.path
#       so that the scratch module can be imported from it
path = Path(config.code_root)
tests_dir = path.parent.parent / "tests"
sys.path.append(os.path.normpath(tests_dir))
from scratch import path_scratch


#   Use the location returned by path_scratch as the data root for this run
config.data_root = path_scratch(temp_file_suffix=False)

# Draw some random data

#   Number of rows to generate
n_rows = 10_000
#   Threshold compared against the missing_* draws further down:
#       a var_* value is blanked out when its draw is below this share
impute_share = 0.25


#   Build the frame with a fixed seed (presumably so repeated runs draw the same data)
df = (
    RandomData(n_rows=n_rows, seed=32565437)
    .index("index")
    .integer("year", 2016, 2020)
    .integer("month", 1, 12)
    #   var2 - var5 are the inputs the dependent variables are built from
    .integer("var2", 0, 10)
    .integer("var3", 0, 50)
    .float("var4", 0, 1)
    .integer("var5", 0, 1)
    #   unrelated_* play no part in building var_reg1 / var_reg2
    .float("unrelated_1", 0, 1)
    .float("unrelated_2", 0, 1)
    .float("unrelated_3", 0, 1)
    .float("unrelated_4", 0, 1)
    .float("unrelated_5", 0, 1)
    #   Noise terms for var_reg1 / var_reg2 (dropped again once those are built)
    .np_distribution("epsilon_reg1", "normal", scale=5)
    .np_distribution("epsilon_reg2", "normal", scale=5)
    #   Draws compared against impute_share to decide which values go missing
    .float("missing_reg1", 0, 1)
    .float("missing_reg2", 0, 1)
    .to_df()
)


#   Convenience references to them for creating dependent variables
c_var2 = pl.col("var2")
c_var3 = pl.col("var3")
c_var4 = pl.col("var4")
c_var5 = pl.col("var5")

#   Noise terms, one per dependent variable
c_e_reg1 = pl.col("epsilon_reg1")
c_e_reg2 = pl.col("epsilon_reg2")


logger.info("var_reg1 is binary and conditional on other variables")
#   True whenever the noisy linear index is positive
index_reg1 = c_var2 * 2 - c_var3 * 3 * c_var5 + c_e_reg1
c_reg1 = (index_reg1 > 0).alias("var_reg1")

logger.info("var_reg2 is != 0 only if var_reg1 == True")
#   Continuous value where var_reg1 holds, literal 0 everywhere else
value_reg2 = c_var2 * 1.5 - c_var3 * 1 * c_var4 + c_e_reg2
c_reg2 = (
    pl.when(pl.col("var_reg1"))
    .then(value_reg2)
    .otherwise(pl.lit(0))
    .alias("var_reg2")
)

#   Create a bunch of variables that are functions of the variables created above
#       (c_reg2 reads var_reg1, so var_reg1 is added in its own step first),
#       then discard the epsilon columns and attach a row counter
epsilon_columns = columns_from_list(df=df, columns="epsilon*")
df = df.with_columns(c_reg1)
df = df.with_columns(c_reg2)
df = df.drop(epsilon_columns)
df = df.with_row_index(name="_row_index_")

#   Keep a reference to the complete data (before any values are blanked out)
df_original = df

#   Set variables to missing according to the uniform random variables missing_
clear_missing = []
for prefixi in ["reg"]:
    for i in range(1, 3):
        vari = f"var_{prefixi}{i}"
        missingi = f"missing_{prefixi}{i}"

        #   Null out vari wherever its draw falls below the impute share
        below_share = pl.col(missingi) < impute_share
        blanked = pl.when(below_share).then(pl.lit(None)).otherwise(pl.col(vari))
        clear_missing.append(blanked.alias(vari))

#   Apply the edits, then remove the helper draws
df = df.with_columns(clear_missing)
df = df.drop(cs.starts_with("missing_"))

#   Make a fully collinear var for testing
c_repeat_1 = pl.col("unrelated_1").alias("repeat_1")
df = df.with_columns(c_repeat_1)


summary(df)


#   Actually do the imputation

var_reg1 is binary and conditional on other variables

var_reg2 is != 0 only if var_reg1 == True

┌─────────────┬────────┬─────────────┬────────────┬─────────────┬────────────┬───────────┐
│    Variable ┆      n ┆ n (missing) ┆       mean ┆         std ┆        min ┆       max │
╞═════════════╪════════╪═════════════╪════════════╪═════════════╪════════════╪═══════════╡
│ _row_index_ ┆ 10,000 ┆           0 ┆    4,999.5 ┆ 2,886.89568 ┆        0.0 ┆   9,999.0 │
│       index ┆ 10,000 ┆           0 ┆    4,999.5 ┆ 2,886.89568 ┆        0.0 ┆   9,999.0 │
│        year ┆ 10,000 ┆           0 ┆ 2,017.9851 ┆    1.415937 ┆    2,016.0 ┆   2,020.0 │
│       month ┆ 10,000 ┆           0 ┆     6.5137 ┆    3.432141 ┆        1.0 ┆      12.0 │
│        var2 ┆ 10,000 ┆           0 ┆     4.9782 ┆    3.154508 ┆        0.0 ┆      10.0 │
│        var3 ┆ 10,000 ┆           0 ┆    25.1084 ┆   14.752302 ┆        0.0 ┆      50.0 │
│        var4 ┆ 10,000 ┆           0 ┆   0.505666 ┆    0.287861 ┆   0.000027 ┆  0.999997 │
│        var5 ┆ 10,000 ┆           0 ┆     0.4999 ┆    0.500025 ┆        0.0 ┆       1.0 │
│ unrelated_1 ┆ 10,000 ┆           0 ┆   0.502449 ┆    0.288359 ┆   0.000119 ┆  0.999997 │
│ unrelated_2 ┆ 10,000 ┆           0 ┆   0.500105 ┆    0.287638 ┆   0.000049 ┆  0.999539 │
│ unrelated_3 ┆ 10,000 ┆           0 ┆   0.499175 ┆     0.28876 ┆   0.000129 ┆   0.99994 │
│ unrelated_4 ┆ 10,000 ┆           0 ┆   0.500655 ┆    0.288698 ┆   0.000133 ┆  0.999972 │
│ unrelated_5 ┆ 10,000 ┆           0 ┆    0.49876 ┆    0.288979 ┆   0.000071 ┆  0.999867 │
│    var_reg1 ┆ 10,000 ┆       2,569 ┆   0.523079 ┆    0.499501 ┆        0.0 ┆       1.0 │
│    var_reg2 ┆ 10,000 ┆       2,476 ┆  -2.353019 ┆   10.232008 ┆ -55.354108 ┆ 26.213084 │
│    repeat_1 ┆ 10,000 ┆           0 ┆   0.502449 ┆    0.288359 ┆   0.000119 ┆  0.999997 │
└─────────────┴────────┴─────────────┴────────────┴─────────────┴────────────┴───────────┘

logger.info("Define the regression model (intentionally include some extraneous variables")

#   Names in {braces} are wildcard patterns; the logged formula shows them
#       replaced by the matching column names
formula_template = "~1+{var_*}+var2+var4+var4*var3*C(var5)+{unrelated_*}+{repeat_*}"
f_model = FormulaBuilder(df=df)
f_model.formula_with_varnames_in_brackets(formula_template)
logger.info(f_model.formula)

Define the regression model (intentionally include some extraneous variables

~1+var_reg1+var_reg2+var2+var4+var4*var3*C(var5)+unrelated_1+unrelated_2+unrelated_3+unrelated_4+unrelated_5+repeat_1

# Set up the variable to be imputed
vars_impute = []

for message in (
    "Impute the boolean variable (var_reg1)",
    "   to the default setup for predicted mean matching",
    "   using logit regression",
):
    logger.info(message)

#   Regression settings for the binary variable: a logit model
logit_parameters = Parameters.Regression(model=Parameters.RegressionModel.Logit)
v_reg1 = Variable(
    impute_var="var_reg1",
    modeltype=Variable.ModelType.pmm,
    model=f_model.formula,
    parameters=logit_parameters,
)
logger.info("Add the variable to the list to be imputed")
vars_impute.append(v_reg1)

for message in (
    "Impute the continuous variable (var_reg2) ",
    "   conditional on var_reg1, using narwhals (nw.col('var_reg1'))",
    "   by setting the model type",
    "   and the formula",
    "   as well as a post-processing edit to set var_reg2=0 when var_reg1==0",
):
    logger.info(message)

#   Post-processing edit: keep var_reg2 where var_reg1 holds, otherwise force it to 0
zero_unless_reg1 = (
    nw.when(nw.col("var_reg1"))
    .then(nw.col("var_reg2"))
    .otherwise(nw.lit(0))
    .alias("var_reg2")
)
v_reg2 = Variable(
    impute_var="var_reg2",
    Where=nw.col("var_reg1"),
    modeltype=Variable.ModelType.pmm,
    model=f_model.formula,
    #   Default parameters
    parameters=Parameters.Regression(),
    postFunctions=zero_unless_reg1,
)

vars_impute.append(v_reg2)

Impute the boolean variable (var_reg1)

   to the default setup for predicted mean matching

   using logit regression

Add the variable to the list to be imputed

Impute the continuous variable (var_reg2)

   conditional on var_reg1, using narwhals (nw.col('var_reg1'))

   by setting the model type

   and the formula

   as well as a post-processing edit to set var_reg2=0 when var_reg1==0

logger.info("Set up the imputation")
logger.info("Add LASSO selection before each imputation")

#   Variable selection method handed to the SRMI run
lasso_selection = Selection(method=Selection.Method.LASSO)
#   Location for the run's files, under the scratch directory
srmi_path = f"{path_scratch()}/py_srmi_test_regression"

srmi = SRMI(
    df=df,
    variables=vars_impute,
    n_implicates=2,
    n_iterations=2,
    parallel=False,
    selection=lasso_selection,
    modeltype=Variable.ModelType.pmm,
    model=f_model.formula,
    bayesian_bootstrap=True,
    path_model=srmi_path,
    force_start=True,
)

Set up the imputation

Add LASSO selection before each imputation

Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/py_srmi_test_regression.srmi

Dropping var_reg1 from formula

Dropping var_reg2 from formula

#   Execute the imputation on the srmi object
logger.info("Run it")
srmi.run()

Run it

Variable selection before SRMI run, if necessary

     var_reg1: Method.No

     var_reg2: Method.No

Hyperparameter tuning before SRMI run, if necessary

Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/py_srmi_test_regression.srmi/1.srmi.implicate

Removing existing directory C:\Users\jonro\OneDrive\Documents\Coding\survey_kit\.scratch\temp_files/py_srmi_test_regression.srmi/2.srmi.implicate

Dropping var_reg2 from formula

     Running variable selection: Method.LASSO

         Selected model: ~0+var2+C(var5)+repeat_1+unrelated_5+unrelated_3+unrelated_1+var3:C(var5)+var4:C(var5)+var4:var3:C(var5)+var3+var4+var4+var3

logger.info("Get the results")
#   df_list is the handle used for the imputed data from here on;
#       the extra "_ =" presumably just keeps a notebook from echoing the value - TODO confirm
_ = df_list = srmi.df_implicates

Get the results

#   Complete data from before any values were blanked out
logger.info("\n\nLook at the original")
_ = summary(df_original)

#   Imputed data, unfiltered
logger.info("\n\nLook at the imputes")
_ = df_list.pipe(summary)

#   Imputed data restricted to rows where var_reg1 is False
logger.info("\n\nLook at the imputes | var_reg1 == 0")
reg1_false = df_list.filter(~nw.col("var_reg1"))
_ = reg1_false.pipe(summary)

#   Imputed data restricted to rows where var_reg1 is True
logger.info("\n\nLook at the imputes | var_reg1 == 1")
reg1_true = df_list.filter(nw.col("var_reg1"))
_ = reg1_true.pipe(summary)


Look at the original

┌──────────────┬────────┬─────────────┬────────────┬─────────────┬────────────┬───────────┐
│     Variable ┆      n ┆ n (missing) ┆       mean ┆         std ┆        min ┆       max │
╞══════════════╪════════╪═════════════╪════════════╪═════════════╪════════════╪═══════════╡
│  _row_index_ ┆ 10,000 ┆           0 ┆    4,999.5 ┆ 2,886.89568 ┆        0.0 ┆   9,999.0 │
│        index ┆ 10,000 ┆           0 ┆    4,999.5 ┆ 2,886.89568 ┆        0.0 ┆   9,999.0 │
│         year ┆ 10,000 ┆           0 ┆ 2,017.9851 ┆    1.415937 ┆    2,016.0 ┆   2,020.0 │
│        month ┆ 10,000 ┆           0 ┆     6.5137 ┆    3.432141 ┆        1.0 ┆      12.0 │
│         var2 ┆ 10,000 ┆           0 ┆     4.9782 ┆    3.154508 ┆        0.0 ┆      10.0 │
│         var3 ┆ 10,000 ┆           0 ┆    25.1084 ┆   14.752302 ┆        0.0 ┆      50.0 │
│         var4 ┆ 10,000 ┆           0 ┆   0.505666 ┆    0.287861 ┆   0.000027 ┆  0.999997 │
│         var5 ┆ 10,000 ┆           0 ┆     0.4999 ┆    0.500025 ┆        0.0 ┆       1.0 │
│  unrelated_1 ┆ 10,000 ┆           0 ┆   0.502449 ┆    0.288359 ┆   0.000119 ┆  0.999997 │
│  unrelated_2 ┆ 10,000 ┆           0 ┆   0.500105 ┆    0.287638 ┆   0.000049 ┆  0.999539 │
│  unrelated_3 ┆ 10,000 ┆           0 ┆   0.499175 ┆     0.28876 ┆   0.000129 ┆   0.99994 │
│  unrelated_4 ┆ 10,000 ┆           0 ┆   0.500655 ┆    0.288698 ┆   0.000133 ┆  0.999972 │
│  unrelated_5 ┆ 10,000 ┆           0 ┆    0.49876 ┆    0.288979 ┆   0.000071 ┆  0.999867 │
│ missing_reg1 ┆ 10,000 ┆           0 ┆   0.495784 ┆    0.289051 ┆   0.000161 ┆   0.99991 │
│ missing_reg2 ┆ 10,000 ┆           0 ┆   0.502597 ┆    0.288468 ┆   0.000006 ┆  0.999963 │
│     var_reg1 ┆ 10,000 ┆           0 ┆     0.5229 ┆      0.4995 ┆        0.0 ┆       1.0 │
│     var_reg2 ┆ 10,000 ┆           0 ┆  -2.402477 ┆   10.331745 ┆ -55.354108 ┆ 26.213084 │
└──────────────┴────────┴─────────────┴────────────┴─────────────┴────────────┴───────────┘


Look at the imputes

┌──────────────┬────────┬─────────────┬────────────┬─────────────┬────────────┬───────────┐
│     Variable ┆      n ┆ n (missing) ┆       mean ┆         std ┆        min ┆       max │
╞══════════════╪════════╪═════════════╪════════════╪═════════════╪════════════╪═══════════╡
│ ___rownumber ┆ 10,000 ┆           0 ┆    4,999.5 ┆ 2,886.89568 ┆        0.0 ┆   9,999.0 │
│  _row_index_ ┆ 10,000 ┆           0 ┆    4,999.5 ┆ 2,886.89568 ┆        0.0 ┆   9,999.0 │
│        index ┆ 10,000 ┆           0 ┆    4,999.5 ┆ 2,886.89568 ┆        0.0 ┆   9,999.0 │
│         year ┆ 10,000 ┆           0 ┆ 2,017.9851 ┆    1.415937 ┆    2,016.0 ┆   2,020.0 │
│        month ┆ 10,000 ┆           0 ┆     6.5137 ┆    3.432141 ┆        1.0 ┆      12.0 │
│         var2 ┆ 10,000 ┆           0 ┆     4.9782 ┆    3.154508 ┆        0.0 ┆      10.0 │
│         var3 ┆ 10,000 ┆           0 ┆    25.1084 ┆   14.752302 ┆        0.0 ┆      50.0 │
│         var4 ┆ 10,000 ┆           0 ┆   0.505666 ┆    0.287861 ┆   0.000027 ┆  0.999997 │
│         var5 ┆ 10,000 ┆           0 ┆     0.4999 ┆    0.500025 ┆        0.0 ┆       1.0 │
│  unrelated_1 ┆ 10,000 ┆           0 ┆   0.502449 ┆    0.288359 ┆   0.000119 ┆  0.999997 │
│  unrelated_2 ┆ 10,000 ┆           0 ┆   0.500105 ┆    0.287638 ┆   0.000049 ┆  0.999539 │
│  unrelated_3 ┆ 10,000 ┆           0 ┆   0.499175 ┆     0.28876 ┆   0.000129 ┆   0.99994 │
│  unrelated_4 ┆ 10,000 ┆           0 ┆   0.500655 ┆    0.288698 ┆   0.000133 ┆  0.999972 │
│  unrelated_5 ┆ 10,000 ┆           0 ┆    0.49876 ┆    0.288979 ┆   0.000071 ┆  0.999867 │
│     repeat_1 ┆ 10,000 ┆           0 ┆   0.502449 ┆    0.288359 ┆   0.000119 ┆  0.999997 │
│     var_reg1 ┆ 10,000 ┆           0 ┆     0.5216 ┆    0.499558 ┆        0.0 ┆       1.0 │
│     var_reg2 ┆ 10,000 ┆           0 ┆  -2.180802 ┆    9.844918 ┆ -55.354108 ┆ 26.213084 │
└──────────────┴────────┴─────────────┴────────────┴─────────────┴────────────┴───────────┘

┌──────────────┬────────┬─────────────┬────────────┬─────────────┬────────────┬───────────┐
│     Variable ┆      n ┆ n (missing) ┆       mean ┆         std ┆        min ┆       max │
╞══════════════╪════════╪═════════════╪════════════╪═════════════╪════════════╪═══════════╡
│ ___rownumber ┆ 10,000 ┆           0 ┆    4,999.5 ┆ 2,886.89568 ┆        0.0 ┆   9,999.0 │
│  _row_index_ ┆ 10,000 ┆           0 ┆    4,999.5 ┆ 2,886.89568 ┆        0.0 ┆   9,999.0 │
│        index ┆ 10,000 ┆           0 ┆    4,999.5 ┆ 2,886.89568 ┆        0.0 ┆   9,999.0 │
│         year ┆ 10,000 ┆           0 ┆ 2,017.9851 ┆    1.415937 ┆    2,016.0 ┆   2,020.0 │
│        month ┆ 10,000 ┆           0 ┆     6.5137 ┆    3.432141 ┆        1.0 ┆      12.0 │
│         var2 ┆ 10,000 ┆           0 ┆     4.9782 ┆    3.154508 ┆        0.0 ┆      10.0 │
│         var3 ┆ 10,000 ┆           0 ┆    25.1084 ┆   14.752302 ┆        0.0 ┆      50.0 │
│         var4 ┆ 10,000 ┆           0 ┆   0.505666 ┆    0.287861 ┆   0.000027 ┆  0.999997 │
│         var5 ┆ 10,000 ┆           0 ┆     0.4999 ┆    0.500025 ┆        0.0 ┆       1.0 │
│  unrelated_1 ┆ 10,000 ┆           0 ┆   0.502449 ┆    0.288359 ┆   0.000119 ┆  0.999997 │
│  unrelated_2 ┆ 10,000 ┆           0 ┆   0.500105 ┆    0.287638 ┆   0.000049 ┆  0.999539 │
│  unrelated_3 ┆ 10,000 ┆           0 ┆   0.499175 ┆     0.28876 ┆   0.000129 ┆   0.99994 │
│  unrelated_4 ┆ 10,000 ┆           0 ┆   0.500655 ┆    0.288698 ┆   0.000133 ┆  0.999972 │
│  unrelated_5 ┆ 10,000 ┆           0 ┆    0.49876 ┆    0.288979 ┆   0.000071 ┆  0.999867 │
│     repeat_1 ┆ 10,000 ┆           0 ┆   0.502449 ┆    0.288359 ┆   0.000119 ┆  0.999997 │
│     var_reg1 ┆ 10,000 ┆           0 ┆     0.5213 ┆    0.499571 ┆        0.0 ┆       1.0 │
│     var_reg2 ┆ 10,000 ┆           0 ┆   -2.23131 ┆    9.939851 ┆ -55.354108 ┆ 26.213084 │
└──────────────┴────────┴─────────────┴────────────┴─────────────┴────────────┴───────────┘


Look at the imputes | var_reg1 == 0

┌──────────────┬───────┬─────────────┬──────────────┬──────────────┬──────────┬──────────┐
│     Variable ┆     n ┆ n (missing) ┆         mean ┆          std ┆      min ┆      max │
╞══════════════╪═══════╪═════════════╪══════════════╪══════════════╪══════════╪══════════╡
│ ___rownumber ┆ 4,784 ┆           0 ┆ 5,033.632734 ┆ 2,891.610847 ┆      1.0 ┆  9,997.0 │
│  _row_index_ ┆ 4,784 ┆           0 ┆ 5,033.632734 ┆ 2,891.610847 ┆      1.0 ┆  9,997.0 │
│        index ┆ 4,784 ┆           0 ┆ 5,033.632734 ┆ 2,891.610847 ┆      1.0 ┆  9,997.0 │
│         year ┆ 4,784 ┆           0 ┆  2,017.99561 ┆     1.420329 ┆  2,016.0 ┆  2,020.0 │
│        month ┆ 4,784 ┆           0 ┆     6.572742 ┆     3.407715 ┆      1.0 ┆     12.0 │
│         var2 ┆ 4,784 ┆           0 ┆     4.628135 ┆     3.233716 ┆      0.0 ┆     10.0 │
│         var3 ┆ 4,784 ┆           0 ┆    24.966137 ┆    13.395857 ┆      0.0 ┆     50.0 │
│         var4 ┆ 4,784 ┆           0 ┆     0.507489 ┆     0.289287 ┆ 0.000027 ┆ 0.999997 │
│         var5 ┆ 4,784 ┆           0 ┆     0.886288 ┆     0.317495 ┆      0.0 ┆      1.0 │
│  unrelated_1 ┆ 4,784 ┆           0 ┆      0.50384 ┆     0.286077 ┆ 0.000119 ┆ 0.999921 │
│  unrelated_2 ┆ 4,784 ┆           0 ┆     0.499727 ┆     0.285433 ┆ 0.000079 ┆ 0.999528 │
│  unrelated_3 ┆ 4,784 ┆           0 ┆     0.501752 ┆     0.289139 ┆ 0.000139 ┆  0.99994 │
│  unrelated_4 ┆ 4,784 ┆           0 ┆      0.50239 ┆     0.289704 ┆ 0.000141 ┆ 0.999961 │
│  unrelated_5 ┆ 4,784 ┆           0 ┆     0.501731 ┆     0.290909 ┆ 0.000071 ┆  0.99984 │
│     repeat_1 ┆ 4,784 ┆           0 ┆      0.50384 ┆     0.286077 ┆ 0.000119 ┆ 0.999921 │
│     var_reg1 ┆ 4,784 ┆           0 ┆          0.0 ┆          0.0 ┆      0.0 ┆      0.0 │
│     var_reg2 ┆ 4,784 ┆           0 ┆          0.0 ┆          0.0 ┆      0.0 ┆      0.0 │
└──────────────┴───────┴─────────────┴──────────────┴──────────────┴──────────┴──────────┘

┌──────────────┬───────┬─────────────┬──────────────┬──────────────┬──────────┬──────────┐
│     Variable ┆     n ┆ n (missing) ┆         mean ┆          std ┆      min ┆      max │
╞══════════════╪═══════╪═════════════╪══════════════╪══════════════╪══════════╪══════════╡
│ ___rownumber ┆ 4,787 ┆           0 ┆ 5,029.025695 ┆ 2,905.488834 ┆      1.0 ┆  9,997.0 │
│  _row_index_ ┆ 4,787 ┆           0 ┆ 5,029.025695 ┆ 2,905.488834 ┆      1.0 ┆  9,997.0 │
│        index ┆ 4,787 ┆           0 ┆ 5,029.025695 ┆ 2,905.488834 ┆      1.0 ┆  9,997.0 │
│         year ┆ 4,787 ┆           0 ┆  2,017.99248 ┆     1.421562 ┆  2,016.0 ┆  2,020.0 │
│        month ┆ 4,787 ┆           0 ┆     6.585126 ┆     3.414898 ┆      1.0 ┆     12.0 │
│         var2 ┆ 4,787 ┆           0 ┆     4.609359 ┆     3.225133 ┆      0.0 ┆     10.0 │
│         var3 ┆ 4,787 ┆           0 ┆    24.976603 ┆    13.404733 ┆      0.0 ┆     50.0 │
│         var4 ┆ 4,787 ┆           0 ┆     0.508052 ┆     0.288739 ┆ 0.000027 ┆ 0.999997 │
│         var5 ┆ 4,787 ┆           0 ┆     0.885941 ┆     0.317916 ┆      0.0 ┆      1.0 │
│  unrelated_1 ┆ 4,787 ┆           0 ┆     0.503982 ┆     0.285823 ┆ 0.000119 ┆ 0.999921 │
│  unrelated_2 ┆ 4,787 ┆           0 ┆     0.499459 ┆     0.285463 ┆ 0.000079 ┆ 0.999528 │
│  unrelated_3 ┆ 4,787 ┆           0 ┆     0.501814 ┆     0.289014 ┆ 0.000139 ┆  0.99994 │
│  unrelated_4 ┆ 4,787 ┆           0 ┆     0.500316 ┆      0.28914 ┆ 0.000141 ┆ 0.999961 │
│  unrelated_5 ┆ 4,787 ┆           0 ┆     0.501014 ┆      0.29115 ┆ 0.000071 ┆  0.99984 │
│     repeat_1 ┆ 4,787 ┆           0 ┆     0.503982 ┆     0.285823 ┆ 0.000119 ┆ 0.999921 │
│     var_reg1 ┆ 4,787 ┆           0 ┆          0.0 ┆          0.0 ┆      0.0 ┆      0.0 │
│     var_reg2 ┆ 4,787 ┆           0 ┆          0.0 ┆          0.0 ┆      0.0 ┆      0.0 │
└──────────────┴───────┴─────────────┴──────────────┴──────────────┴──────────┴──────────┘


Look at the imputes | var_reg1 == 1

┌──────────────┬───────┬─────────────┬─────────────┬──────────────┬────────────┬───────────┐
│     Variable ┆     n ┆ n (missing) ┆        mean ┆          std ┆        min ┆       max │
╞══════════════╪═══════╪═════════════╪═════════════╪══════════════╪════════════╪═══════════╡
│ ___rownumber ┆ 5,216 ┆           0 ┆ 4,968.19421 ┆ 2,882.486117 ┆        0.0 ┆   9,999.0 │
│  _row_index_ ┆ 5,216 ┆           0 ┆ 4,968.19421 ┆ 2,882.486117 ┆        0.0 ┆   9,999.0 │
│        index ┆ 5,216 ┆           0 ┆ 4,968.19421 ┆ 2,882.486117 ┆        0.0 ┆   9,999.0 │
│         year ┆ 5,216 ┆           0 ┆ 2,017.97546 ┆     1.411965 ┆    2,016.0 ┆   2,020.0 │
│        month ┆ 5,216 ┆           0 ┆    6.459548 ┆     3.453831 ┆        1.0 ┆      12.0 │
│         var2 ┆ 5,216 ┆           0 ┆    5.299271 ┆     3.045196 ┆        0.0 ┆      10.0 │
│         var3 ┆ 5,216 ┆           0 ┆    25.23888 ┆    15.895128 ┆        0.0 ┆      50.0 │
│         var4 ┆ 5,216 ┆           0 ┆    0.503993 ┆     0.286564 ┆   0.000104 ┆  0.999885 │
│         var5 ┆ 5,216 ┆           0 ┆    0.145514 ┆     0.352652 ┆        0.0 ┆       1.0 │
│  unrelated_1 ┆ 5,216 ┆           0 ┆    0.501172 ┆     0.290459 ┆   0.000248 ┆  0.999997 │
│  unrelated_2 ┆ 5,216 ┆           0 ┆    0.500452 ┆     0.289673 ┆   0.000049 ┆  0.999539 │
│  unrelated_3 ┆ 5,216 ┆           0 ┆    0.496812 ┆     0.288419 ┆   0.000129 ┆  0.999622 │
│  unrelated_4 ┆ 5,216 ┆           0 ┆    0.499064 ┆     0.287791 ┆   0.000133 ┆  0.999972 │
│  unrelated_5 ┆ 5,216 ┆           0 ┆    0.496036 ┆     0.287198 ┆   0.000181 ┆  0.999867 │
│     repeat_1 ┆ 5,216 ┆           0 ┆    0.501172 ┆     0.290459 ┆   0.000248 ┆  0.999997 │
│     var_reg1 ┆ 5,216 ┆           0 ┆         1.0 ┆          0.0 ┆        1.0 ┆       1.0 │
│     var_reg2 ┆ 5,216 ┆           0 ┆   -4.180985 ┆    13.321794 ┆ -55.354108 ┆ 26.213084 │
└──────────────┴───────┴─────────────┴─────────────┴──────────────┴────────────┴───────────┘