Regression on p_outlier
¶
This small tutorial illustrates how we can define a regression directly on the p_outlier
parameter.
The way we define the regression follows the basic pattern we establish in the include
argument.
Load Modules¶
In [ ]:
Copied!
import hssm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import hssm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
Simulate some data¶
In [ ]:
Copied!
# synth data
n_participants = 10
conditions = ['switch', 'noswitch','someswitch']
n_samples_per_condition = 250
v_by_participant = np.random.normal(loc=0.0, scale=0.5, size=n_participants)
v_displacement_by_condition = {'switch': 1.0, 'noswitch': 0., 'someswitch': -1.0}
a_true = 1.0
z_true = 0.5
t_true = 0.5
dfs = []
for participant in range(n_participants):
for condition in conditions:
tmp_df = hssm.simulate_data(model = 'ddm', theta = dict(v = v_by_participant[participant] + v_displacement_by_condition[condition],
a = a_true,
z = z_true,
t = t_true), size = n_samples_per_condition)
tmp_df['true_v'] = v_by_participant[participant] + v_displacement_by_condition[condition]
tmp_df['true_a'] = a_true
tmp_df['true_z'] = z_true
tmp_df['true_t'] = t_true
tmp_df['participant_id'] = str(participant)
tmp_df['trialtype'] = condition
dfs.append(tmp_df)
data_df = pd.concat(dfs).reset_index(drop=True)
# synth data
n_participants = 10
conditions = ['switch', 'noswitch','someswitch']
n_samples_per_condition = 250
v_by_participant = np.random.normal(loc=0.0, scale=0.5, size=n_participants)
v_displacement_by_condition = {'switch': 1.0, 'noswitch': 0., 'someswitch': -1.0}
a_true = 1.0
z_true = 0.5
t_true = 0.5
dfs = []
for participant in range(n_participants):
for condition in conditions:
tmp_df = hssm.simulate_data(model = 'ddm', theta = dict(v = v_by_participant[participant] + v_displacement_by_condition[condition],
a = a_true,
z = z_true,
t = t_true), size = n_samples_per_condition)
tmp_df['true_v'] = v_by_participant[participant] + v_displacement_by_condition[condition]
tmp_df['true_a'] = a_true
tmp_df['true_z'] = z_true
tmp_df['true_t'] = t_true
tmp_df['participant_id'] = str(participant)
tmp_df['trialtype'] = condition
dfs.append(tmp_df)
data_df = pd.concat(dfs).reset_index(drop=True)
Inject Uniform noise into rts
to simulate outliers¶
In [ ]:
Copied!
# Inject noise
p_outlier_noise = np.random.uniform(0, 0.15, size = n_participants)
for i in range(n_participants):
# Get indices of trials to inject noise
p_outlier_indices = np.random.choice(data_df[data_df['participant_id'] == str(i)].index,
size = int(p_outlier_noise[i] * \
len(data_df[data_df['participant_id'] == str(i)])),
replace = False)
# Inject noise
data_df.loc[p_outlier_indices, "rt"] = np.random.uniform(lower = 0., upper = 20.,
size = data_df.loc[p_outlier_indices, "rt"].shape)
# Inject noise
p_outlier_noise = np.random.uniform(0, 0.15, size = n_participants)
for i in range(n_participants):
# Get indices of trials to inject noise
p_outlier_indices = np.random.choice(data_df[data_df['participant_id'] == str(i)].index,
size = int(p_outlier_noise[i] * \
len(data_df[data_df['participant_id'] == str(i)])),
replace = False)
# Inject noise
data_df.loc[p_outlier_indices, "rt"] = np.random.uniform(lower = 0., upper = 20.,
size = data_df.loc[p_outlier_indices, "rt"].shape)
Define and sample from HSSM model¶
In [ ]:
Copied!
test_model_1_C = hssm.HSSM(data_df,
model = "ddm",
loglik_kind = "approx_differentiable",
include = [{"name": "v", "formula": "v ~ 0 + (1 + C(trialtype) |participant_id)"}],
p_outlier = {"formula": "p_outlier ~ 1 + (1|C(participant_id))",
"prior": {"Intercept": {"name": "Normal", "mu": 0, "sigma": 1}},
"link": "logit"
},
)
test_model_1_C = hssm.HSSM(data_df,
model = "ddm",
loglik_kind = "approx_differentiable",
include = [{"name": "v", "formula": "v ~ 0 + (1 + C(trialtype) |participant_id)"}],
p_outlier = {"formula": "p_outlier ~ 1 + (1|C(participant_id))",
"prior": {"Intercept": {"name": "Normal", "mu": 0, "sigma": 1}},
"link": "logit"
},
)
No common intercept. Bounds for parameter v is not applied due to a current limitation of Bambi. This will change in the future. No common intercept. Bounds for parameter p_outlier is not applied due to a current limitation of Bambi. This will change in the future. Model initialized successfully.
In [ ]:
Copied!
test_model_1_C.sample(
sampler="nuts_numpyro", # type of sampler to choose, 'nuts_numpyro', 'nuts_blackjax' of default pymc nuts sampler
cores=2, # how many cores to use
chains=2, # how many chains to run
draws=500, # number of draws from the markov chain
tune=500, # number of burn-in samples
idata_kwargs=dict(log_likelihood=True), # return log likelihood
)
test_model_1_C.sample(
sampler="nuts_numpyro", # type of sampler to choose, 'nuts_numpyro', 'nuts_blackjax' of default pymc nuts sampler
cores=2, # how many cores to use
chains=2, # how many chains to run
draws=500, # number of draws from the markov chain
tune=500, # number of burn-in samples
idata_kwargs=dict(log_likelihood=True), # return log likelihood
)
Using default initvals.
0%| | 0/1000 [00:00<?, ?it/s]
0%| | 0/1000 [00:00<?, ?it/s]
There were 550 divergences after tuning. Increase `target_accept` or reparameterize. We recommend running at least 4 chains for robust computation of convergence diagnostics The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details The effective sample size per chain is smaller than 100 for some parameters. A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details 100%|██████████| 1000/1000 [00:02<00:00, 352.68it/s]
Out[ ]:
arviz.InferenceData
-
<xarray.Dataset> Size: 732kB Dimensions: (chain: 2, draw: 500, participant_id__factor_dim: 10, v_C(trialtype)|participant_id__expr_dim: 2, v_1|participant_id__factor_dim: 10, v_C(trialtype)|participant_id__factor_dim: 10) Coordinates: * chain (chain) int64 16B 0 1 * draw (draw) int64 4kB 0 1 ... 498 499 * participant_id__factor_dim (participant_id__factor_dim) <U1 40B ... * v_1|participant_id__factor_dim (v_1|participant_id__factor_dim) <U1 40B ... * v_C(trialtype)|participant_id__factor_dim (v_C(trialtype)|participant_id__factor_dim) <U1 40B ... * v_C(trialtype)|participant_id__expr_dim (v_C(trialtype)|participant_id__expr_dim) <U10 80B ... Data variables: (12/15) p_outlier_1|participant_id_offset (chain, draw, participant_id__factor_dim) float64 80kB ... v_C(trialtype)|participant_id_mu (chain, draw, v_C(trialtype)|participant_id__expr_dim) float64 16kB ... p_outlier_1|participant_id_sigma (chain, draw) float64 8kB 0.03... v_1|participant_id (chain, draw, v_1|participant_id__factor_dim) float64 80kB ... v_1|participant_id_sigma (chain, draw) float64 8kB 0.34... v_C(trialtype)|participant_id (chain, draw, v_C(trialtype)|participant_id__expr_dim, v_C(trialtype)|participant_id__factor_dim) float64 160kB ... ... ... a (chain, draw) float64 8kB 1.00... t (chain, draw) float64 8kB 0.50... v_1|participant_id_offset (chain, draw, v_1|participant_id__factor_dim) float64 80kB ... p_outlier_1|participant_id (chain, draw, participant_id__factor_dim) float64 80kB ... v_C(trialtype)|participant_id_offset (chain, draw, v_C(trialtype)|participant_id__expr_dim, v_C(trialtype)|participant_id__factor_dim) float64 160kB ... v_1|participant_id_mu (chain, draw) float64 8kB 7.58... Attributes: created_at: 2025-03-04T23:05:02.491148+00:00 arviz_version: 0.19.0 inference_library: numpyro inference_library_version: 0.16.1 sampling_time: 801.139236 tuning_steps: 500 modeling_interface: bambi modeling_interface_version: 0.15.0
-
<xarray.Dataset> Size: 60MB Dimensions: (chain: 2, draw: 500, __obs__: 7500) Coordinates: * chain (chain) int64 16B 0 1 * draw (draw) int64 4kB 0 1 2 3 4 5 6 ... 493 494 495 496 497 498 499 * __obs__ (__obs__) int64 60kB 0 1 2 3 4 5 ... 7495 7496 7497 7498 7499 Data variables: rt,response (chain, draw, __obs__) float64 60MB -4.925 -0.3124 ... -1.842 Attributes: modeling_interface: bambi modeling_interface_version: 0.15.0
-
<xarray.Dataset> Size: 53kB Dimensions: (chain: 2, draw: 500) Coordinates: * chain (chain) int64 16B 0 1 * draw (draw) int64 4kB 0 1 2 3 4 5 6 ... 494 495 496 497 498 499 Data variables: acceptance_rate (chain, draw) float64 8kB 0.9826 0.06897 ... 0.9448 0.8806 diverging (chain, draw) bool 1kB True True True ... False True False energy (chain, draw) float64 8kB 9.937e+03 9.936e+03 ... 9.946e+03 lp (chain, draw) float64 8kB 9.912e+03 9.912e+03 ... 9.909e+03 n_steps (chain, draw) int64 8kB 58 233 135 19 17 ... 74 511 21 127 step_size (chain, draw) float64 8kB 0.03672 0.03672 ... 0.02792 tree_depth (chain, draw) int64 8kB 6 8 8 5 5 4 7 8 ... 6 7 5 8 7 9 5 7 Attributes: created_at: 2025-03-04T23:05:02.538727+00:00 arviz_version: 0.19.0 modeling_interface: bambi modeling_interface_version: 0.15.0
-
<xarray.Dataset> Size: 180kB Dimensions: (__obs__: 7500, rt,response_extra_dim_0: 2) Coordinates: * __obs__ (__obs__) int64 60kB 0 1 2 3 ... 7497 7498 7499 * rt,response_extra_dim_0 (rt,response_extra_dim_0) int64 16B 0 1 Data variables: rt,response (__obs__, rt,response_extra_dim_0) float64 120kB ... Attributes: created_at: 2025-03-04T23:05:02.539979+00:00 arviz_version: 0.19.0 inference_library: numpyro inference_library_version: 0.16.1 sampling_time: 801.139236 tuning_steps: 500 modeling_interface: bambi modeling_interface_version: 0.15.0
In [ ]:
Copied!
# Add tiral-wise parameters to idata (since we define p_outlier as a regression)
test_model_1_C.add_likelihood_parameters_to_idata(inplace=True)
# Add tiral-wise parameters to idata (since we define p_outlier as a regression)
test_model_1_C.add_likelihood_parameters_to_idata(inplace=True)
Out[ ]:
arviz.InferenceData
-
<xarray.Dataset> Size: 121MB Dimensions: (chain: 2, draw: 500, participant_id__factor_dim: 10, v_C(trialtype)|participant_id__expr_dim: 2, v_1|participant_id__factor_dim: 10, v_C(trialtype)|participant_id__factor_dim: 10, __obs__: 7500) Coordinates: * chain (chain) int64 16B 0 1 * draw (draw) int64 4kB 0 1 ... 498 499 * participant_id__factor_dim (participant_id__factor_dim) <U1 40B ... * v_1|participant_id__factor_dim (v_1|participant_id__factor_dim) <U1 40B ... * v_C(trialtype)|participant_id__factor_dim (v_C(trialtype)|participant_id__factor_dim) <U1 40B ... * v_C(trialtype)|participant_id__expr_dim (v_C(trialtype)|participant_id__expr_dim) <U10 80B ... * __obs__ (__obs__) int64 60kB 0 1 ... 7499 Data variables: (12/17) p_outlier_1|participant_id_offset (chain, draw, participant_id__factor_dim) float64 80kB ... v_C(trialtype)|participant_id_mu (chain, draw, v_C(trialtype)|participant_id__expr_dim) float64 16kB ... p_outlier_1|participant_id_sigma (chain, draw) float64 8kB 0.03... v_1|participant_id (chain, draw, v_1|participant_id__factor_dim) float64 80kB ... v_1|participant_id_sigma (chain, draw) float64 8kB 0.34... v_C(trialtype)|participant_id (chain, draw, v_C(trialtype)|participant_id__expr_dim, v_C(trialtype)|participant_id__factor_dim) float64 160kB ... ... ... v_1|participant_id_offset (chain, draw, v_1|participant_id__factor_dim) float64 80kB ... p_outlier_1|participant_id (chain, draw, participant_id__factor_dim) float64 80kB ... v_C(trialtype)|participant_id_offset (chain, draw, v_C(trialtype)|participant_id__expr_dim, v_C(trialtype)|participant_id__factor_dim) float64 160kB ... v_1|participant_id_mu (chain, draw) float64 8kB 7.58... v (chain, draw, __obs__) float64 60MB ... p_outlier (chain, draw, __obs__) float64 60MB ... Attributes: created_at: 2025-03-04T23:05:02.491148+00:00 arviz_version: 0.19.0 inference_library: numpyro inference_library_version: 0.16.1 sampling_time: 801.139236 tuning_steps: 500 modeling_interface: bambi modeling_interface_version: 0.15.0
-
<xarray.Dataset> Size: 60MB Dimensions: (chain: 2, draw: 500, __obs__: 7500) Coordinates: * chain (chain) int64 16B 0 1 * draw (draw) int64 4kB 0 1 2 3 4 5 6 ... 493 494 495 496 497 498 499 * __obs__ (__obs__) int64 60kB 0 1 2 3 4 5 ... 7495 7496 7497 7498 7499 Data variables: rt,response (chain, draw, __obs__) float64 60MB -4.925 -0.3124 ... -1.842 Attributes: modeling_interface: bambi modeling_interface_version: 0.15.0
-
<xarray.Dataset> Size: 53kB Dimensions: (chain: 2, draw: 500) Coordinates: * chain (chain) int64 16B 0 1 * draw (draw) int64 4kB 0 1 2 3 4 5 6 ... 494 495 496 497 498 499 Data variables: acceptance_rate (chain, draw) float64 8kB 0.9826 0.06897 ... 0.9448 0.8806 diverging (chain, draw) bool 1kB True True True ... False True False energy (chain, draw) float64 8kB 9.937e+03 9.936e+03 ... 9.946e+03 lp (chain, draw) float64 8kB 9.912e+03 9.912e+03 ... 9.909e+03 n_steps (chain, draw) int64 8kB 58 233 135 19 17 ... 74 511 21 127 step_size (chain, draw) float64 8kB 0.03672 0.03672 ... 0.02792 tree_depth (chain, draw) int64 8kB 6 8 8 5 5 4 7 8 ... 6 7 5 8 7 9 5 7 Attributes: created_at: 2025-03-04T23:05:02.538727+00:00 arviz_version: 0.19.0 modeling_interface: bambi modeling_interface_version: 0.15.0
-
<xarray.Dataset> Size: 180kB Dimensions: (__obs__: 7500, rt,response_extra_dim_0: 2) Coordinates: * __obs__ (__obs__) int64 60kB 0 1 2 3 ... 7497 7498 7499 * rt,response_extra_dim_0 (rt,response_extra_dim_0) int64 16B 0 1 Data variables: rt,response (__obs__, rt,response_extra_dim_0) float64 120kB ... Attributes: created_at: 2025-03-04T23:05:02.539979+00:00 arviz_version: 0.19.0 inference_library: numpyro inference_library_version: 0.16.1 sampling_time: 801.139236 tuning_steps: 500 modeling_interface: bambi modeling_interface_version: 0.15.0
Plot Results¶
In [ ]:
Copied!
# Define output dataframe for plotting
data_df['p_outlier'] = test_model_1_C.traces.posterior.p_outlier.mean(dim = ['chain', 'draw'])
data_df[['hdi_higher', 'hdi_lower']] = az.hdi(test_model_1_C.traces.posterior.p_outlier, hdi_prob = 0.95).to_dataframe()\
.reset_index().pivot(index = '__obs__', columns = 'hdi', values = 'p_outlier')
# Define output dataframe for plotting
data_df['p_outlier'] = test_model_1_C.traces.posterior.p_outlier.mean(dim = ['chain', 'draw'])
data_df[['hdi_higher', 'hdi_lower']] = az.hdi(test_model_1_C.traces.posterior.p_outlier, hdi_prob = 0.95).to_dataframe()\
.reset_index().pivot(index = '__obs__', columns = 'hdi', values = 'p_outlier')
In [ ]:
Copied!
grouped_df = data_df.groupby('participant_id')[['p_outlier', 'hdi_higher', 'hdi_lower']].mean()
grouped_df['p_outlier_gt'] = p_outlier_noise
grouped_df = data_df.groupby('participant_id')[['p_outlier', 'hdi_higher', 'hdi_lower']].mean()
grouped_df['p_outlier_gt'] = p_outlier_noise
In [ ]:
Copied!
grouped_df
grouped_df
Out[ ]:
p_outlier | hdi_higher | hdi_lower | p_outlier_gt | |
---|---|---|---|---|
participant_id | ||||
0 | 0.129354 | 0.157369 | 0.099355 | 0.121861 |
1 | 0.027036 | 0.041060 | 0.014176 | 0.022582 |
2 | 0.030726 | 0.044374 | 0.015506 | 0.019633 |
3 | 0.031000 | 0.044278 | 0.015119 | 0.022528 |
4 | 0.058709 | 0.079912 | 0.038487 | 0.040094 |
5 | 0.020425 | 0.034789 | 0.008272 | 0.012318 |
6 | 0.081973 | 0.105259 | 0.060237 | 0.071441 |
7 | 0.058888 | 0.078537 | 0.037285 | 0.047548 |
8 | 0.071373 | 0.094667 | 0.050075 | 0.049949 |
9 | 0.002771 | 0.010616 | -0.001944 | 0.001799 |
In [ ]:
Copied!
# Create forest plot
plt.figure(figsize=(10, 6))
# Plot HDI bands for each participant
for idx, row in grouped_df.iterrows():
plt.plot([row['hdi_lower'], row['hdi_higher']], [idx, idx], 'b-', linewidth=2)
plt.plot(row['p_outlier'], idx, 'b|') # Plot mean point
# Plot ground truth values as red crosses
plt.plot(grouped_df['p_outlier_gt'], grouped_df.index, 'rx', label='Ground Truth', markersize=10)
plt.xlabel('p_outlier)')
plt.ylabel('participant_id')
plt.title('Forest Plot of p_outlier Estimates with 95% HDI')
plt.legend()
plt.grid(True, alpha=0.3)
# Create forest plot
plt.figure(figsize=(10, 6))
# Plot HDI bands for each participant
for idx, row in grouped_df.iterrows():
plt.plot([row['hdi_lower'], row['hdi_higher']], [idx, idx], 'b-', linewidth=2)
plt.plot(row['p_outlier'], idx, 'b|') # Plot mean point
# Plot ground truth values as red crosses
plt.plot(grouped_df['p_outlier_gt'], grouped_df.index, 'rx', label='Ground Truth', markersize=10)
plt.xlabel('p_outlier)')
plt.ylabel('participant_id')
plt.title('Forest Plot of p_outlier Estimates with 95% HDI')
plt.legend()
plt.grid(True, alpha=0.3)