static const __m256 bv = [](void) -> __m256
{
float temp[]{ b, b, b, b, b, b, b, b };
return _mm256_load_ps(temp);
}();

__m256 thetadt = [&dt](void) -> __m256
{
float temp[]{ dt / theta, dt / theta, dt / theta, dt / theta, dt / theta, dt / theta, dt / theta, dt / theta };
return _mm256_load_ps(temp);
}();

for (int i = 0; i < count / 8; ++i)
{
// vs += (vs - (vs * vs * vs) / 3.f - ws - iext - ins) * exprdtv
__m256 nv = _mm256_mul_ps(vs[i], _mm256_mul_ps(vs[i], vs[i]));
nv = _mm256_sub_ps(vs[i], _mm256_div_ps(nv, div3));
nv = _mm256_sub_ps(nv, ws[i]);
nv = _mm256_add_ps(nv, iextv);
nv = _mm256_add_ps(nv, ins[i]);
nv = _mm256_mul_ps(nv, exprdtv);
vs[i] = _mm256_add_ps(vs[i], nv);

// ws += (vs - av - ws * bv) * thetadt
nv = _mm256_sub_ps(vs[i], av);
nv = _mm256_sub_ps(nv, _mm256_mul_ps(ws[i], bv));
ws[i] = _mm256_add_ps(ws[i], _mm256_mul_ps(nv, thetadt));
}

for (int i = 0; i < count / 8; ++i)
{
_mm256_storeu_ps(&neuronsV[i * 8], vs[i]);
_mm256_storeu_ps(&neuronsW[i * 8], ws[i]);
}
}

// scalar edition
for (int i = 0; i < neuronsV.size() % 8; ++i)
{
auto off = neuronsV.size() - 1 - i;
auto& v = neuronsV[off];
auto& w = neuronsW[off];
auto& in = neuronsIn[off];

v += (v - (v * v * v) / 3.f - w - iext - in) * expr * dt;
w += (v - a - w * b) * dt / theta;
}

std::ranges::fill(neuronsIn, 0.f);