diff --git a/include/biginteger/algorithms/multiplication/NTTMultiplicationCrt.h b/include/biginteger/algorithms/multiplication/NTTMultiplicationCrt.h index 8178b4b..4930d9a 100644 --- a/include/biginteger/algorithms/multiplication/NTTMultiplicationCrt.h +++ b/include/biginteger/algorithms/multiplication/NTTMultiplicationCrt.h @@ -1207,25 +1207,11 @@ namespace BigMath ULong coeffCount = aCoeffSize + bCoeffSize - 1; Int n = (Int)std::bit_ceil(coeffCount); - // Three parallel transforms. PackOperand writes the coefficient head - // [0, *CoeffSize) of each buffer; only the zero-pad tail needs clearing. - // Using assign(n, 0) here would zero the whole buffer and then have - // PackOperand overwrite the head — double-writing ~aCoeffSize+bCoeffSize - // elements per call. On a bandwidth-bound kernel that waste is real, so - // resize (no fill at steady state, size already n) + tail-only fill. + // Three parallel transforms. static thread_local std::vector fa1, fb1, fa2, fb2, fa3, fb3; - fa1.resize(n); fb1.resize(n); - fa2.resize(n); fb2.resize(n); - fa3.resize(n); fb3.resize(n); - - const std::ptrdiff_t aHead = (std::ptrdiff_t)aCoeffSize; - const std::ptrdiff_t bHead = (std::ptrdiff_t)bCoeffSize; - std::fill(fa1.begin() + aHead, fa1.end(), 0u); - std::fill(fa2.begin() + aHead, fa2.end(), 0u); - std::fill(fa3.begin() + aHead, fa3.end(), 0u); - std::fill(fb1.begin() + bHead, fb1.end(), 0u); - std::fill(fb2.begin() + bHead, fb2.end(), 0u); - std::fill(fb3.begin() + bHead, fb3.end(), 0u); + fa1.assign(n, 0); fb1.assign(n, 0); + fa2.assign(n, 0); fb2.assign(n, 0); + fa3.assign(n, 0); fb3.assign(n, 0); PackOperand(a, base, fa1, fa2, fa3); PackOperand(b, base, fb1, fb2, fb3); @@ -1245,10 +1231,7 @@ namespace BigMath MfaPlanTree tree3; if (useMfa) { - // No zero-fill: each scratch is fully written by the step-1 Transpose - // before any read (recursive sub-calls likewise write their slice - // before reading). assign(n, 0) here was ~6n wasted writes per call. - for (int i = 0; i < 6; ++i) mfaScratch[i].resize(n); + for (int i = 0; i < 6; ++i) mfaScratch[i].assign(n, 0); // Pre-warm all plans in main thread: worker threads cannot call // GetPlan() safely from inside ParallelDo (BuildRoots reenters pool). BuildMfaPlanTree(n, tree1);