diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 75c6aa5..b521884 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -2,12 +2,20 @@ name: build on: push: - branches: [master] + branches: [main] pull_request: - branches: [master] + branches: [main] + workflow_dispatch: + +# Builds and tests on Linux, macOS and Windows, each with a platform-specific +# optimization baseline. Release builds pick up the committed per-platform +# tuning profile (include/biginteger/build/platform/<os>-<arch>-<compiler>.h) +# automatically via PlatformConfig.h. The library needs a GCC/Clang toolchain +# (__int128, __builtin_*_overflow), so the Windows job uses MSYS2 + Clang, not +# MSVC. jobs: - build: + unix: name: ${{ matrix.os }} / ${{ matrix.cc }} runs-on: ${{ matrix.os }} timeout-minutes: 30 @@ -57,5 +65,44 @@ jobs: run: ctest --output-on-failure --timeout 600 - name: Smoke-test calculator + run: echo '2^256 - 1' | ./build/calculator + + windows: + name: windows-latest / clang (msys2) + runs-on: windows-latest + timeout-minutes: 30 + defaults: + run: + shell: msys2 {0} + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up MSYS2 + Clang + uses: msys2/setup-msys2@v2 + with: + msystem: UCRT64 + update: true + install: >- + mingw-w64-ucrt-x86_64-clang + mingw-w64-ucrt-x86_64-cmake + mingw-w64-ucrt-x86_64-ninja + + - name: Configure + env: + CC: clang + CXX: clang++ run: | - echo '2^256 - 1' | ./build/calculator + cmake -S .
-B build -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + "-DCMAKE_CXX_FLAGS_RELEASE=-O3 -march=x86-64-v3 -DNDEBUG" + + - name: Build + run: cmake --build build -j + + - name: Test + working-directory: build + run: ctest --output-on-failure --timeout 600 + + - name: Smoke-test calculator + run: echo '2^256 - 1' | ./build/calculator.exe diff --git a/.github/workflows/tune.yml b/.github/workflows/tune.yml new file mode 100644 index 0000000..ca83656 --- /dev/null +++ b/.github/workflows/tune.yml @@ -0,0 +1,143 @@ +name: tune + +# Per-platform dispatch-threshold tuning. +# +# Manual trigger only. For each OS/arch/compiler it builds and runs +# tests/performance/dispatch_tuner (Release, -march=native), which emits a +# profile header into include/biginteger/build/platform/<os>-<arch>-<compiler>.h. The profiles +# are collected and a single PR is opened for review. +# +# CAVEAT: GitHub-hosted runners are shared VMs whose host CPU varies between +# runs, and the build uses -march=native. Values produced here reflect *that +# runner's* CPU, not a canonical chip — treat the PR as a baseline and prefer +# tuning on real target hardware for values you depend on. + +on: + workflow_dispatch: + inputs: + mode: + description: "Tuner mode (full = wider size sweep, slower)" + type: choice + default: full + options: [full, quick] + +permissions: + contents: write + pull-requests: write + +jobs: + tune-unix: + name: tune ${{ matrix.key }} + runs-on: ${{ matrix.os }} + timeout-minutes: 60 + strategy: + fail-fast: false + matrix: + include: + - os: ubuntu-latest + key: linux-x86_64-gcc + cc: gcc + cxx: g++ + - os: ubuntu-latest + key: linux-x86_64-clang + cc: clang + cxx: clang++ + - os: macos-latest + key: macos-arm64-clang + cc: clang + cxx: clang++ + steps: + - uses: actions/checkout@v4 + + - name: Build tuner + env: + CC: ${{ matrix.cc }} + CXX: ${{ matrix.cxx }} + run: | + cmake -S .
-B build -DCMAKE_BUILD_TYPE=Release + cmake --build build --target dispatch_tuner -j + + - name: Run tuner + run: | + MODE_FLAG="" + [ "${{ inputs.mode }}" = "full" ] && MODE_FLAG="--full" + ./build/dispatch_tuner $MODE_FLAG --emit-header \ + "include/biginteger/build/platform/${{ matrix.key }}.h" + + - uses: actions/upload-artifact@v4 + with: + name: profile-${{ matrix.key }} + path: include/biginteger/build/platform/${{ matrix.key }}.h + if-no-files-found: error + + tune-windows: + name: tune windows-x86_64-clang + runs-on: windows-latest + timeout-minutes: 60 + defaults: + run: + shell: msys2 {0} + steps: + - uses: actions/checkout@v4 + + - uses: msys2/setup-msys2@v2 + with: + msystem: UCRT64 + update: true + install: >- + mingw-w64-ucrt-x86_64-clang + mingw-w64-ucrt-x86_64-cmake + mingw-w64-ucrt-x86_64-ninja + + - name: Build tuner + env: + CC: clang + CXX: clang++ + run: | + cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release + cmake --build build --target dispatch_tuner -j + + - name: Run tuner + run: | + MODE_FLAG="" + [ "${{ inputs.mode }}" = "full" ] && MODE_FLAG="--full" + ./build/dispatch_tuner.exe $MODE_FLAG --emit-header \ + "include/biginteger/build/platform/windows-x86_64-clang.h" + + - uses: actions/upload-artifact@v4 + with: + name: profile-windows-x86_64-clang + path: include/biginteger/build/platform/windows-x86_64-clang.h + if-no-files-found: error + + open-pr: + name: Collect profiles and open PR + needs: [tune-unix, tune-windows] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Download tuned profiles + uses: actions/download-artifact@v4 + with: + pattern: profile-* + merge-multiple: true + path: include/biginteger/build/platform + + - name: Open pull request + uses: peter-evans/create-pull-request@v6 + with: + branch: auto/platform-tuning + title: "Refresh platform tuning profiles" + commit-message: "Regenerate per-platform dispatch threshold profiles" + body: | + Auto-generated by the **tune** workflow (mode: 
`${{ inputs.mode }}`). + + Each `include/biginteger/build/platform/<os>-<arch>-<compiler>.h` was + produced by `dispatch_tuner` on its respective CI runner. + + ⚠️ CI runners are shared VMs with varying host CPUs and the build + uses `-march=native`. Review the deltas before merging; for values + you depend on, regenerate on real target hardware. + add-paths: include/biginteger/build/platform/*.h + delete-branch: true diff --git a/.gitignore b/.gitignore index 4805b9a..bebd04e 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,8 @@ build/ *.gz.* .antigravitycli/ .benchcache/ + +# Track build-config headers + platform tuning profiles (the broad build +# rule above also matches include/biginteger/build/). +!include/biginteger/build/ +!include/biginteger/build/** diff --git a/CMakeLists.txt b/CMakeLists.txt index 6c991a0..164b002 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -106,8 +106,12 @@ if(BIGMATH_BUILD_TESTS) add_executable(divperf_simple tests/divperf_simple.cpp) target_link_libraries(divperf_simple PRIVATE bigmath::bigmath) - add_executable(regression_bench tests/performance/regression_bench.cpp) - target_link_libraries(regression_bench PRIVATE bigmath::bigmath) + # regression_bench.cpp is an ad-hoc, uncommitted scratch bench; only wire it + # up when it's actually present so a clean checkout (CI) still configures. + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/tests/performance/regression_bench.cpp") + add_executable(regression_bench tests/performance/regression_bench.cpp) + target_link_libraries(regression_bench PRIVATE bigmath::bigmath) + endif() # BigDecimal performance bench add_executable(bigdecimal_perf tests/bigdecimal_perf.cpp) diff --git a/docs/PLATFORM_TUNING.md b/docs/PLATFORM_TUNING.md new file mode 100644 index 0000000..9f429bf --- /dev/null +++ b/docs/PLATFORM_TUNING.md @@ -0,0 +1,65 @@ +# Platform tuning + +Dispatch thresholds (when multiplication switches Classic → Karatsuba → Toom → +NTT, when division switches Fast → Burnikel-Ziegler → Newton, etc.)
have +crossover points that depend on the CPU and compiler. BigMath ships portable +defaults and lets each platform override them with a tuned profile. + +## How resolution works + +`common/Constants.h` includes, in order (first definition wins; every threshold +macro is `#ifndef`-guarded): + +1. command-line `-D` overrides +2. `build/PlatformConfig.h` → the per-platform tuned profile, if one exists +3. `build/DispatchThresholds.h` → generic portable defaults + +`PlatformConfig.h` auto-selects a profile with `__has_include`, keyed on the host +OS / arch / compiler: + +``` +include/biginteger/build/platform/<os>-<arch>-<compiler>.h +``` + +e.g. `linux-x86_64-gcc.h`, `macos-arm64-clang.h`, `windows-x86_64-clang.h`. If +the matching file isn't committed, the defaults apply unchanged — no profile is +required to build. A profile only `#define`s the macros it tuned, so it overrides +individual defaults without disabling the fallback for the rest. + +Escape hatches: `-DBIGMATH_PLATFORM_OVERRIDE='"path/to/profile.h"'` forces a +specific profile; `-DBIGMATH_PLATFORM_NONE` disables auto-selection. + +## Generating a profile + +### On real hardware (recommended) + +```sh +cmake -S . -B build -DCMAKE_BUILD_TYPE=Release # adds -march=native +cmake --build build --target dispatch_tuner -j +./build/dispatch_tuner --full --emit-header \ + include/biginteger/build/platform/<os>-<arch>-<compiler>.h +``` + +`--full` runs a wider size sweep (slower, more accurate). Drop it for a quick +pass. Commit the emitted file; `PlatformConfig.h` finds it automatically on the +next build. + +### Via CI + +The **tune** workflow (`.github/workflows/tune.yml`, manual `workflow_dispatch`) +runs the tuner across the CI matrix (Linux gcc/clang, macOS clang, Windows +clang) and opens a single PR with the regenerated profiles. + +> ⚠️ **CI values are not canonical.** GitHub-hosted runners are shared VMs whose +> host CPU varies between runs, and the build uses `-march=native`. A profile +> generated on CI reflects *that runner's* CPU.
Use the workflow for convenience +> and as a baseline; for values you rely on, tune on the deployment hardware. + +## Adding a new platform + +1. Add a matrix entry (and, if a new OS, a job) to `tune.yml` with a `key` + matching the `<os>-<arch>-<compiler>.h` naming. +2. Add the corresponding auto-selection branch to `PlatformConfig.h`. + +The Windows path uses MSYS2 + Clang because the library requires a GCC/Clang +toolchain (`__int128`, `__builtin_*_overflow`); MSVC cannot compile it. diff --git a/include/biginteger/build/PlatformConfig.h b/include/biginteger/build/PlatformConfig.h new file mode 100644 index 0000000..f8db2e5 --- /dev/null +++ b/include/biginteger/build/PlatformConfig.h @@ -0,0 +1,83 @@ +#ifndef BIGMATH_PLATFORM_CONFIG +#define BIGMATH_PLATFORM_CONFIG + +// Platform-specific tuned dispatch thresholds. +// +// Pulled in by common/Constants.h *before* build/DispatchThresholds.h. A tuned +// profile only #defines the threshold macros it measured (each #ifndef-guarded), +// so it wins over the generic defaults without disabling the fallback for any +// macro it did not set. +// +// Profiles live under include/biginteger/build/platform/ and are named +// +// <os>-<arch>-<compiler>.h e.g. linux-x86_64-gcc.h, macos-arm64-clang.h +// +// Selection is automatic: the branches below match the host OS/arch/compiler and +// include the matching profile *if it exists* (via __has_include). With no +// profile committed for the host, the defaults in DispatchThresholds.h apply +// unchanged — no profile is required to build.
+// +// Generate a profile with the tuner (see docs/PLATFORM_TUNING.md and +// .github/workflows/tune.yml): +// +// dispatch_tuner --full --emit-header \ +// include/biginteger/build/platform/--.h +// +// Escape hatches: +// -DBIGMATH_PLATFORM_OVERRIDE='"path/to/profile.h"' force a specific profile +// -DBIGMATH_PLATFORM_NONE disable auto-selection + +#if defined(BIGMATH_PLATFORM_OVERRIDE) + +#include BIGMATH_PLATFORM_OVERRIDE + +#elif !defined(BIGMATH_PLATFORM_NONE) + +// ─── architecture detection ─────────────────────────────────────────────────── +#if defined(__x86_64__) || defined(_M_X64) +#define BIGMATH_PLATFORM_X86_64 1 +#elif defined(__aarch64__) || defined(_M_ARM64) +#define BIGMATH_PLATFORM_ARM64 1 +#endif + +// ─── auto-selection (clang checked before gcc: clang also defines __GNUC__) ──── +#if defined(__APPLE__) && defined(BIGMATH_PLATFORM_ARM64) && defined(__clang__) +#if __has_include("biginteger/build/platform/macos-arm64-clang.h") +#include "biginteger/build/platform/macos-arm64-clang.h" +#endif + +#elif defined(__APPLE__) && defined(BIGMATH_PLATFORM_X86_64) && defined(__clang__) +#if __has_include("biginteger/build/platform/macos-x86_64-clang.h") +#include "biginteger/build/platform/macos-x86_64-clang.h" +#endif + +#elif defined(_WIN32) && defined(BIGMATH_PLATFORM_X86_64) && defined(__clang__) +#if __has_include("biginteger/build/platform/windows-x86_64-clang.h") +#include "biginteger/build/platform/windows-x86_64-clang.h" +#endif + +#elif defined(__linux__) && defined(BIGMATH_PLATFORM_X86_64) && defined(__clang__) +#if __has_include("biginteger/build/platform/linux-x86_64-clang.h") +#include "biginteger/build/platform/linux-x86_64-clang.h" +#endif + +#elif defined(__linux__) && defined(BIGMATH_PLATFORM_X86_64) && defined(__GNUC__) && !defined(__clang__) +#if __has_include("biginteger/build/platform/linux-x86_64-gcc.h") +#include "biginteger/build/platform/linux-x86_64-gcc.h" +#endif + +#elif defined(__linux__) && defined(BIGMATH_PLATFORM_ARM64) 
&& defined(__clang__) +#if __has_include("biginteger/build/platform/linux-arm64-clang.h") +#include "biginteger/build/platform/linux-arm64-clang.h" +#endif + +#elif defined(__linux__) && defined(BIGMATH_PLATFORM_ARM64) && defined(__GNUC__) && !defined(__clang__) +#if __has_include("biginteger/build/platform/linux-arm64-gcc.h") +#include "biginteger/build/platform/linux-arm64-gcc.h" +#endif + +#endif + +#endif // selection + +#endif diff --git a/include/biginteger/build/platform/README.md b/include/biginteger/build/platform/README.md new file mode 100644 index 0000000..9eede4f --- /dev/null +++ b/include/biginteger/build/platform/README.md @@ -0,0 +1,41 @@ +# Platform tuning profiles + +Auto-generated dispatch-threshold profiles, one per `<os>-<arch>-<compiler>`: + +| File | Host | +|------|------| +| `linux-x86_64-gcc.h` | Linux / x86-64 / GCC | +| `linux-x86_64-clang.h` | Linux / x86-64 / Clang | +| `linux-arm64-gcc.h` | Linux / ARM64 / GCC | +| `linux-arm64-clang.h` | Linux / ARM64 / Clang | +| `macos-arm64-clang.h` | macOS / Apple Silicon / Clang | +| `macos-x86_64-clang.h` | macOS / Intel / Clang | +| `windows-x86_64-clang.h` | Windows / x86-64 / Clang (MSYS2) | + +`../PlatformConfig.h` includes the matching file automatically via +`__has_include` — if a profile for the host isn't present, the generic defaults +in `../DispatchThresholds.h` apply. Each profile only `#define`s the macros it +tuned (every one `#ifndef`-guarded), so it overrides the defaults without +disabling the fallback. + +## Generating / refreshing a profile + +These files are produced by `tests/performance/dispatch_tuner`. Don't hand-edit. + +- On real target hardware: + ``` + cmake -S .
-B build -DCMAKE_BUILD_TYPE=Release + cmake --build build --target dispatch_tuner -j + ./build/dispatch_tuner --full --emit-header \ + include/biginteger/build/platform/<os>-<arch>-<compiler>.h + ``` +- Or trigger the **tune** workflow (`.github/workflows/tune.yml`, + `workflow_dispatch`) to run the tuner on the CI matrix and open a PR with the + regenerated profiles. + +See [docs/PLATFORM_TUNING.md](../../../../docs/PLATFORM_TUNING.md). + +> **Note on CI-tuned values.** GitHub-hosted runners are shared VMs whose host +> CPU varies between runs, and `-march=native` builds tune for that CPU. +> Profiles generated on CI are a baseline, not canonical — run the tuner on the +> actual deployment hardware for values you depend on. diff --git a/include/biginteger/common/Constants.h b/include/biginteger/common/Constants.h index 56312f9..a76d359 100644 --- a/include/biginteger/common/Constants.h +++ b/include/biginteger/common/Constants.h @@ -38,14 +38,18 @@ #define BIGMATH_USE_THREADS 1 #endif -#ifndef BIGMATH_MAX_THREADS -#define BIGMATH_MAX_THREADS 8 -#endif - -#include "../build/DispatchThresholds.h" - -namespace BigMath -{ +#ifndef BIGMATH_MAX_THREADS +#define BIGMATH_MAX_THREADS 8 +#endif + +// Per-platform tuned thresholds (auto-selected if a profile exists), then the +// generic defaults. Both define the same #ifndef-guarded macros, so the profile +// wins where present and the defaults fill the rest.
+#include "../build/PlatformConfig.h" +#include "../build/DispatchThresholds.h" + +namespace BigMath +{ typedef int64_t Long; typedef uint64_t ULong; diff --git a/tests/performance/dispatch_tuner.cpp b/tests/performance/dispatch_tuner.cpp index 07211c2..86e35fd 100644 --- a/tests/performance/dispatch_tuner.cpp +++ b/tests/performance/dispatch_tuner.cpp @@ -112,10 +112,13 @@ namespace if (!out) throw runtime_error("failed to open dispatch threshold header: " + path); - out << "#ifndef BIGMATH_DISPATCH_THRESHOLDS\n"; - out << "#define BIGMATH_DISPATCH_THRESHOLDS\n\n"; + out << "#pragma once\n\n"; out << "// Generated by tests/performance/dispatch_tuner.cpp --emit-header.\n"; - out << "// Regenerate this file after benchmarking on a new compiler or CPU.\n\n"; + out << "// Platform tuning profile: included ahead of DispatchThresholds.h by\n"; + out << "// PlatformConfig.h. Each macro is #ifndef-guarded so it overrides the\n"; + out << "// generic default without disabling the fallback. Do not hand-edit;\n"; + out << "// regenerate on the target CPU/compiler (see docs/PLATFORM_TUNING.md).\n"; + out << "// NOTE: values reflect the CPU this ran on; -march=native is assumed.\n\n"; out << "#ifndef BIGMATH_CLASSIC_MULTIPLICATION_THRESHOLD\n"; out << "#if BIGMATH_LIMB_64\n"; @@ -191,8 +194,8 @@ namespace out << "#ifndef BIGMATH_NEWTON_HIGH_SKEW_DENOMINATOR\n"; out << "#define BIGMATH_NEWTON_HIGH_SKEW_DENOMINATOR " << s.newtonHighSkewDenominator << '\n'; - out << "#endif\n\n"; + /* Closes the last per-macro #ifndef. No file-level #endif follows: the emitted header is guarded by #pragma once instead of an include guard. */ out << "#endif\n"; }