diff --git a/.buildkite/db_sync_full_sync.yml b/.buildkite/db_sync_full_sync.yml deleted file mode 100644 index 2aabe82a..00000000 --- a/.buildkite/db_sync_full_sync.yml +++ /dev/null @@ -1,18 +0,0 @@ -steps: - - label: ':drum_with_drumsticks: Full sync test :drum_with_drumsticks:' - commands: - - > - nix develop --accept-flake-config .#python --command - pytest sync_tests/tests/ -m "node_sync or db_sync" - --environment "${environment}" - --node-revision "${node_version}" - --db-sync-revision "${db_sync_version}" - --db-sync-start-options "${db_sync_start_arguments}" - timeout_in_minutes: 14400 - artifact_paths: - - "sync_logs.zip" - - "sync_results.zip" - - "monitor.zip" - agents: - system: x86_64-linux - queue: core-tech-bench diff --git a/.buildkite/node-db-sync.yml b/.buildkite/node-db-sync.yml deleted file mode 100644 index 2b1cee91..00000000 --- a/.buildkite/node-db-sync.yml +++ /dev/null @@ -1,18 +0,0 @@ -steps: - - label: ':drum_with_drumsticks: Node + DB sync :drum_with_drumsticks:' - commands: - - > - nix develop --accept-flake-config .#python --command - pytest sync_tests/tests/ -m "node_sync or db_sync" - --environment "${environment}" - --node-revision "${node_version}" - --db-sync-revision "${db_sync_version}" - --db-sync-start-options "${db_sync_start_arguments}" - timeout_in_minutes: 14400 - artifact_paths: - - "sync_logs.zip" - - "sync_results.zip" - - "monitor.zip" - agents: - system: x86_64-linux - queue: "${queue_name:-core-tech-bench}" diff --git a/.buildkite/node_sync_tests.yml b/.buildkite/node_sync_tests.yml deleted file mode 100644 index aec2ac01..00000000 --- a/.buildkite/node_sync_tests.yml +++ /dev/null @@ -1,16 +0,0 @@ -steps: - - label: ':drum_with_drumsticks: Run the Cardano node sync test on Mainnet using a Linux machine' - commands: - - > - nix develop --accept-flake-config .#python --command - pytest sync_tests/tests/test_nodesync_artifacts.py - --environment "${env}" - --node-revision "${tag_no1}" - timeout_in_minutes: 14400 - agents: - system: x86_64-linux - queue: "${queue_name:-core-tech-bench}" - artifact_paths: - - "sync_logs.zip" - - "sync_results.zip" - - "monitor.zip" diff --git a/.github/workflows/node_mainnet_tx_count_per_epoch.yaml b/.github/workflows/node_mainnet_tx_count_per_epoch.yaml index c1b09b56..ef23eb64 100644 --- a/.github/workflows/node_mainnet_tx_count_per_epoch.yaml +++ b/.github/workflows/node_mainnet_tx_count_per_epoch.yaml @@ -26,7 +26,6 @@ jobs: - name: Get the automated tests results env: - BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_API_ACCESS_TOKEN }} AWS_DB_USERNAME: ${{ secrets.AWS_DB_USERNAME }} AWS_DB_PASS: ${{ secrets.AWS_DB_PASS }} AWS_DB_NAME: ${{ secrets.AWS_DB_NAME }} diff --git a/.github/workflows/node_sync_test_genesis.yaml b/.github/workflows/node_sync_test_genesis.yaml deleted file mode 100644 index 55fa532b..00000000 --- a/.github/workflows/node_sync_test_genesis.yaml +++ /dev/null @@ -1,112 +0,0 @@ -name: Genesis node sync tests - -on: - workflow_dispatch: - inputs: - environment: - description: "Environment on which Buildkite agent will run tests" - type: choice - options: - - mainnet - - preprod - - preview - - shelley-qa - default: preprod - tag_no1: - description: "Initial sync - Graph axis label" - required: true - default: "10.2.1" - node_rev1: - description: "Initial sync - cardano-node revision" - required: true - default: "tags/10.2.1" - node_topology1: - description: "Initial sync - cardano-node topology" - type: choice - options: - - non-bootstrap-peers - - bootstrap-peers - default: legacy - use_genesis_mode: - description: "Use Ouroboros Genesis mode" - type: boolean - default: false - buildkite_queue_name: - description: "Buildkite queue name to use" - type: choice - options: - - core-tech-bench - - core-tech-bench-af - - core-tech-bench-ap - - core-tech-bench-eu - - core-tech-bench-sa - buildkite_pipeline: - description: "Buildkite pipeline to use" - type: choice - options: - - node-sync-tests - - node-sync-tests-af - - node-sync-tests-ap - - node-sync-tests-eu - - node-sync-tests-sa - - -jobs: - node_sync_test_buildkite_mainnet: - runs-on: ubuntu-latest - steps: - - name: Get latest node tag - id: get_tag - shell: bash - run: | - if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then - echo "Using manual input for tag_no1" - echo "node_rev1=${{ github.event.inputs.node_rev1 }}" >> $GITHUB_OUTPUT - else - echo "Fetching latest tag for scheduled run" - # Fetch the latest tag dynamically - node_release_url='https://api.github.com/repos/IntersectMBO/cardano-node/releases/latest' - latest_tag=$(curl -s $node_release_url | jq .tag_name -r) - echo "node_rev1=$latest_tag" >> $GITHUB_OUTPUT - # Fetch more details - latest_name=$(curl -s $node_release_url | jq .name -r) - latest_url=$(curl -s $node_release_url | jq .html_url -r) - latest_archive=$(curl -s $node_release_url | jq .assets[0].browser_download_url -r) - - echo "Latest Release Name: $latest_name" - echo "Release URL: $latest_url" - echo "Download Archive URL: $latest_archive" - fi - - - - name: Trigger the Buildkite pipeline - run sync tests on Mainnet - uses: 'buildkite/trigger-pipeline-action@v2.4.1' - env: - BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_API_ACCESS_TOKEN }} - PIPELINE: "input-output-hk/${{ github.event.inputs.buildkite_pipeline }}" - BRANCH: ${{ github.ref_name }} - MESSAGE: ':github: Triggered by GitHub Action' - AWS_DB_USERNAME: ${{ secrets.AWS_DB_USERNAME }} - AWS_DB_PASS: ${{ secrets.AWS_DB_PASS }} - AWS_DB_NAME: ${{ secrets.AWS_DB_NAME }} - AWS_DB_HOSTNAME: ${{ secrets.AWS_DB_HOSTNAME }} - BLOCKFROST_API_KEY: ${{ secrets.BLOCKFROST_API_KEY }} - BUILD_ENV_VARS: '{ - "env":"${{ github.event.inputs.environment }}", - "node_rev1":"${{ steps.get_tag.outputs.node_rev1 }}", - "node_rev2":"None", - "tag_no1":"${{ github.event.inputs.tag_no1 }}", - "tag_no2":"None", - "node_topology1":"${{ github.event.inputs.node_topology1 }}", - "node_topology2":"None", - "node_start_arguments1":"None", - "node_start_arguments2":"None", - "use_genesis_mode":"${{ github.event.inputs.use_genesis_mode == \"true\" && \"-g\" || \"\" }}", - "queue_name":"${{ github.event.inputs.buildkite_queue_name }}", - "CARDANO_NODE_SOCKET_PATH":"./db/node.socket", - "BLOCKFROST_API_KEY":"${{ secrets.BLOCKFROST_API_KEY }}", - "AWS_DB_USERNAME":"${{ secrets.AWS_DB_USERNAME }}", - "AWS_DB_PASS":"${{ secrets.AWS_DB_PASS }}", - "AWS_DB_NAME":"${{ secrets.AWS_DB_NAME }}", - "AWS_DB_HOSTNAME":"${{ secrets.AWS_DB_HOSTNAME }}" - }' diff --git a/.github/workflows/weekly_node_sync_test.yaml b/.github/workflows/weekly_node_sync_test.yaml deleted file mode 100644 index 89aed621..00000000 --- a/.github/workflows/weekly_node_sync_test.yaml +++ /dev/null @@ -1,46 +0,0 @@ -name: Weekly node sync tests - -on: - # schedule: - # - cron: '0 23 * * 0' # Every Sunday at 23:00 UTC - workflow_dispatch: - -jobs: - weekly_job: - runs-on: ubuntu-latest - - steps: - - name: Get graph label with run date - id: get_graph_label - shell: bash - run: | - current_date=$(date +'%Y-%m-%d') - echo "tag_no1=weekly_node_$current_date" >> $GITHUB_OUTPUT - - - name: Trigger Buildkite pipeline for Mainnet sync tests - uses: 'buildkite/trigger-pipeline-action@v2.4.1' - with: - buildkite_api_access_token: ${{ secrets.BUILDKITE_API_ACCESS_TOKEN }} - pipeline: 'input-output-hk/node-sync-tests' - branch: ${{ github.ref_name || 'main' }} - message: ':github: Triggered by GitHub Action' - build_env_vars: | - { - "env": "mainnet", - "node_rev1": "master", - "tag_no1": "${{ steps.get_graph_label.outputs.tag_no1 }}", - "node_topology1": "non-bootstrap-peers", - "node_start_arguments1": "", - "node_rev2": "", - "tag_no2": "", - "node_topology2": "", - "node_start_arguments2": "", - "BLOCKFROST_API_KEY": "${{ secrets.BLOCKFROST_API_KEY }}", - "AWS_DB_USERNAME": "${{ secrets.AWS_DB_USERNAME }}", - "AWS_DB_PASS": "${{ secrets.AWS_DB_PASS }}", - "AWS_DB_NAME": "${{ secrets.AWS_DB_NAME }}", - "AWS_DB_HOSTNAME": "${{ secrets.AWS_DB_HOSTNAME }}" - } - - - name: Finalize sync - run: echo "Sync job finished." diff --git a/README.md b/README.md index 326024f2..9b05189e 100644 --- a/README.md +++ b/README.md @@ -2,16 +2,6 @@ Sync tests for `cardano-node` and `cardano-db-sync`. -## Documentation - -| Doc | Purpose | -|-----|---------| -| [`doc/LEARNING.md`](doc/LEARNING.md) | Index of learning and troubleshooting docs | -| [`doc/SYNC_TEST_TROUBLESHOOTING.md`](doc/SYNC_TEST_TROUBLESHOOTING.md) | Buildkite, Postgres, Nix, stuck runs | -| [`history.md`](history.md) | Branch timeline, incidents, owner conventions | -| [`ISSUES_ENCOUNTERED.md`](ISSUES_ENCOUNTERED.md) | Known issues and fixes | -| [`CURSOR_GUIDE.md`](CURSOR_GUIDE.md) | Style and where to save chat takeaways | - ## Setup (local, without Nix) From the repository root: @@ -105,15 +95,16 @@ pytest sync_tests/tests/ \ When db-sync tests are included, `test_dbsync_artifacts.py` generates a comprehensive test results JSON and checks db-sync logs for errors and rollbacks. -Current CI artifact behavior (Buildkite `artifact_paths`): +GitHub Actions uploads these bundles when CI finishes: - `sync_logs.zip` - `sync_results.zip` - `monitor.zip` Bundle generation is CI-agnostic and keyed off standard CI variables -(`CI`, `GITHUB_ACTIONS`, `BUILDKITE`, `GITLAB_CI`, `CIRCLECI`). The runner uploads -the zips; local runs keep raw files under `test_workdir/` for debugging. +(`CI`, `GITHUB_ACTIONS`, `GITLAB_CI`, `CIRCLECI`). The workflow uploads the zips +with `actions/upload-artifact`; local runs keep raw files under `test_workdir/` +for debugging. Results JSON includes enriched performance samples under `system_metrics`. @@ -220,17 +211,3 @@ When using `test_workdir/cardano-db-sync/db_sync_performance_stats.json` (list o - **Available graphs**: CPU, RSS, Combined Resources - **Empty graphs**: Epoch Duration, Blocks per Epoch, Block Throughput - **Recommendation**: Use full `test_workdir/db_sync_results.json` for complete graph set - -## Document Index - -| Document | Purpose | -|----------|---------| -| [`doc/LEARNING.md`](doc/LEARNING.md) | Short index: doc map, high-signal facts, debugging entry point | -| [`history.md`](history.md) | Branch timeline, local notes, *What Martin wants*, chat archive pointers | -| [`doc/SYNC_TEST_TROUBLESHOOTING.md`](doc/SYNC_TEST_TROUBLESHOOTING.md) | Stuck runs, Buildkite, disk / Postgres, Nix `cardano-cli`, agent variance, pipeline YAML (§11) | -| [`ISSUES_ENCOUNTERED.md`](ISSUES_ENCOUNTERED.md) | Short issue → fix index | -| [`CURSOR_GUIDE.md`](CURSOR_GUIDE.md) | Style, pytest, Buildkite, where to save chat takeaways | -| [`SYNC_TESTS_ARCHITECTURE.md`](SYNC_TESTS_ARCHITECTURE.md) | Module layout and pytest model | -| [`SYNC_TESTS_COMPREHENSIVE_GUIDE.md`](SYNC_TESTS_COMPREHENSIVE_GUIDE.md) | Setup, CLI, markers, Buildkite env mapping | -| [`NODE_SYNC_TEST_EXECUTION_FLOW.md`](NODE_SYNC_TEST_EXECUTION_FLOW.md) | Node-only pytest flow vs legacy script context | -| [`REFRACTOR_PLAN.md`](REFRACTOR_PLAN.md) | Planned refactors | diff --git a/sync_tests/scripts/postgres-start.sh b/sync_tests/scripts/postgres-start.sh index 8a50275f..9a836bdb 100755 --- a/sync_tests/scripts/postgres-start.sh +++ b/sync_tests/scripts/postgres-start.sh @@ -7,7 +7,7 @@ POSTGRES_DIR="${1:?"Need path to postgres dir"}" POSTGRES_DIR="$(readlink -m "$POSTGRES_DIR")" # Postgres refuses to start if the full Unix socket path exceeds ~107 bytes -# (directory + "/.s.PGSQL."). Long CI workdirs (e.g. Buildkite) need a +# (directory + "/.s.PGSQL."). Long CI workdirs need a # short -k directory; PGDATA stays under POSTGRES_DIR/data. _MAX_SOCK_PARENT_LEN=$((107 - 22)) SOCKET_DIR="$POSTGRES_DIR" diff --git a/sync_tests/tests/test_local_snapshot_restoration.py b/sync_tests/tests/test_local_snapshot_restoration.py index fc301238..8142ef50 100644 --- a/sync_tests/tests/test_local_snapshot_restoration.py +++ b/sync_tests/tests/test_local_snapshot_restoration.py @@ -3,6 +3,7 @@ from __future__ import annotations import datetime +import json import logging import os import pathlib as pl @@ -14,6 +15,7 @@ from _pytest.fixtures import FixtureRequest from sync_tests.tests.conftest import SyncContext +from sync_tests.tests.test_snapshot_creation import snapshot_created # noqa: F401 from sync_tests.utils import db_sync from sync_tests.utils import helpers from sync_tests.utils import node @@ -30,12 +32,14 @@ def local_restoration_result( request: FixtureRequest, sync_context: SyncContext, + snapshot_created: dict[str, tp.Any], # noqa: ARG001,F811 ) -> tp.Generator[dict[str, tp.Any], None, None]: """Restore db-sync from a local snapshot, sync, and yield result data. Args: request: Pytest fixture request for CLI options. sync_context: Shared session context. + snapshot_created: Fixture dependency that creates snapshot metadata first. Yields: Dict with restoration timing, tip data, and sync results. @@ -75,7 +79,13 @@ def local_restoration_result( db_sync.create_database() # restore snapshot - snapshot_file = db_sync.get_buildkite_meta_data("snapshot_file") + snapshot_state_file = sync_context.workdir / "sync_session_state.json" + if not snapshot_state_file.exists(): + msg = f"Snapshot metadata file not found: {snapshot_state_file}" + raise FileNotFoundError(msg) + with open(snapshot_state_file, encoding="utf-8") as state_fh: + snapshot_data = json.load(state_fh) + snapshot_file = snapshot_data["snapshot_file"] LOGGER.info("Restoring from snapshot: %s", snapshot_file) restoration_time = db_sync.restore_db_sync_from_snapshot( config, diff --git a/sync_tests/tests/test_snapshot_creation.py b/sync_tests/tests/test_snapshot_creation.py index c87394be..d6424a0e 100644 --- a/sync_tests/tests/test_snapshot_creation.py +++ b/sync_tests/tests/test_snapshot_creation.py @@ -49,12 +49,11 @@ def snapshot_created( end_time = datetime.datetime.now(tz=datetime.timezone.utc) snapshot_file = stage_2_result - db_sync.set_buildkite_meta_data("snapshot_file", snapshot_file) creation_secs = int((end_time - start_time).total_seconds()) LOGGER.info("Snapshot creation time: %d seconds", creation_secs) - return { + snapshot_data = { "snapshot_file": snapshot_file, "stage_2_cmd": stage_2_cmd, "stage_2_result": stage_2_result, @@ -62,6 +61,8 @@ def snapshot_created( "start_time": start_time.strftime("%d/%m/%Y %H:%M:%S"), "end_time": end_time.strftime("%d/%m/%Y %H:%M:%S"), } + helpers.write_json_to_file(sync_context.workdir / "sync_session_state.json", snapshot_data) + return snapshot_data class TestSnapshotCreation: diff --git a/sync_tests/utils/artifacts/__init__.py b/sync_tests/utils/artifacts/__init__.py index c2d4f2e1..d25c42ac 100644 --- a/sync_tests/utils/artifacts/__init__.py +++ b/sync_tests/utils/artifacts/__init__.py @@ -6,8 +6,6 @@ import os import pathlib as pl import shutil -import subprocess -import typing as tp from sync_tests.utils import helpers from sync_tests.utils.db_sync.config import DbSyncConfig @@ -17,74 +15,37 @@ LOGGER = logging.getLogger(__name__) -def is_buildkite_available() -> bool: - """Check if Buildkite agent is available.""" - try: - subprocess.run( - ["buildkite-agent", "--version"], - capture_output=True, - check=True, - timeout=5, - ) - except (subprocess.CalledProcessError, FileNotFoundError, subprocess.TimeoutExpired): - return False - return True - - def is_ci_environment() -> bool: """Check if we're running in a CI environment across supported providers.""" return bool( os.getenv("CI") or os.getenv("GITHUB_ACTIONS") - or os.getenv("BUILDKITE") or os.getenv("GITLAB_CI") or os.getenv("CIRCLECI") ) def upload_artifact(file: str, destination: str = "auto", local_dir: pl.Path | None = None) -> None: - """Upload an artifact to Buildkite if available, otherwise save locally. + """Keep an artifact available for local inspection or CI bundle collection. Args: - file: Path to the file to upload. - destination: Upload destination ("buildkite", "auto"). Defaults to "auto". - local_dir: Optional directory to save file locally if Buildkite is not available. + file: Path to the artifact file. + destination: Retained for call-site compatibility; direct CI uploads are + handled by workflow artifact steps. + local_dir: Optional directory to copy the file to for local collection. """ - if destination in ("buildkite", "auto"): - try: - cmd = ["buildkite-agent", "artifact", "upload", f"{file}"] - subprocess.run(cmd, check=True, timeout=120) - except FileNotFoundError: - LOGGER.warning("Buildkite agent not available.") - except subprocess.TimeoutExpired: - LOGGER.warning("Timed out uploading artifact to Buildkite: %s", file) - except subprocess.CalledProcessError as exc: - LOGGER.warning( - "Buildkite artifact upload failed (exit %s) for %s", - exc.returncode, - file, - ) - else: - LOGGER.info("Uploaded %s to Buildkite.", file) - return - - # If Buildkite not available and local_dir is provided, save locally + _ = destination if local_dir: local_path = pl.Path(local_dir) / pl.Path(file).name # Only copy if the file is not already in the target location if pl.Path(file).resolve() != local_path.resolve(): local_path.parent.mkdir(parents=True, exist_ok=True) shutil.copy2(file, local_path) - LOGGER.info("Saved artifact locally to %s (no Buildkite available).", local_path) + LOGGER.info("Saved artifact locally to %s.", local_path) else: - LOGGER.info( - "Artifact already in target location: %s (no Buildkite available).", local_path - ) + LOGGER.info("Artifact already in target location: %s.", local_path) else: - LOGGER.warning( - "Skipping artifact upload (no Buildkite agent available). " - "Logs remain in test_workdir/ for local inspection." - ) + LOGGER.info("Artifact ready for collection: %s", file) def create_node_database_archive(config: DbSyncConfig) -> pl.Path: @@ -112,24 +73,6 @@ def create_node_database_archive(config: DbSyncConfig) -> pl.Path: return node_db_archive -def set_buildkite_meta_data(key: str, value: tp.Any) -> None: - """Set metadata in Buildkite for the specified key and value.""" - cmd = ["buildkite-agent", "meta-data", "set", f"{key}", f"{value}"] - subprocess.run(cmd, check=True, timeout=15) - - -def get_buildkite_meta_data(key: str) -> str: - """Retrieve metadata from Buildkite for the specified key.""" - result = subprocess.run( - ["buildkite-agent", "meta-data", "get", f"{key}"], - capture_output=True, - text=True, - timeout=15, - check=True, - ) - return result.stdout.strip() - - def emergency_upload_artifacts( config: DbSyncConfig, perf_stats: list[dict], era_activation: list[dict] | None = None ) -> None: @@ -137,9 +80,8 @@ def emergency_upload_artifacts( Writes enriched perf stats and epoch sync times to disk. The JSON files are picked up by test_upload_ci_artifacts and included in sync_results.zip. No - zips are created here, no Buildkite uploads are performed, and no processes - are terminated — artifact collection is via artifact_paths only; teardown - owns process lifecycle. + zips are created here, no CI uploads are performed, and no processes are + terminated; teardown owns process lifecycle. Args: config: A DbSyncConfig instance with paths and settings. diff --git a/sync_tests/utils/db_sync/__init__.py b/sync_tests/utils/db_sync/__init__.py index 7030a825..72bf0775 100755 --- a/sync_tests/utils/db_sync/__init__.py +++ b/sync_tests/utils/db_sync/__init__.py @@ -375,7 +375,7 @@ def wait_for_db_to_sync( if db_sync_progress is None: LOGGER.info("db-sync hasn't started syncing yet, waiting for initial progress...") db_sync_progress = 0.0 - buildkite_timeout_in_sec = 1828000 + max_sync_timeout_in_sec = 1828000 counter = 0 rollback_counter = 0 @@ -389,7 +389,7 @@ def wait_for_db_to_sync( _emergency_uploaded = False while db_sync_progress < sync_percentage: sync_time_in_sec = time.perf_counter() - start_sync - if sync_time_in_sec + 5 * ONE_MINUTE > buildkite_timeout_in_sec: + if sync_time_in_sec + 5 * ONE_MINUTE > max_sync_timeout_in_sec: era_activation = postgres.get_era_activation_data(config) artifacts.emergency_upload_artifacts(config, perf_stats, era_activation) _emergency_uploaded = True @@ -674,16 +674,6 @@ def create_node_database_archive(config: DbSyncConfig) -> pl.Path: return artifacts.create_node_database_archive(config) -def set_buildkite_meta_data(key: str, value: tp.Any) -> None: - """Set Buildkite metadata for the specified key and value.""" - artifacts.set_buildkite_meta_data(key, value) - - -def get_buildkite_meta_data(key: str) -> str: - """Retrieve Buildkite metadata for the specified key.""" - return artifacts.get_buildkite_meta_data(key) - - def get_latest_snapshot_url(env: str, args: tp.Any) -> str: """Retrieve the latest snapshot URL for the specified environment.""" return snapshots.get_latest_snapshot_url(env, args)