diff --git a/.github/workflows/release-cli.yaml b/.github/workflows/release-cli.yaml
new file mode 100644
index 0000000..41cfbc4
--- /dev/null
+++ b/.github/workflows/release-cli.yaml
@@ -0,0 +1,74 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) 2026 NVIDIA Corporation
+#
+# Builds and publishes the karta CLI to GitHub Releases.
+# Triggered by tags matching kartacli-v* — kept namespace-separate from the
+# existing push-artifacts.yaml workflow which fires on v[0-9].[0-9].[0-9].
+#
+# The kartacli- prefix is stripped before goreleaser runs (via
+# GORELEASER_CURRENT_TAG) because goreleaser's monorepo feature is Pro-only.
+# Goreleaser builds artifacts under dist/ with --skip=publish; the GitHub
+# Release for the original kartacli- tag is then created via the gh CLI.
+
+name: Release karta CLI
+
+on:
+  push:
+    tags:
+      - 'kartacli-v*'
+
+permissions:
+  contents: write
+
+jobs:
+  release:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Set up Go
+        uses: actions/setup-go@v5
+        with:
+          # The CLI is its own Go module; use the Go version that module declares.
+          go-version-file: cmd/karta/go.mod
+
+      - name: Sync bundled Karta definitions
+        run: make sync-cli-definitions
+
+      - name: Compute version
+        id: ver
+        run: |
+          TAG="${GITHUB_REF#refs/tags/}"
+          VERSION="${TAG#kartacli-}"
+          echo "tag=${TAG}" >> "$GITHUB_OUTPUT"
+          echo "version=${VERSION}" >> "$GITHUB_OUTPUT"
+
+      # NOTE(review): goreleaser checks that its current tag resolves at HEAD.
+      # Only the kartacli-v* tag is guaranteed to exist on this commit, so
+      # confirm the stripped v* value passes that validation — TODO verify.
+      - name: Build artifacts with goreleaser
+        uses: goreleaser/goreleaser-action@v6
+        with:
+          workdir: cmd/karta
+          distribution: goreleaser
+          version: '~> v2'
+          args: release --clean --skip=publish
+        env:
+          GORELEASER_CURRENT_TAG: ${{ steps.ver.outputs.version }}
+
+      - name: Publish GitHub Release
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          # Hand the tag to the script through the environment: a ${{ }}
+          # expression is pasted into the script text before the shell parses it.
+          RELEASE_TAG: ${{ steps.ver.outputs.tag }}
+        run: |
+          gh release create "${RELEASE_TAG}" \
+            cmd/karta/dist/*.tar.gz cmd/karta/dist/*.zip cmd/karta/dist/*_checksums.txt \
+            --prerelease \
+            --latest=false \
+            --title "karta CLI ${RELEASE_TAG}" \
+            --generate-notes
diff --git a/.gitignore
b/.gitignore
index 2f7bcd6..b8aad08 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,3 +22,4 @@ config/crd/_.yaml
 *~
 .vscode/
 .idea
+.dori/
diff --git a/Makefile b/Makefile
index b352a83..45073e1 100644
--- a/Makefile
+++ b/Makefile
@@ -13,6 +13,8 @@ $(LOCALBIN):
 PROJECT_DIR := $(shell dirname $(abspath $(lastword $(MAKEFILE_LIST))))
 KARTA_CHART_DIR := $(PROJECT_DIR)/charts/karta
 KARTA_CRDS_DIR := $(KARTA_CHART_DIR)/crds
+KARTA_DEFINITIONS_DIR := $(PROJECT_DIR)/docs/examples
+CLI_DEFINITIONS_DIR := $(PROJECT_DIR)/cmd/karta/internal/definitions/community
 
 HELM_CHART_VERSION ?= 0.0.1
 
@@ -32,6 +34,11 @@ PATH := $(abspath $(LOCALBIN)):$(PATH)
 manifests: controller-gen ## Generate CRD manifests
 	$(CONTROLLER_GEN) crd paths="./pkg/..." output:crd:artifacts:config=$(KARTA_CRDS_DIR)
 
+.PHONY: sync-cli-definitions
+sync-cli-definitions: ## Sync canonical Karta definitions into the CLI's embedded bundle
+	rm -f $(CLI_DEFINITIONS_DIR)/*.yaml
+	cp $(KARTA_DEFINITIONS_DIR)/*.yaml $(CLI_DEFINITIONS_DIR)/
+
 .PHONY: generate
 generate: controller-gen ## Generate DeepCopy methods
 	$(CONTROLLER_GEN) object paths="./..."
@@ -61,7 +68,15 @@ lint: fmt-go vet-go lint-go
 .PHONY: lint
 
 .PHONY: validate
-validate: generate manifests generate-mocks generate-licenses
+validate: generate manifests generate-mocks generate-licenses sync-cli-definitions
 	@git diff --exit-code
+# git diff does not report untracked files, so a definition that exists under
+# docs/examples but was never committed to the CLI bundle needs its own check.
+	@untracked="$$(git ls-files --others --exclude-standard -- $(CLI_DEFINITIONS_DIR))"; \
+	if [ -n "$$untracked" ]; then \
+		echo "untracked CLI definitions (run 'make sync-cli-definitions' and commit them):"; \
+		echo "$$untracked"; \
+		exit 1; \
+	fi
 
 .PHONY: install-crd
@@ -128,6 +143,12 @@ check: download-dependencies validate test lint
 
+##@ Release
+
+.PHONY: release-snapshot
+release-snapshot: ## Build a local snapshot release with goreleaser (no GH publish)
+	cd cmd/karta && goreleaser release --snapshot --clean --skip=publish
+
 ##@ Helm
 
 .PHONY: helm-build
 helm-build: ## Build the helm chart
 	helm package $(KARTA_CHART_DIR) --version $(HELM_CHART_VERSION) --app-version $(HELM_CHART_VERSION)
diff --git a/THIRD_PARTY_LICENSES b/THIRD_PARTY_LICENSES
index 36ee509..4967b7b 100644
--- a/THIRD_PARTY_LICENSES
+++ b/THIRD_PARTY_LICENSES
@@ -108,13 +108,13 @@ For complete license texts, please refer to the source repositories or the LICEN
 
 ### k8s.io/klog/v2
 - Name: k8s.io/klog/v2
-- Version: v2.130.1
-- License: [Apache-2.0](https://github.com/kubernetes/klog/blob/v2.130.1/LICENSE)
+- Version: v2.140.0
+- License: [Apache-2.0](https://github.com/kubernetes/klog/blob/v2.140.0/LICENSE)
 
 ### k8s.io/kube-openapi/pkg/util
 - Name: k8s.io/kube-openapi/pkg/util
-- Version: v0.0.0-20260127142750-a19766b6e2d4
-- License: [Apache-2.0](https://github.com/kubernetes/kube-openapi/blob/a19766b6e2d4/LICENSE)
+- Version: v0.0.0-20260317180543-43fb72c5454a
+- License: [Apache-2.0](https://github.com/kubernetes/kube-openapi/blob/43fb72c5454a/LICENSE)
 
 ### k8s.io/utils
 - Name: k8s.io/utils
diff --git a/cmd/karta/.gitignore b/cmd/karta/.gitignore
new file mode 100644
index 0000000..d5bc331
--- /dev/null
+++ b/cmd/karta/.gitignore
@@ -0,0 +1,6 @@
+# Staged at release time by .goreleaser.yaml's before-hook
+LICENSE
+# goreleaser output
+dist/
+# local builds of the CLI binary (anchored so nested paths named karta stay tracked)
+/karta
diff --git a/cmd/karta/.goreleaser.yaml b/cmd/karta/.goreleaser.yaml
new file mode 100644
index 0000000..3e5cb88
--- /dev/null
+++
b/cmd/karta/.goreleaser.yaml
@@ -0,0 +1,75 @@
+# GoReleaser configuration for the karta CLI binary.
+#
+# Releases are cut from kartacli-v* tags (see
+# .github/workflows/release-cli.yaml); the prefix keeps them apart from the
+# v[0-9].[0-9].[0-9] tags used for the Karta helm chart / Go module. That
+# workflow strips the kartacli- prefix and hands the remainder to goreleaser
+# through GORELEASER_CURRENT_TAG, because goreleaser's monorepo feature is
+# Pro-only.
+version: 2
+
+project_name: karta-cli
+
+before:
+  hooks:
+    - go mod tidy
+    # Stage the repository LICENSE next to this config so archives can ship it.
+    - sh -c 'cp ../../LICENSE LICENSE'
+
+builds:
+  - id: karta
+    main: .
+    binary: karta
+    env:
+      - CGO_ENABLED=0
+    goos:
+      - darwin
+      - linux
+      - windows
+    goarch:
+      - amd64
+      - arm64
+    # windows/arm64 is left out of the build matrix.
+    ignore:
+      - goos: windows
+        goarch: arm64
+    ldflags:
+      - -s -w
+      # Stamp the version/commit/date variables of the cmd package.
+      - -X github.com/run-ai/karta/cmd/karta/cmd.version={{.Version}}
+      - -X github.com/run-ai/karta/cmd/karta/cmd.commit={{.ShortCommit}}
+      - -X github.com/run-ai/karta/cmd/karta/cmd.date={{.Date}}
+
+archives:
+  - id: karta
+    ids:
+      - karta
+    name_template: 'karta_{{.Version}}_{{.Os}}_{{.Arch}}'
+    formats:
+      - tar.gz
+    format_overrides:
+      - goos: windows
+        formats:
+          - zip
+    files:
+      - LICENSE
+      - README.md
+
+checksum:
+  name_template: 'karta_{{.Version}}_checksums.txt'
+
+release:
+  # auto: a tag carrying a pre-release suffix (-alpha/-beta/-rc) is marked as
+  # a pre-release.
+  prerelease: auto
+  name_template: 'karta CLI {{.Tag}}'
+
+changelog:
+  use: github
+  sort: asc
+  filters:
+    exclude:
+      - '^docs:'
+      - '^test:'
+      - '^chore:'
+      - '^ci:'
diff --git a/cmd/karta/README.md b/cmd/karta/README.md
new file mode 100644
index 0000000..696980a
--- /dev/null
+++ b/cmd/karta/README.md
@@ -0,0 +1,64 @@
+karta — workload-aware visibility for Kubernetes AI workloads
+=============================================================
+
+> **Alpha.** The CRD schema, CLI flags, and output format may change
+> without notice between releases. Don't build automation against
+> karta output yet.
+
+Install
+-------
+
+Linux / macOS:
+
+    tar -xzf karta_*.tar.gz
+    sudo mv karta /usr/local/bin/karta
+
+Windows: extract `karta.exe` from the `.zip` and place it on `PATH`.
+
+karta uses your existing kubeconfig — no separate setup needed.
+
+Usage
+-----
+
+    karta workload list                      # current namespace
+    karta workload list -A                   # all namespaces
+    karta workload list -n my-namespace      # specific namespace
+    karta workload tree NAME                 # hierarchical view of one workload
+    karta --context my-cluster workload list
+
+Same kubeconfig flags as kubectl: `--kubeconfig`, `--context`, `-n`,
+plus `--color {auto,always,never}`.
+
+See `karta --help` and `karta workload --help` for the full surface.
+
+Supported workload kinds
+------------------------
+
+The CLI ships with built-in Karta definitions for:
+
+  * PyTorchJob (kubeflow.org)
+  * JobSet (jobset.x-k8s.io)
+  * RayCluster, RayJob (ray.io)
+  * MPIJob (kubeflow.org)
+  * LeaderWorkerSet (leaderworkerset.x-k8s.io)
+  * InferenceService (serving.kserve.io)
+  * Service (serving.knative.dev)
+  * DynamoGraphDeployment (nvidia.com)
+  * NIMService (apps.nvidia.com)
+  * Milvus
+
+To add a new workload kind, write a Karta definition and contribute
+it under `docs/examples/` — the CLI bundle is regenerated from there.
+
+Version & bugs
+--------------
+
+    karta version
+
+Project: https://github.com/run-ai/karta
+Issues: https://github.com/run-ai/karta/issues
+
+License
+-------
+
+Apache-2.0. See `LICENSE` in this archive.
diff --git a/cmd/karta/cmd/root.go b/cmd/karta/cmd/root.go
new file mode 100644
index 0000000..a1385b9
--- /dev/null
+++ b/cmd/karta/cmd/root.go
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: Apache-2.0
+// Copyright (c) 2026 NVIDIA Corporation
+
+package cmd
+
+import (
+	"io"
+
+	"github.com/spf13/cobra"
+	"k8s.io/cli-runtime/pkg/genericclioptions"
+
+	"github.com/run-ai/karta/cmd/karta/internal/render"
+)
+
+// styleFor resolves the active render.Style based on --color and the
+// writer's TTY status.
+func (o *rootOptions) styleFor(w io.Writer) render.Style {
+	// An explicit "always" or "never" wins; every other value is treated as auto.
+	if o.colorMode == "always" {
+		return render.ForceStyle()
+	}
+	if o.colorMode == "never" {
+		return render.PlainStyle()
+	}
+	return render.AutoStyle(w)
+}
+
+type rootOptions struct {
+	configFlags *genericclioptions.ConfigFlags // kubeconfig/context/namespace flags
+	colorMode   string                         // raw --color value
+}
+
+// NewRootCmd assembles the top-level karta command: it registers the shared
+// persistent flags and attaches the workload and version subcommands.
+func NewRootCmd() *cobra.Command {
+	opts := &rootOptions{configFlags: genericclioptions.NewConfigFlags(true)}
+	cmd := &cobra.Command{
+		Use:   "karta",
+		Short: "Karta CLI — workload-aware visibility for any Kubernetes AI workload",
+		Long: `Karta is a CLI that reads Karta workload definitions and renders a unified view
+of any Kubernetes AI workload — components, roles, scaling, status, GPU allocation —
+across PyTorchJob, RayCluster, JobSet, KServe, and any custom CRD with a Karta definition.
+
+Same output shape regardless of the underlying CRD.`,
+		SilenceUsage:  true,
+		SilenceErrors: false,
+	}
+
+	// The kubeconfig flags and --color are persistent flags on the root, which
+	// makes them available to every subcommand.
+	flags := cmd.PersistentFlags()
+	opts.configFlags.AddFlags(flags)
+	flags.StringVar(&opts.colorMode, "color", "auto", "Colorize output: auto, always, never")
+
+	cmd.AddCommand(newWorkloadCmd(opts), newVersionCmd())
+	return cmd
+}
diff --git a/cmd/karta/cmd/version.go b/cmd/karta/cmd/version.go
new file mode 100644
index 0000000..3dca458
--- /dev/null
+++ b/cmd/karta/cmd/version.go
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: Apache-2.0
+// Copyright (c) 2026 NVIDIA Corporation
+
+package cmd
+
+import (
+	"fmt"
+
+	"github.com/spf13/cobra"
+)
+
+// Build metadata injected via ldflags at release time. Defaults make sense
+// for `go build` developer builds.
+var (
+	version = "dev"
+	commit  = "none"
+	date    = "unknown"
+)
+
+func newVersionCmd() *cobra.Command {
+	return &cobra.Command{
+		Use:   "version",
+		Short: "Print karta version, commit, and build date",
+		Args:  cobra.NoArgs,
+		Run: func(c *cobra.Command, _ []string) {
+			fmt.Fprintf(c.OutOrStdout(), "karta %s (commit %s, built %s)\n", version, commit, date)
+		},
+	}
+}
diff --git a/cmd/karta/cmd/workload.go b/cmd/karta/cmd/workload.go
new file mode 100644
index 0000000..aede1e0
--- /dev/null
+++ b/cmd/karta/cmd/workload.go
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: Apache-2.0
+// Copyright (c) 2026 NVIDIA Corporation
+
+package cmd
+
+import (
+	"github.com/spf13/cobra"
+)
+
+// newWorkloadCmd builds the `karta workload` parent command, which only groups
+// the list and tree subcommands; both share the root options passed in.
+func newWorkloadCmd(opts *rootOptions) *cobra.Command {
+	c := &cobra.Command{
+		Use:   "workload",
+		Short: "Operate on workloads in the cluster",
+	}
+
+	c.AddCommand(newWorkloadListCmd(opts))
+	c.AddCommand(newWorkloadTreeCmd(opts))
+
+	return c
+}
diff --git a/cmd/karta/cmd/workload_list.go b/cmd/karta/cmd/workload_list.go
new file mode 100644
index 0000000..df87374
--- /dev/null
+++ b/cmd/karta/cmd/workload_list.go
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: Apache-2.0
+// Copyright (c) 2026 NVIDIA Corporation
+
+package cmd
+
+import (
+	"fmt"
+	"sort"
+	"time"
+
+	"github.com/spf13/cobra"
+
+	"github.com/run-ai/karta/cmd/karta/internal/definitions"
+	"github.com/run-ai/karta/cmd/karta/internal/kube"
+	"github.com/run-ai/karta/cmd/karta/internal/loader"
+	"github.com/run-ai/karta/cmd/karta/internal/render"
+	"github.com/run-ai/karta/pkg/tree"
+)
+
+// newWorkloadListCmd builds `karta workload list`. It lists the workloads found
+// through the bundled Karta definitions in the kube client's namespace, or with
+// -A/--all-namespaces across namespaces, and prints one row per workload.
+func newWorkloadListCmd(opts *rootOptions) *cobra.Command {
+	var allNamespaces bool
+	c := &cobra.Command{
+		Use:   "list",
+		Short: "List workloads discovered via known Karta definitions",
+		Args:  cobra.NoArgs, // no positional arguments: reject strays instead of ignoring them
+		RunE: func(c *cobra.Command, _ []string) error {
+			ctx := c.Context()
+
+			client, err := kube.NewClient(opts.configFlags)
+			if err != nil {
+				return fmt.Errorf("failed to build kube client: %w", err)
+			}
+
+			registry, err := definitions.Load()
+			if err != nil {
+				return fmt.Errorf("load community Karta definitions: %w", err)
+			}
+
+			// -A replaces the client's namespace with the empty string.
+			ns := client.Namespace()
+			if allNamespaces {
+				ns = ""
+			}
+			workloads, pods, err := loader.ListWorkloads(ctx, client, registry, ns)
+			if err != nil {
+				return err
+			}
+
+			rows := make([]render.ListRow, 0, len(workloads))
+			for _, w := range workloads {
+				wt, err := tree.Build(ctx, w.Karta, w.Workload, pods, tree.JQMatcher{})
+				if err != nil {
+					return fmt.Errorf("build tree for %s/%s: %w", w.Workload.GetKind(), w.Workload.GetName(), err)
+				}
+				view := render.Build(wt, w.Workload.GetKind(), w.Workload.GetName(), w.Workload.GetNamespace())
+				rows = append(rows, render.ListRow{
+					Namespace:  w.Workload.GetNamespace(),
+					Name:       w.Workload.GetName(),
+					Kind:       w.Workload.GetKind(),
+					Phases:     view.Phases,
+					Components: render.SummarizeComponents(view),
+					GPU:        render.TotalGPUs(view),
+					Age:        time.Since(w.Workload.GetCreationTimestamp().Time),
+				})
+			}
+
+			// Order rows by namespace, then name. The stable sort keeps the order
+			// in which the loader returned rows that tie on both fields.
+			sort.SliceStable(rows, func(i, j int) bool {
+				if rows[i].Namespace != rows[j].Namespace {
+					return rows[i].Namespace < rows[j].Namespace
+				}
+				return rows[i].Name < rows[j].Name
+			})
+			return render.List(c.OutOrStdout(), rows, opts.styleFor(c.OutOrStdout()))
+		},
+	}
+	c.Flags().BoolVarP(&allNamespaces, "all-namespaces", "A", false, "List workloads across every namespace")
+	return c
+}
diff --git a/cmd/karta/cmd/workload_tree.go b/cmd/karta/cmd/workload_tree.go
new file mode 100644
index 0000000..bc72308
--- /dev/null
+++ b/cmd/karta/cmd/workload_tree.go
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: Apache-2.0
+// Copyright (c) 2026 NVIDIA Corporation
+
+package cmd
+
+import (
+	"fmt"
+
+	"github.com/spf13/cobra"
+
+	"github.com/run-ai/karta/cmd/karta/internal/definitions"
+	"github.com/run-ai/karta/cmd/karta/internal/kube"
+	"github.com/run-ai/karta/cmd/karta/internal/loader"
+	"github.com/run-ai/karta/cmd/karta/internal/render"
+	"github.com/run-ai/karta/pkg/tree"
+)
+
+func newWorkloadTreeCmd(opts *rootOptions) *cobra.Command {
+	return
&cobra.Command{
+		Use:   "tree NAME",
+		Short: "Render a workload as a hierarchical tree",
+		Args:  cobra.ExactArgs(1), // the single positional argument is the workload name
+		RunE: func(c *cobra.Command, args []string) error {
+			ctx := c.Context()
+			name := args[0] // present because Args requires exactly one argument
+
+			client, err := kube.NewClient(opts.configFlags)
+			if err != nil {
+				return fmt.Errorf("failed to build kube client: %w", err)
+			}
+
+			registry, err := definitions.Load()
+			if err != nil {
+				return fmt.Errorf("load community Karta definitions: %w", err)
+			}
+
+			res, err := loader.FindWorkload(ctx, client, registry, client.Namespace(), name) // lookup is scoped to the client's namespace
+			if err != nil {
+				return err
+			}
+
+			wt, err := tree.Build(ctx, res.Karta, res.Workload, res.Pods, tree.JQMatcher{})
+			if err != nil {
+				return fmt.Errorf("build workload tree: %w", err)
+			}
+
+			view := render.Build(wt, res.Workload.GetKind(), res.Workload.GetName(), res.Workload.GetNamespace())
+			return render.Tree(c.OutOrStdout(), view, opts.styleFor(c.OutOrStdout()))
+		},
+	}
+}
diff --git a/cmd/karta/go.mod b/cmd/karta/go.mod
new file mode 100644
index 0000000..592bd3a
--- /dev/null
+++ b/cmd/karta/go.mod
@@ -0,0 +1,71 @@
+module github.com/run-ai/karta/cmd/karta
+
+go 1.25.9
+
+replace github.com/run-ai/karta => ../..
+ +require ( + github.com/run-ai/karta v0.0.0-00010101000000-000000000000 + github.com/spf13/cobra v1.10.2 + golang.org/x/term v0.43.0 + k8s.io/api v0.35.1 + k8s.io/apimachinery v0.35.1 + k8s.io/cli-runtime v0.35.1 + k8s.io/client-go v0.35.1 + sigs.k8s.io/yaml v1.6.0 +) + +require ( + github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 // indirect + github.com/blang/semver/v4 v4.0.0 // indirect + github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect + github.com/emicklei/go-restful/v3 v3.13.0 // indirect + github.com/fxamacker/cbor/v2 v2.9.0 // indirect + github.com/go-errors/errors v1.4.2 // indirect + github.com/go-logr/logr v1.4.3 // indirect + github.com/go-openapi/jsonpointer v0.21.0 // indirect + github.com/go-openapi/jsonreference v0.20.2 // indirect + github.com/go-openapi/swag v0.23.0 // indirect + github.com/google/btree v1.1.3 // indirect + github.com/google/gnostic-models v0.7.0 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 // indirect + github.com/inconshreveable/mousetrap v1.1.0 // indirect + github.com/itchyny/gojq v0.12.18 // indirect + github.com/itchyny/timefmt-go v0.1.7 // indirect + github.com/josharian/intern v1.0.0 // indirect + github.com/json-iterator/go v1.1.12 // indirect + github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de // indirect + github.com/mailru/easyjson v0.7.7 // indirect + github.com/moby/term v0.5.0 // indirect + github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect + github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect + github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/peterbourgon/diskv v2.0.1+incompatible // indirect + github.com/samber/lo v1.52.0 // indirect + github.com/spf13/pflag v1.0.10 // indirect + 
github.com/x448/float16 v0.8.4 // indirect + github.com/xlab/treeprint v1.2.0 // indirect + go.uber.org/mock v0.6.0 // indirect + go.yaml.in/yaml/v2 v2.4.3 // indirect + go.yaml.in/yaml/v3 v3.0.4 // indirect + golang.org/x/net v0.51.0 // indirect + golang.org/x/oauth2 v0.34.0 // indirect + golang.org/x/sync v0.19.0 // indirect + golang.org/x/sys v0.44.0 // indirect + golang.org/x/text v0.34.0 // indirect + golang.org/x/time v0.14.0 // indirect + google.golang.org/protobuf v1.36.12-0.20260120151049-f2248ac996af // indirect + gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect + gopkg.in/inf.v0 v0.9.1 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect + k8s.io/klog/v2 v2.140.0 // indirect + k8s.io/kube-openapi v0.0.0-20260317180543-43fb72c5454a // indirect + k8s.io/utils v0.0.0-20260210185600-b8788abfbbc2 // indirect + sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect + sigs.k8s.io/kustomize/api v0.21.1 // indirect + sigs.k8s.io/kustomize/kyaml v0.21.1 // indirect + sigs.k8s.io/randfill v1.0.0 // indirect + sigs.k8s.io/structured-merge-diff/v6 v6.3.2 // indirect +) diff --git a/cmd/karta/go.sum b/cmd/karta/go.sum new file mode 100644 index 0000000..10b77b5 --- /dev/null +++ b/cmd/karta/go.sum @@ -0,0 +1,179 @@ +github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25UVaW/CKtUDjefjrs0SPonmDGUVOYP0= +github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= +github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= +github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= +github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= +github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= +github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= +github.com/creack/pty v1.1.9/go.mod 
h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +github.com/creack/pty v1.1.18 h1:n56/Zwd5o6whRC5PMGretI4IdRLlmBXYNjScPaBgsbY= +github.com/creack/pty v1.1.18/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/emicklei/go-restful/v3 v3.13.0 h1:C4Bl2xDndpU6nJ4bc1jXd+uTmYPVUwkD6bFY/oTyCes= +github.com/emicklei/go-restful/v3 v3.13.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM= +github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= +github.com/go-errors/errors v1.4.2 h1:J6MZopCL4uSllY1OfXM374weqZFFItUbrImctkmUxIA= +github.com/go-errors/errors v1.4.2/go.mod h1:sIVyrIiJhuEF+Pj9Ebtd6P/rEYROXFi3BopGUQ5a5Og= +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= +github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= +github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= +github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= +github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k= +github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= +github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= 
+github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= +github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= +github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= +github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= +github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= +github.com/google/gnostic-models v0.7.0 h1:qwTtogB15McXDaNqTZdzPJRHvaVJlAl+HVQnLmJEJxo= +github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7OUGxBlw57miDrQ= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/pprof v0.0.0-20260202012954-cb029daf43ef h1:xpF9fUHpoIrrjX24DURVKiwHcFpw19ndIs+FwTSMbno= +github.com/google/pprof v0.0.0-20260202012954-cb029daf43ef/go.mod h1:MxpfABSjhmINe3F1It9d+8exIHFvUqtLIRCdOGNXqiI= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 h1:+ngKgrYPPJrOjhax5N+uePQ0Fh1Z7PheYoUI/0nzkPA= +github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= +github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= +github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= +github.com/itchyny/gojq v0.12.18 h1:gFGHyt/MLbG9n6dqnvlliiya2TaMMh6FFaR2b1H6Drc= +github.com/itchyny/gojq v0.12.18/go.mod h1:4hPoZ/3lN9fDL1D+aK7DY1f39XZpY9+1Xpjz8atrEkg= +github.com/itchyny/timefmt-go v0.1.7 h1:xyftit9Tbw+Dc/huSSPJaEmX1TVL8lw5vxjJLK4GMMA= +github.com/itchyny/timefmt-go 
v0.1.7/go.mod h1:5E46Q+zj7vbTgWY8o5YkMeYb4I6GeWLFnetPy5oBrAI= +github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= +github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= +github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de h1:9TO3cAIGXtEhnIaL+V+BEER86oLrvS+kWobKpbJuye0= +github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de/go.mod h1:zAbeS9B/r2mtpb6U+EI2rYA5OAXxsYw6wTamcNW+zcE= +github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= +github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= +github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0= +github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/reflect2 v1.0.2/go.mod 
h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFdJifH4BDsTlE89Zl93FEloxaWZfGcifgq8= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 h1:n6/2gBQ3RWajuToeY6ZtZTIKv2v7ThUy5KKusIT0yc0= +github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00/go.mod h1:Pm3mSP3c5uWn86xMLZ5Sa7JB9GsEZySvHYXCTK4E9q4= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/onsi/ginkgo/v2 v2.28.1 h1:S4hj+HbZp40fNKuLUQOYLDgZLwNUVn19N3Atb98NCyI= +github.com/onsi/ginkgo/v2 v2.28.1/go.mod h1:CLtbVInNckU3/+gC8LzkGUb9oF+e8W8TdUsxPwvdOgE= +github.com/onsi/gomega v1.39.1 h1:1IJLAad4zjPn2PsnhH70V4DKRFlrCzGBNrNaru+Vf28= +github.com/onsi/gomega v1.39.1/go.mod h1:hL6yVALoTOxeWudERyfppUcZXjMwIMLnuSfruD2lcfg= +github.com/peterbourgon/diskv v2.0.1+incompatible h1:UBdAOUP5p4RWqPBg048CAvpKN+vxiaj6gdUUzhl4XmI= +github.com/peterbourgon/diskv v2.0.1+incompatible/go.mod h1:uqqh8zWWbv1HBMNONnaR/tNboyR3/BZd58JJSHlUSCU= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= +github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= +github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= +github.com/samber/lo v1.52.0 h1:Rvi+3BFHES3A8meP33VPAxiBZX/Aws5RxrschYGjomw= +github.com/samber/lo v1.52.0/go.mod h1:4+MXEGsJzbKGaUEQFKBq2xtfuznW9oz/WrgyzMzRoM0= +github.com/sergi/go-diff v1.4.0 
h1:n/SP9D5ad1fORl+llWyN+D6qoUETXNZARKjyY2/KVCw= +github.com/sergi/go-diff v1.4.0/go.mod h1:A0bzQcvG0E7Rwjx0REVgAGH58e96+X0MeOfepqsbeW4= +github.com/spf13/cobra v1.10.2 h1:DMTTonx5m65Ic0GOoRY2c16WCbHxOOw6xxezuLaBpcU= +github.com/spf13/cobra v1.10.2/go.mod h1:7C1pvHqHw5A4vrJfjNwvOdzYu0Gml16OCs2GRiTUUS4= +github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/pflag v1.0.10 h1:4EBh2KAYBwaONj6b2Ye1GiHfwjqyROoF4RwYO+vPwFk= +github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= +github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= +github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= +github.com/xlab/treeprint v1.2.0 h1:HzHnuAF1plUN2zGlAFHbSQP2qJ0ZAD3XF5XD7OesXRQ= +github.com/xlab/treeprint v1.2.0/go.mod h1:gj5Gd3gPdKtR1ikdDK6fnFLdmIS0X30kTTuNd/WEJu0= +go.uber.org/goleak v1.3.0 
h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= +go.uber.org/mock v0.6.0 h1:hyF9dfmbgIX5EfOdasqLsWD6xqpNZlXblLB/Dbnwv3Y= +go.uber.org/mock v0.6.0/go.mod h1:KiVJ4BqZJaMj4svdfmHM0AUx4NJYO8ZNpPnZn1Z+BBU= +go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= +go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= +go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= +go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= +golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8= +golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w= +golang.org/x/net v0.51.0 h1:94R/GTO7mt3/4wIKpcR5gkGmRLOuE/2hNGeWq/GBIFo= +golang.org/x/net v0.51.0/go.mod h1:aamm+2QF5ogm02fjy5Bb7CQ0WMt1/WVM7FtyaTLlA9Y= +golang.org/x/oauth2 v0.34.0 h1:hqK/t4AKgbqWkdkcAeI8XLmbK+4m4G5YeQRrmiotGlw= +golang.org/x/oauth2 v0.34.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= +golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= +golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.44.0 h1:ildZl3J4uzeKP07r2F++Op7E9B29JRUy+a27EibtBTQ= +golang.org/x/sys v0.44.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +golang.org/x/term v0.43.0 h1:S4RLU2sB31O/NCl+zFN9Aru9A/Cq2aqKpTZJ6B+DwT4= +golang.org/x/term v0.43.0/go.mod h1:lrhlHNdQJHO+1qVYiHfFKVuVioJIheAc3fBSMFYEIsk= +golang.org/x/text v0.34.0 h1:oL/Qq0Kdaqxa1KbNeMKwQq0reLCCaFtqu2eNuSeNHbk= +golang.org/x/text v0.34.0/go.mod h1:homfLqTYRFyVYemLBFl5GgL/DWEiH5wcsQ5gSh1yziA= +golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI= +golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4= +golang.org/x/tools v0.42.0 
h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k= +golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0= +google.golang.org/protobuf v1.36.12-0.20260120151049-f2248ac996af h1:+5/Sw3GsDNlEmu7TfklWKPdQ0Ykja5VEmq2i817+jbI= +google.golang.org/protobuf v1.36.12-0.20260120151049-f2248ac996af/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/evanphx/json-patch.v4 v4.13.0 h1:czT3CmqEaQ1aanPc5SdlgQrrEIb8w/wwCvWWnfEbYzo= +gopkg.in/evanphx/json-patch.v4 v4.13.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= +gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= +gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +k8s.io/api v0.35.1 h1:0PO/1FhlK/EQNVK5+txc4FuhQibV25VLSdLMmGpDE/Q= +k8s.io/api v0.35.1/go.mod h1:28uR9xlXWml9eT0uaGo6y71xK86JBELShLy4wR1XtxM= +k8s.io/apimachinery v0.35.1 h1:yxO6gV555P1YV0SANtnTjXYfiivaTPvCTKX6w6qdDsU= +k8s.io/apimachinery v0.35.1/go.mod h1:jQCgFZFR1F4Ik7hvr2g84RTJSZegBc8yHgFWKn//hns= +k8s.io/cli-runtime v0.35.1 h1:uKcXFe8J7AMAM4Gm2JDK4mp198dBEq2nyeYtO+JfGJE= +k8s.io/cli-runtime v0.35.1/go.mod h1:55/hiXIq1C8qIJ3WBrWxEwDLdHQYhBNRdZOz9f7yvTw= +k8s.io/client-go v0.35.1 h1:+eSfZHwuo/I19PaSxqumjqZ9l5XiTEKbIaJ+j1wLcLM= +k8s.io/client-go v0.35.1/go.mod h1:1p1KxDt3a0ruRfc/pG4qT/3oHmUj1AhSHEcxNSGg+OA= +k8s.io/klog/v2 v2.140.0 h1:Tf+J3AH7xnUzZyVVXhTgGhEKnFqye14aadWv7bzXdzc= +k8s.io/klog/v2 
v2.140.0/go.mod h1:o+/RWfJ6PwpnFn7OyAG3QnO47BFsymfEfrz6XyYSSp0= +k8s.io/kube-openapi v0.0.0-20260317180543-43fb72c5454a h1:xCeOEAOoGYl2jnJoHkC3hkbPJgdATINPMAxaynU2Ovg= +k8s.io/kube-openapi v0.0.0-20260317180543-43fb72c5454a/go.mod h1:uGBT7iTA6c6MvqUvSXIaYZo9ukscABYi2btjhvgKGZ0= +k8s.io/utils v0.0.0-20260210185600-b8788abfbbc2 h1:AZYQSJemyQB5eRxqcPky+/7EdBj0xi3g0ZcxxJ7vbWU= +k8s.io/utils v0.0.0-20260210185600-b8788abfbbc2/go.mod h1:xDxuJ0whA3d0I4mf/C4ppKHxXynQ+fxnkmQH0vTHnuk= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= +sigs.k8s.io/kustomize/api v0.21.1 h1:lzqbzvz2CSvsjIUZUBNFKtIMsEw7hVLJp0JeSIVmuJs= +sigs.k8s.io/kustomize/api v0.21.1/go.mod h1:f3wkKByTrgpgltLgySCntrYoq5d3q7aaxveSagwTlwI= +sigs.k8s.io/kustomize/kyaml v0.21.1 h1:IVlbmhC076nf6foyL6Taw4BkrLuEsXUXNpsE+ScX7fI= +sigs.k8s.io/kustomize/kyaml v0.21.1/go.mod h1:hmxADesM3yUN2vbA5z1/YTBnzLJ1dajdqpQonwBL1FQ= +sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= +sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= +sigs.k8s.io/structured-merge-diff/v6 v6.3.2 h1:kwVWMx5yS1CrnFWA/2QHyRVJ8jM6dBA80uLmm0wJkk8= +sigs.k8s.io/structured-merge-diff/v6 v6.3.2/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE= +sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= +sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4= diff --git a/cmd/karta/internal/definitions/community/dynamo.yaml b/cmd/karta/internal/definitions/community/dynamo.yaml new file mode 100644 index 0000000..a005090 --- /dev/null +++ b/cmd/karta/internal/definitions/community/dynamo.yaml @@ -0,0 +1,74 @@ +apiVersion: run.ai/v1alpha1 +kind: Karta +metadata: + name: nvidia-com-dynamographdeployment-v1alpha1 +spec: + structureDefinition: + rootComponent: + name: dynamographdeployment + kind: + group: 
nvidia.com + version: v1alpha1 + kind: DynamoGraphDeployment + statusDefinition: + phaseDefinition: + path: .status.state + statusMappings: + initializing: + - byPhase: pending + running: + - byPhase: successful + failed: + - byPhase: failed + childComponents: + - name: service + ownerRef: dynamographdeployment + specDefinition: + fragmentedPodSpecDefinition: + schedulerNamePath: .spec.services | .[] | .extraPodSpec.schedulerName + labelsPath: .spec.services | .[] | .labels + annotationsPath: .spec.services | .[] | .annotations + resourcesPath: .spec.services | .[] | .resources + resourceClaimsPath: .spec.services | .[] | .extraPodSpec.resourceClaims + podAffinityPath: .spec.services | .[] | .extraPodSpec.affinity.podAffinity + nodeAffinityPath: .spec.services | .[] | .extraPodSpec.affinity.nodeAffinity + containerPath: .spec.services | .[] | .extraPodSpec.mainContainer + priorityClassNamePath: .spec.services | .[] | .extraPodSpec.priorityClassName + imagePath: .spec.services | .[] | .extraPodSpec.mainContainer.image + scaleDefinition: + replicasPath: .spec.services[] | (.replicas // 1) * (.multinode.nodeCount // 1) + minReplicasPath: .spec.services | .[] | .autoscaling.minReplicas + maxReplicasPath: .spec.services | .[] | .autoscaling.maxReplicas + instanceIdPath: .spec.services | to_entries[] | .key + podSelector: + componentInstanceSelector: + idPath: .metadata.labels["nvidia.com/dynamo-component"] + replicaSelector: + keyPath: .metadata.labels["grove.io/podcliquescalinggroup-replica-index"] // .metadata.labels["leaderworkerset.sigs.k8s.io/group-index"] + additionalChildKinds: + - group: nvidia.com + version: v1alpha1 + kind: DynamoComponentDeployment + - group: leaderworkerset.x-k8s.io + version: v1 + kind: LeaderWorkerSet + - group: scheduler.grove.io + version: v1alpha1 + kind: PodGang + - group: grove.io + version: v1alpha1 + kind: PodClique + - group: grove.io + version: v1alpha1 + kind: PodCliqueSet + - group: grove.io + version: v1alpha1 + kind: 
PodCliqueScalingGroup + optimizationInstructions: + gangScheduling: + podGroups: + - name: service + members: + - componentName: service + groupByKeyPaths: + - .metadata.labels["nvidia.com/dynamo-component"] diff --git a/cmd/karta/internal/definitions/community/jobset.yaml b/cmd/karta/internal/definitions/community/jobset.yaml new file mode 100644 index 0000000..5d2af4e --- /dev/null +++ b/cmd/karta/internal/definitions/community/jobset.yaml @@ -0,0 +1,67 @@ +apiVersion: run.ai/v1alpha1 +kind: Karta +metadata: + name: jobset-x-k8s-io-jobset-v1alpha2 +spec: + structureDefinition: + rootComponent: + name: jobset + # specPath omitted for root component (defaults to .spec) + kind: + group: jobset.x-k8s.io + version: v1alpha2 + kind: JobSet + statusDefinition: + conditionsDefinition: + path: .status.conditions + typeFieldName: type + statusFieldName: status + statusMappings: + initializing: + - byConditions: + - type: StartupPolicyInProgress + status: "True" + running: + - byConditions: + - type: StartupPolicyCompleted + status: "True" + - type: Completed + status: "False" + - type: Failed + status: "False" + completed: + - byConditions: + - type: Completed + status: "True" + failed: + - byConditions: + - type: Failed + status: "True" + + childComponents: + # ReplicatedJob - represents the actual Job resources created by JobSet + - name: replicatedjob + kind: + group: batch + version: v1 + kind: Job + ownerRef: jobset + specDefinition: + podTemplateSpecPath: .spec.replicatedJobs[].template.spec.template + scaleDefinition: + replicasPath: .spec.replicatedJobs[].replicas + instanceIdPath: .spec.replicatedJobs[].name + podSelector: + componentInstanceSelector: + idPath: .metadata.labels["jobset.sigs.k8s.io/replicatedjob-name"] + + # Optimization instructions + optimizationInstructions: + # Gang scheduling for coordinated job execution + gangScheduling: + podGroups: + - name: job + members: + - componentName: replicatedjob + groupByKeyPaths: + - .metadata.labels["jobset.sigs.k8s.io/replicatedjob-name"] \ No newline at end
of file diff --git a/cmd/karta/internal/definitions/community/knative-serving.yaml b/cmd/karta/internal/definitions/community/knative-serving.yaml new file mode 100644 index 0000000..4a67008 --- /dev/null +++ b/cmd/karta/internal/definitions/community/knative-serving.yaml @@ -0,0 +1,47 @@ +apiVersion: run.ai/v1alpha1 +kind: Karta +metadata: + name: serving-knative-dev-service-v1 +spec: + structureDefinition: + rootComponent: + name: knativeservice + kind: + group: serving.knative.dev + version: v1 + kind: Service + statusDefinition: + conditionsDefinition: + path: .status.conditions + typeFieldName: type + statusFieldName: status + messageFieldName: message + statusMappings: + running: + - byConditions: + - type: Ready + status: "True" + childComponents: + - name: revision + kind: + group: serving.knative.dev + version: v1 + kind: Revision + ownerRef: knativeservice + specDefinition: + podTemplateSpecPath: .spec.template + scaleDefinition: + minReplicasPath: .spec.template.metadata.annotations["autoscaling.knative.dev/min-scale"] // 1 + maxReplicasPath: .spec.template.metadata.annotations["autoscaling.knative.dev/max-scale"] + additionalChildKinds: + - group: apps + version: v1 + kind: Deployment + optimizationInstructions: + gangScheduling: + podGroups: + - name: revision + members: + - componentName: revision + groupByKeyPaths: + - .metadata.labels["serving.knative.dev/revision"] diff --git a/cmd/karta/internal/definitions/community/kserve.yaml b/cmd/karta/internal/definitions/community/kserve.yaml new file mode 100644 index 0000000..7c41181 --- /dev/null +++ b/cmd/karta/internal/definitions/community/kserve.yaml @@ -0,0 +1,86 @@ +apiVersion: run.ai/v1alpha1 +kind: Karta +metadata: + name: serving-kserve-io-inferenceservice-v1beta1 +spec: + structureDefinition: + rootComponent: + name: inferenceservice + kind: + group: serving.kserve.io + version: v1beta1 + kind: InferenceService + statusDefinition: + conditionsDefinition: + path: .status.conditions + 
typeFieldName: type + statusFieldName: status + messageFieldName: message + statusMappings: + running: + - byConditions: + - type: PredictorReady + status: "True" + - type: RoutesReady + status: "True" + - type: LatestDeploymentReady + status: "True" + failed: + - byConditions: + - type: PredictorReady + status: "False" + - type: PredictorConfigurationReady + status: "False" + - type: RoutesReady + status: "False" + childComponents: + - name: predictor + kind: + group: apps + version: v1 + kind: Deployment + ownerRef: inferenceservice + specDefinition: + fragmentedPodSpecDefinition: + schedulerNamePath: .spec.predictor.schedulerName + labelsPath: .spec.predictor.labels + annotationsPath: .spec.predictor.annotations + podAffinityPath: .spec.predictor.affinity.podAffinity + nodeAffinityPath: .spec.predictor.affinity.nodeAffinity + containersPath: .spec.predictor.containers + containerPath: .spec.predictor | ( (.[]? | select(type =="object" and .storageUri ))) + priorityClassNamePath: .spec.predictor.priorityClassName + scaleDefinition: + minReplicasPath: .spec.predictor.minReplicas + maxReplicasPath: .spec.predictor.maxReplicas + podSelector: + componentTypeSelector: + keyPath: .metadata.labels["component"] + value: predictor + - name: transformer + kind: + group: apps + version: v1 + kind: Deployment + ownerRef: inferenceservice + specDefinition: + podSpecPath: .spec.transformer + metadataPath: .spec.transformer + scaleDefinition: + minReplicasPath: .spec.transformer.minReplicas + maxReplicasPath: .spec.transformer.maxReplicas + podSelector: + componentTypeSelector: + keyPath: .metadata.labels["component"] + value: transformer + optimizationInstructions: + gangScheduling: + podGroups: + - name: service + members: + - componentName: predictor + groupByKeyPaths: + - .metadata.labels["serving.kserve.io/inferenceservice"] + - componentName: transformer + groupByKeyPaths: + - .metadata.labels["serving.kserve.io/inferenceservice"] diff --git 
a/cmd/karta/internal/definitions/community/lws.yaml b/cmd/karta/internal/definitions/community/lws.yaml new file mode 100644 index 0000000..66b0a92 --- /dev/null +++ b/cmd/karta/internal/definitions/community/lws.yaml @@ -0,0 +1,87 @@ +apiVersion: run.ai/v1alpha1 +kind: Karta +metadata: + name: leaderworkerset-x-k8s-io-leaderworkerset-v1 +spec: + structureDefinition: + rootComponent: + name: leaderworkerset + kind: + group: leaderworkerset.x-k8s.io + version: v1 + kind: LeaderWorkerSet + statusDefinition: + conditionsDefinition: + path: .status.conditions + typeFieldName: type + statusFieldName: status + messageFieldName: message + statusMappings: + initializing: + - byConditions: + - type: Available + status: "False" + - type: Progressing + status: "False" + - type: UpdateInProgress + status: "False" + running: + - byConditions: + - type: Available + status: "True" + - type: Progressing + status: "False" + - type: UpdateInProgress + status: "False" + failed: + - byConditions: + - type: Available + status: "False" + - type: Progressing + status: "False" + - type: UpdateInProgress + status: "False" + childComponents: + - name: group + ownerRef: leaderworkerset + scaleDefinition: + replicasPath: .spec.leaderWorkerTemplate.size + podSelector: + replicaSelector: + keyPath: .metadata.labels["leaderworkerset.sigs.k8s.io/group-index"] + - name: leader + kind: + group: apps + version: v1 + kind: StatefulSet + ownerRef: group + specDefinition: + podTemplateSpecPath: .spec.leaderWorkerTemplate.leaderTemplate + scaleDefinition: + replicasPath: .spec.replicas // 1 + podSelector: + componentTypeSelector: + keyPath: .metadata.labels["leaderworkerset.sigs.k8s.io/worker-index"] + value: "0" + - name: worker + kind: + group: apps + version: v1 + kind: StatefulSet + ownerRef: group + specDefinition: + podTemplateSpecPath: .spec.leaderWorkerTemplate.workerTemplate + scaleDefinition: + replicasPath: (.spec.replicas // 1) * ((.spec.leaderWorkerTemplate.size // 1) - 1) + podSelector: + 
componentTypeSelector: + keyPath: .metadata.annotations["leaderworkerset.sigs.k8s.io/leader-name"] + optimizationInstructions: + gangScheduling: + podGroups: + - name: group + members: + - componentName: group + groupByKeyPaths: + - .metadata.labels["leaderworkerset.sigs.k8s.io/name"] + - .metadata.labels["leaderworkerset.sigs.k8s.io/group-index"] // "0" diff --git a/cmd/karta/internal/definitions/community/milvus.yaml b/cmd/karta/internal/definitions/community/milvus.yaml new file mode 100644 index 0000000..600387b --- /dev/null +++ b/cmd/karta/internal/definitions/community/milvus.yaml @@ -0,0 +1,98 @@ +apiVersion: run.ai/v1alpha1 +kind: Karta +metadata: + name: milvus-io-milvus-v1beta1 +spec: + structureDefinition: + rootComponent: + name: milvus + # specPath omitted for root component (defaults to .spec) + kind: + group: milvus.io + version: v1beta1 + kind: Milvus + statusDefinition: + conditionsDefinition: + path: .status.conditions + typeFieldName: type + statusFieldName: status + statusMappings: + initializing: + - byConditions: + - type: EtcdReady + status: "False" + - type: StorageReady + status: "False" + - type: MsgStreamReady + status: "False" + - type: MilvusReady + status: "False" + running: + - byConditions: + - type: EtcdReady + status: "True" + - type: StorageReady + status: "True" + - type: MsgStreamReady + status: "True" + - type: MilvusReady + status: "True" + failed: + - byConditions: + - type: MilvusReady + status: "False" + - type: EtcdReady + status: "False" + - type: StorageReady + status: "False" + - type: MsgStreamReady + status: "False" + + childComponents: + # QueryNode - handles vector search queries + - name: querynode + kind: + group: apps + version: v1 + kind: StatefulSet + ownerRef: milvus + specDefinition: + podTemplateSpecPath: .spec.components.queryNode.template + scaleDefinition: + replicasPath: .spec.components.queryNode.replicas + podSelector: + componentTypeSelector: + keyPath: .metadata.labels["app.kubernetes.io/component"] + value: querynode + + # DataNode -
processes and stores vector data + - name: datanode + kind: + group: apps + version: v1 + kind: StatefulSet + ownerRef: milvus + specDefinition: + podTemplateSpecPath: .spec.components.dataNode.template + scaleDefinition: + replicasPath: .spec.components.dataNode.replicas + podSelector: + componentTypeSelector: + keyPath: .metadata.labels["app.kubernetes.io/component"] + value: datanode + + # Optimization instructions + optimizationInstructions: + # Gang scheduling for Milvus component coordination + gangScheduling: + podGroups: + - name: cluster + members: + - componentName: querynode + groupByKeyPaths: + - .metadata.labels["app.kubernetes.io/instance"] + - componentName: datanode + groupByKeyPaths: + - .metadata.labels["app.kubernetes.io/instance"] + + \ No newline at end of file diff --git a/cmd/karta/internal/definitions/community/mpijob.yaml b/cmd/karta/internal/definitions/community/mpijob.yaml new file mode 100644 index 0000000..5f4c960 --- /dev/null +++ b/cmd/karta/internal/definitions/community/mpijob.yaml @@ -0,0 +1,84 @@ +apiVersion: run.ai/v1alpha1 +kind: Karta +metadata: + name: kubeflow-org-mpijob-v1 +spec: + structureDefinition: + rootComponent: + name: mpijob + kind: + group: kubeflow.org + version: v1 + kind: MPIJob + statusDefinition: + conditionsDefinition: + path: .status.conditions + typeFieldName: type + statusFieldName: status + messageFieldName: message + statusMappings: + initializing: + - byConditions: + - type: Created + status: "True" + - type: Running + status: "False" + running: + - byConditions: + - type: Running + status: "True" + - type: Succeeded + status: "False" + - type: Failed + status: "False" + completed: + - byConditions: + - type: Succeeded + status: "True" + failed: + - byConditions: + - type: Failed + status: "True" + childComponents: + - name: launcher + kind: + group: "" + version: v1 + kind: Pod + ownerRef: mpijob + specDefinition: + podTemplateSpecPath: .spec.mpiReplicaSpecs.Launcher.template + scaleDefinition: + 
replicasPath: .spec.mpiReplicaSpecs.Launcher.replicas // 1 + podSelector: + componentTypeSelector: + keyPath: .metadata.labels["training.kubeflow.org/replica-type"] + value: launcher + - name: worker + kind: + group: "" + version: v1 + kind: Pod + ownerRef: mpijob + specDefinition: + podTemplateSpecPath: .spec.mpiReplicaSpecs.Worker.template + scaleDefinition: + replicasPath: .spec.mpiReplicaSpecs.Worker.replicas // 1 + podSelector: + componentTypeSelector: + keyPath: .metadata.labels["training.kubeflow.org/replica-type"] + value: worker + + # Optimization instructions + optimizationInstructions: + # Gang scheduling for distributed training coordination + gangScheduling: + podGroups: + - name: job + members: + - componentName: launcher + groupByKeyPaths: + - .metadata.labels["training.kubeflow.org/job-name"] + - componentName: worker + groupByKeyPaths: + - .metadata.labels["training.kubeflow.org/job-name"] diff --git a/cmd/karta/internal/definitions/community/nimservice.yaml b/cmd/karta/internal/definitions/community/nimservice.yaml new file mode 100644 index 0000000..a4e3ded --- /dev/null +++ b/cmd/karta/internal/definitions/community/nimservice.yaml @@ -0,0 +1,47 @@ +apiVersion: run.ai/v1alpha1 +kind: Karta +metadata: + name: apps-nvidia-com-nimservice-v1alpha1 +spec: + structureDefinition: + rootComponent: + name: nimservice + kind: + group: apps.nvidia.com + version: v1alpha1 + kind: NIMService + specDefinition: + fragmentedPodSpecDefinition: + schedulerNamePath: .spec.schedulerName + labelsPath: .spec.labels + annotationsPath: .spec.annotations + resourcesPath: .spec.resources + podAffinityPath: .spec.affinity.podAffinity + nodeAffinityPath: .spec.affinity.nodeAffinity + scaleDefinition: + replicasPath: .spec.replicas // 1 + statusDefinition: + phaseDefinition: + path: .status.state + conditionsDefinition: + path: .status.conditions + typeFieldName: type + statusFieldName: status + messageFieldName: message + statusMappings: + initializing: + - byPhase: 
NotReady + running: + - byPhase: Ready + failed: + - byPhase: Failed + # Optimization instructions using new structure + optimizationInstructions: + # Gang scheduling for inference pipeline coordination (preferred for serving) + gangScheduling: + podGroups: + - name: service + members: + - componentName: nimservice + groupByKeyPaths: + - .metadata.labels["app.kubernetes.io/name"] diff --git a/cmd/karta/internal/definitions/community/pytorch.yaml b/cmd/karta/internal/definitions/community/pytorch.yaml new file mode 100644 index 0000000..1f06935 --- /dev/null +++ b/cmd/karta/internal/definitions/community/pytorch.yaml @@ -0,0 +1,93 @@ +apiVersion: run.ai/v1alpha1 +kind: Karta +metadata: + name: kubeflow-org-pytorchjob-v1 +spec: + structureDefinition: + rootComponent: + name: pytorchjob + kind: + group: kubeflow.org + version: v1 + kind: PyTorchJob + # suspendDefinition sets spec.suspend to pause/resume the job natively. + suspendDefinition: + suspendActions: + - path: .spec.suspend + value: "true" + resumeActions: + - path: .spec.suspend + value: "false" + statusDefinition: + conditionsDefinition: + path: .status.conditions + typeFieldName: type + statusFieldName: status + messageFieldName: message + statusMappings: + initializing: + - byConditions: + - type: Created + status: "True" + running: + - byConditions: + - type: Running + status: "True" + completed: + - byConditions: + - type: Succeeded + status: "True" + failed: + - byConditions: + - type: Failed + status: "True" + suspended: + - byConditions: + - type: Suspended + status: "True" + suspending: + - byConditions: + - type: Suspending + status: "True" + childComponents: + - name: master + kind: + group: "" + version: v1 + kind: Pod + ownerRef: pytorchjob + specDefinition: + podTemplateSpecPath: .spec.pytorchReplicaSpecs.Master.template + scaleDefinition: + replicasPath: .spec.pytorchReplicaSpecs.Master.replicas // 1 + podSelector: + componentTypeSelector: + keyPath: 
.metadata.labels["training.kubeflow.org/replica-type"] + value: master + - name: worker + kind: + group: "" + version: v1 + kind: Pod + ownerRef: pytorchjob + specDefinition: + podTemplateSpecPath: .spec.pytorchReplicaSpecs.Worker.template + scaleDefinition: + replicasPath: .spec.pytorchReplicaSpecs.Worker.replicas // 1 + minReplicasPath: .spec.elasticPolicy.minReplicas + maxReplicasPath: .spec.elasticPolicy.maxReplicas + podSelector: + componentTypeSelector: + keyPath: .metadata.labels["training.kubeflow.org/replica-type"] + value: worker + optimizationInstructions: + gangScheduling: + podGroups: + - name: job + members: + - componentName: master + groupByKeyPaths: + - .metadata.labels["training.kubeflow.org/job-name"] + - componentName: worker + groupByKeyPaths: + - .metadata.labels["training.kubeflow.org/job-name"] diff --git a/cmd/karta/internal/definitions/community/raycluster.yaml b/cmd/karta/internal/definitions/community/raycluster.yaml new file mode 100644 index 0000000..68a3f9a --- /dev/null +++ b/cmd/karta/internal/definitions/community/raycluster.yaml @@ -0,0 +1,65 @@ +apiVersion: run.ai/v1alpha1 +kind: Karta +metadata: + name: ray-io-raycluster-v1 +spec: + structureDefinition: + rootComponent: + name: raycluster + kind: + group: ray.io + version: v1 + kind: RayCluster + statusDefinition: + phaseDefinition: + path: .status.state + statusMappings: + running: + - byPhase: ready + failed: + - byPhase: failed + childComponents: + - name: head + kind: + group: "" + version: v1 + kind: Pod + ownerRef: raycluster + specDefinition: + podTemplateSpecPath: .spec.headGroupSpec.template + scaleDefinition: + replicasPath: "1" + podSelector: + componentTypeSelector: + keyPath: .metadata.labels["ray.io/node-type"] + value: head + - name: worker + kind: + group: "" + version: v1 + kind: Pod + ownerRef: raycluster + specDefinition: + podTemplateSpecPath: .spec.workerGroupSpecs[].template + scaleDefinition: + replicasPath: .spec.workerGroupSpecs[].replicas + 
minReplicasPath: .spec.workerGroupSpecs[].minReplicas + maxReplicasPath: .spec.workerGroupSpecs[].maxReplicas + instanceIdPath: .spec.workerGroupSpecs[].groupName + podSelector: + componentTypeSelector: + keyPath: .metadata.labels["ray.io/node-type"] + value: worker + componentInstanceSelector: + idPath: .metadata.labels["ray.io/group"] + optimizationInstructions: + gangScheduling: + podGroups: + - name: cluster + members: + - componentName: head + groupByKeyPaths: + - .metadata.labels["ray.io/cluster"] + - componentName: worker + groupByKeyPaths: + - .metadata.labels["ray.io/cluster"] diff --git a/cmd/karta/internal/definitions/community/rayjob.yaml b/cmd/karta/internal/definitions/community/rayjob.yaml new file mode 100644 index 0000000..e08fed4 --- /dev/null +++ b/cmd/karta/internal/definitions/community/rayjob.yaml @@ -0,0 +1,73 @@ +apiVersion: run.ai/v1alpha1 +kind: Karta +metadata: + name: ray-io-rayjob-v1 +spec: + structureDefinition: + rootComponent: + name: rayjob + kind: + group: ray.io + version: v1 + kind: RayJob + statusDefinition: + phaseDefinition: + path: .status.jobStatus + conditionsDefinition: + path: .status.conditions + typeFieldName: type + statusFieldName: status + messageFieldName: message + reasonFieldName: null + statusMappings: + initializing: + - byPhase: PENDING + running: + - byPhase: RUNNING + completed: + - byPhase: SUCCEEDED + failed: + - byPhase: FAILED + childComponents: + - name: head + kind: + group: "" + version: v1 + kind: Pod + ownerRef: rayjob + specDefinition: + podTemplateSpecPath: .spec.rayClusterSpec.headGroupSpec.template + scaleDefinition: + replicasPath: "1" + podSelector: + componentTypeSelector: + keyPath: .metadata.labels["ray.io/node-type"] + value: head + - name: worker + kind: + group: "" + version: v1 + kind: Pod + ownerRef: rayjob + specDefinition: + podTemplateSpecPath: .spec.rayClusterSpec.workerGroupSpecs[].template + scaleDefinition: + replicasPath: .spec.rayClusterSpec.workerGroupSpecs[].replicas // 1 + 
instanceIdPath: .spec.rayClusterSpec.workerGroupSpecs[].groupName + podSelector: + componentTypeSelector: + keyPath: .metadata.labels["ray.io/node-type"] + value: worker + componentInstanceSelector: + idPath: .metadata.labels["ray.io/group"] + optimizationInstructions: + gangScheduling: + podGroups: + - name: job + members: + - componentName: head + groupByKeyPaths: + - .metadata.labels["ray.io/cluster"] + - componentName: worker + groupByKeyPaths: + - .metadata.labels["ray.io/cluster"] diff --git a/cmd/karta/internal/definitions/registry.go b/cmd/karta/internal/definitions/registry.go new file mode 100644 index 0000000..5d66545 --- /dev/null +++ b/cmd/karta/internal/definitions/registry.go @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (c) 2026 NVIDIA Corporation + +// Package definitions ships the community Karta definitions that the karta +// CLI knows about out of the box. Each definition is embedded at build time +// and indexed by the workload GroupVersionKind it targets, so commands like +// `karta workload tree` can resolve a definition from the workload object's +// GVK without touching the cluster. +package definitions + +import ( + "embed" + "fmt" + "io/fs" + "strings" + "sync" + + "k8s.io/apimachinery/pkg/runtime/schema" + "sigs.k8s.io/yaml" + + "github.com/run-ai/karta/pkg/api/runai/v1alpha1" +) + +//go:embed community/*.yaml +var communityFS embed.FS + +// Registry holds the parsed community definitions, keyed by the GVK they target. +type Registry struct { + byGVK map[schema.GroupVersionKind]*v1alpha1.Karta +} + +// Lookup returns the community definition for gvk, or nil when none is known. +func (r *Registry) Lookup(gvk schema.GroupVersionKind) *v1alpha1.Karta { + if r == nil { + return nil + } + return r.byGVK[gvk] +} + +// All returns every loaded community definition.
+func (r *Registry) All() []*v1alpha1.Karta { + out := make([]*v1alpha1.Karta, 0, len(r.byGVK)) + for _, k := range r.byGVK { + out = append(out, k) + } + return out +} + +var ( + once sync.Once + loaded *Registry + loadErr error +) + +// Load parses every embedded community definition and indexes it by GVK. +// The result is cached for the lifetime of the process. +func Load() (*Registry, error) { + once.Do(func() { + reg := &Registry{byGVK: map[schema.GroupVersionKind]*v1alpha1.Karta{}} + err := fs.WalkDir(communityFS, "community", func(path string, d fs.DirEntry, walkErr error) error { + if walkErr != nil { + return walkErr + } + if d.IsDir() || !strings.HasSuffix(path, ".yaml") { + return nil + } + data, err := communityFS.ReadFile(path) + if err != nil { + return fmt.Errorf("read %s: %w", path, err) + } + k := &v1alpha1.Karta{} + if err := yaml.Unmarshal(data, k); err != nil { + return fmt.Errorf("parse %s: %w", path, err) + } + gvk := rootGVK(k) + if gvk.Kind == "" { + return fmt.Errorf("%s: root component has no kind", path) + } + reg.byGVK[gvk] = k + return nil + }) + if err != nil { + loadErr = err + return + } + loaded = reg + }) + return loaded, loadErr +} + +func rootGVK(k *v1alpha1.Karta) schema.GroupVersionKind { + root := k.Spec.StructureDefinition.RootComponent + if root.Kind == nil { + return schema.GroupVersionKind{} + } + return schema.GroupVersionKind{ + Group: root.Kind.Group, + Version: root.Kind.Version, + Kind: root.Kind.Kind, + } +} diff --git a/cmd/karta/internal/kube/client.go b/cmd/karta/internal/kube/client.go new file mode 100644 index 0000000..d8ef8bf --- /dev/null +++ b/cmd/karta/internal/kube/client.go @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (c) 2026 NVIDIA Corporation + +// Package kube wires the standard kubectl-style configuration flags into +// the dynamic client and REST mapper that the karta CLI uses to read +// workload objects and pods. 
It deliberately wraps genericclioptions so +// every command sees the same kubeconfig / context / namespace plumbing. +package kube + +import ( + "fmt" + + "k8s.io/cli-runtime/pkg/genericclioptions" + "k8s.io/client-go/dynamic" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + "k8s.io/client-go/restmapper" +) + +// Client bundles the kube clients the karta CLI needs. +type Client struct { + config *rest.Config + dynamic dynamic.Interface + core kubernetes.Interface + mapper *restmapper.DeferredDiscoveryRESTMapper + namespace string +} + +// NewClient resolves kubeconfig + context + namespace from standard kubectl flags +// and returns a Client ready to read workloads and pods. +func NewClient(flags *genericclioptions.ConfigFlags) (*Client, error) { + cfg, err := flags.ToRESTConfig() + if err != nil { + return nil, fmt.Errorf("load kubeconfig: %w", err) + } + + dyn, err := dynamic.NewForConfig(cfg) + if err != nil { + return nil, fmt.Errorf("dynamic client: %w", err) + } + + core, err := kubernetes.NewForConfig(cfg) + if err != nil { + return nil, fmt.Errorf("kubernetes client: %w", err) + } + + disc, err := flags.ToDiscoveryClient() + if err != nil { + return nil, fmt.Errorf("discovery client: %w", err) + } + mapper := restmapper.NewDeferredDiscoveryRESTMapper(disc) + + ns, _, err := flags.ToRawKubeConfigLoader().Namespace() + if err != nil { + return nil, fmt.Errorf("resolve namespace: %w", err) + } + + return &Client{ + config: cfg, + dynamic: dyn, + core: core, + mapper: mapper, + namespace: ns, + }, nil +} + +// Dynamic returns the dynamic client used to read arbitrary CRD objects. +func (c *Client) Dynamic() dynamic.Interface { return c.dynamic } + +// Core returns the typed kubernetes client used to list pods. +func (c *Client) Core() kubernetes.Interface { return c.core } + +// Mapper returns the REST mapper used to resolve GVK ↔ GVR. 
+func (c *Client) Mapper() *restmapper.DeferredDiscoveryRESTMapper { return c.mapper } + +// Namespace returns the namespace resolved from --namespace or the current context. +func (c *Client) Namespace() string { return c.namespace } diff --git a/cmd/karta/internal/loader/workload.go b/cmd/karta/internal/loader/workload.go new file mode 100644 index 0000000..af86971 --- /dev/null +++ b/cmd/karta/internal/loader/workload.go @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (c) 2026 NVIDIA Corporation + +// Package loader resolves a workload object and its candidate pods from the +// live cluster, given a name and namespace, so the tree builder can run. +// +// The loader is deliberately Karta-aware: it iterates the registry of known +// community definitions, lists each definition's target GVK in the namespace, +// and matches by name. This means the user can run `karta workload tree +// my-job` without having to remember whether it's a PyTorchJob or a +// RayCluster — the CLI figures it out. +package loader + +import ( + "context" + "fmt" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + + "github.com/run-ai/karta/cmd/karta/internal/definitions" + "github.com/run-ai/karta/cmd/karta/internal/kube" + "github.com/run-ai/karta/pkg/api/runai/v1alpha1" +) + +// Resolved is the input set the tree builder needs. +type Resolved struct { + Karta *v1alpha1.Karta + Workload *unstructured.Unstructured + Pods []corev1.Pod +} + +// Discovered is one Karta-aware workload found while walking a namespace. +// It carries everything needed to assemble the workload's tree without a +// second cluster round-trip per workload. 
+type Discovered struct { + Karta *v1alpha1.Karta + Workload *unstructured.Unstructured +} + +// FindWorkload locates a workload by name across all GVKs the registry knows +// about. It errors if the name is ambiguous across multiple kinds in the +// namespace, or if no Karta-aware workload matches. +func FindWorkload(ctx context.Context, client *kube.Client, registry *definitions.Registry, namespace, name string) (*Resolved, error) { + type hit struct { + gvk schema.GroupVersionKind + obj *unstructured.Unstructured + karta *v1alpha1.Karta + } + + var hits []hit + + for _, k := range registry.All() { + root := k.Spec.StructureDefinition.RootComponent + if root.Kind == nil { + continue + } + gvk := schema.GroupVersionKind{Group: root.Kind.Group, Version: root.Kind.Version, Kind: root.Kind.Kind} + mapping, err := client.Mapper().RESTMapping(gvk.GroupKind(), gvk.Version) + if err != nil { + // Kind not installed in cluster; skip this definition silently. + continue + } + obj, err := client.Dynamic().Resource(mapping.Resource).Namespace(namespace).Get(ctx, name, metav1.GetOptions{}) + if err != nil { + if apierrors.IsNotFound(err) { + continue + } + return nil, fmt.Errorf("get %s/%s: %w", gvk.Kind, name, err) + } + hits = append(hits, hit{gvk: gvk, obj: obj, karta: k}) + } + + switch len(hits) { + case 0: + return nil, fmt.Errorf("no Karta-aware workload named %q in namespace %q (try kubectl get to list raw resources)", name, namespace) + case 1: + // fall through + default: + kinds := make([]string, 0, len(hits)) + for _, h := range hits { + kinds = append(kinds, h.gvk.Kind) + } + return nil, fmt.Errorf("workload name %q is ambiguous across kinds: %v — disambiguate by deleting one or specifying kind explicitly (not yet supported)", name, kinds) + } + + h := hits[0] + pods, err := client.Core().CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{}) + if err != nil { + return nil, fmt.Errorf("list pods in %q: %w", namespace, err) + } + + return &Resolved{Karta: 
h.karta, Workload: h.obj, Pods: pods.Items}, nil +} + +// ListWorkloads returns every Karta-aware workload in the namespace, plus +// the namespace's full pod list (so the caller can build trees per workload +// without re-listing pods). GVKs whose CRDs aren't installed in the cluster +// are silently skipped, mirroring FindWorkload. +func ListWorkloads(ctx context.Context, client *kube.Client, registry *definitions.Registry, namespace string) ([]Discovered, []corev1.Pod, error) { + var found []Discovered + + for _, k := range registry.All() { + root := k.Spec.StructureDefinition.RootComponent + if root.Kind == nil { + continue + } + gvk := schema.GroupVersionKind{Group: root.Kind.Group, Version: root.Kind.Version, Kind: root.Kind.Kind} + mapping, err := client.Mapper().RESTMapping(gvk.GroupKind(), gvk.Version) + if err != nil { + continue + } + list, err := client.Dynamic().Resource(mapping.Resource).Namespace(namespace).List(ctx, metav1.ListOptions{}) + if err != nil { + if apierrors.IsNotFound(err) { + continue + } + return nil, nil, fmt.Errorf("list %s in %q: %w", gvk.Kind, namespace, err) + } + for i := range list.Items { + obj := list.Items[i] + found = append(found, Discovered{Karta: k, Workload: &obj}) + } + } + + pods, err := client.Core().CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{}) + if err != nil { + return nil, nil, fmt.Errorf("list pods in %q: %w", namespace, err) + } + + return found, pods.Items, nil +} diff --git a/cmd/karta/internal/render/style.go b/cmd/karta/internal/render/style.go new file mode 100644 index 0000000..4b11b7c --- /dev/null +++ b/cmd/karta/internal/render/style.go @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (c) 2026 NVIDIA Corporation + +package render + +import ( + "io" + "os" + "strings" + + "golang.org/x/term" +) + +// Style holds the palette used by the renderers. A zero Style emits plain +// text — useful for tests, pipes, and NO_COLOR environments. 
+type Style struct { + enabled bool +} + +// AutoStyle returns a Style with colors enabled if w is a terminal and the +// NO_COLOR convention isn't set. Anything else returns a no-op Style. +func AutoStyle(w io.Writer) Style { + if _, ok := os.LookupEnv("NO_COLOR"); ok { + return Style{} + } + f, ok := w.(*os.File) + if !ok { + return Style{} + } + if !term.IsTerminal(int(f.Fd())) { + return Style{} + } + return Style{enabled: true} +} + +// PlainStyle is a no-op style — every wrapper returns its input unchanged. +func PlainStyle() Style { return Style{} } + +// ForceStyle returns a Style with colors enabled regardless of TTY status — +// useful when the user passes --color=always. +func ForceStyle() Style { return Style{enabled: true} } + +const ( + resetSeq = "\x1b[0m" + + bold = "\x1b[1m" + dim = "\x1b[2m" + red = "\x1b[31m" + green = "\x1b[32m" + yellow = "\x1b[33m" + blue = "\x1b[34m" + magenta = "\x1b[35m" + cyan = "\x1b[36m" + brightCyan = "\x1b[96m" +) + +func (s Style) wrap(seq, text string) string { + if !s.enabled || text == "" { + return text + } + return seq + text + resetSeq +} + +func (s Style) Bold(t string) string { return s.wrap(bold, t) } +func (s Style) Dim(t string) string { return s.wrap(dim, t) } +func (s Style) Red(t string) string { return s.wrap(red, t) } +func (s Style) Green(t string) string { return s.wrap(green, t) } +func (s Style) Yellow(t string) string { return s.wrap(yellow, t) } +func (s Style) Blue(t string) string { return s.wrap(blue, t) } +func (s Style) Magenta(t string) string { return s.wrap(magenta, t) } +func (s Style) Cyan(t string) string { return s.wrap(cyan, t) } + +// Header is the bold-cyan "Kind/Name" lead-in. +func (s Style) Header(t string) string { return s.wrap(bold+brightCyan, t) } + +// Phase colors a phase name by its semantics. Unrecognized phases render dim. 
+func (s Style) Phase(p string) string { + switch p { + case "Running", "Succeeded", "Completed": + return s.Green(p) + case "Initializing", "Pending", "Progressing": + return s.Yellow(p) + case "Failed", "Degraded": + return s.Red(p) + case "Undefined", "": + return s.Dim(p) + default: + return s.Bold(p) + } +} + +// Phases joins and colors a phase list. +func (s Style) Phases(ps []string) string { + if len(ps) == 0 { + return s.Dim("-") + } + parts := make([]string, len(ps)) + for i, p := range ps { + parts[i] = s.Phase(p) + } + return strings.Join(parts, ",") +} + +// Ratio colors a "got/want" pair: green when got >= want, yellow when partial, +// red when nothing is ready against a non-zero want. +func (s Style) Ratio(got, want int32, suffix string) string { + body := suffix + if body != "" { + body = " " + body + } + text := formatRatio(got, want) + body + switch { + case want == 0: + return s.Dim(text) + case got >= want: + return s.Green(text) + case got == 0: + return s.Red(text) + default: + return s.Yellow(text) + } +} + +func formatRatio(a, b int32) string { + return itoa(int(a)) + "/" + itoa(int(b)) +} + +func itoa(n int) string { + if n == 0 { + return "0" + } + neg := n < 0 + if neg { + n = -n + } + var buf [20]byte + i := len(buf) + for n > 0 { + i-- + buf[i] = byte('0' + n%10) + n /= 10 + } + if neg { + i-- + buf[i] = '-' + } + return string(buf[i:]) +} diff --git a/cmd/karta/internal/render/table.go b/cmd/karta/internal/render/table.go new file mode 100644 index 0000000..57a6dd6 --- /dev/null +++ b/cmd/karta/internal/render/table.go @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (c) 2026 NVIDIA Corporation + +package render + +import ( + "fmt" + "io" + "regexp" + "strings" + "time" + "unicode/utf8" +) + +// ansiCSIRe matches ANSI CSI escape sequences (SGR color codes etc.) so we +// can compute the visible width of a styled cell. 
// ansiCSIRe matches an ANSI CSI escape sequence: ESC, '[', any run of digits
// and semicolons, then a single ASCII letter.
var ansiCSIRe = regexp.MustCompile(`\x1b\[[0-9;]*[A-Za-z]`)

// visibleWidth reports the rune count of s once every CSI escape sequence
// has been removed.
func visibleWidth(s string) int {
	stripped := ansiCSIRe.ReplaceAllString(s, "")
	return utf8.RuneCountInString(stripped)
}
buf.String()) + return err +} + +func componentsColored(comps []ComponentSummary, s Style) string { + if len(comps) == 0 { + return s.Dim("-") + } + parts := make([]string, 0, len(comps)) + for _, c := range comps { + parts = append(parts, s.Cyan(c.Name)+s.Dim(fmt.Sprintf("(%d)", c.CurrentReplicas))) + } + return strings.Join(parts, s.Dim(", ")) +} + +func gpuTableCell(n int64, s Style) string { + if n == 0 { + return s.Dim("0") + } + return s.Bold(s.Magenta(fmt.Sprintf("%d", n))) +} + +func formatComponents(comps []ComponentSummary) string { + if len(comps) == 0 { + return "-" + } + parts := make([]string, 0, len(comps)) + for _, c := range comps { + parts = append(parts, fmt.Sprintf("%s(%d)", c.Name, c.CurrentReplicas)) + } + return strings.Join(parts, ", ") +} + +func formatAge(d time.Duration) string { + if d <= 0 { + return "-" + } + switch { + case d < time.Minute: + return fmt.Sprintf("%ds", int(d.Seconds())) + case d < time.Hour: + return fmt.Sprintf("%dm", int(d.Minutes())) + case d < 24*time.Hour: + return fmt.Sprintf("%dh", int(d.Hours())) + default: + return fmt.Sprintf("%dd", int(d.Hours()/24)) + } +} diff --git a/cmd/karta/internal/render/table_test.go b/cmd/karta/internal/render/table_test.go new file mode 100644 index 0000000..f8ae614 --- /dev/null +++ b/cmd/karta/internal/render/table_test.go @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (c) 2026 NVIDIA Corporation + +package render + +import ( + "bytes" + "strings" + "testing" + "time" +) + +func TestListRendering(t *testing.T) { + rows := []ListRow{ + { + Namespace: "ml", + Name: "llama-finetune", + Kind: "PyTorchJob", + Phases: []string{"Running"}, + Components: []ComponentSummary{ + {Name: "master", CurrentReplicas: 1}, + {Name: "worker", CurrentReplicas: 4}, + }, + GPU: 33, + Age: 2 * time.Hour, + }, + { + Namespace: "ml", + Name: "preprocess", + Kind: "JobSet", + Phases: []string{"Completed"}, + Components: []ComponentSummary{{Name: "etl", CurrentReplicas: 3}}, + GPU: 
0, + Age: 45 * time.Minute, + }, + } + + var buf bytes.Buffer + if err := List(&buf, rows, PlainStyle()); err != nil { + t.Fatalf("List render: %v", err) + } + got := buf.String() + + wants := []string{ + "NAMESPACE", + "llama-finetune", + "PyTorchJob", + "master(1), worker(4)", + "33", + "2h", + "preprocess", + "etl(3)", + "45m", + "Completed", + } + for _, w := range wants { + if !strings.Contains(got, w) { + t.Errorf("expected output to contain %q, got:\n%s", w, got) + } + } +} + +func TestFormatAge(t *testing.T) { + cases := map[time.Duration]string{ + 0: "-", + 30 * time.Second: "30s", + 5 * time.Minute: "5m", + 3 * time.Hour: "3h", + 48 * time.Hour: "2d", + } + for d, want := range cases { + if got := formatAge(d); got != want { + t.Errorf("formatAge(%v): got %q want %q", d, got, want) + } + } +} diff --git a/cmd/karta/internal/render/tree.go b/cmd/karta/internal/render/tree.go new file mode 100644 index 0000000..d20d1c3 --- /dev/null +++ b/cmd/karta/internal/render/tree.go @@ -0,0 +1,280 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (c) 2026 NVIDIA Corporation + +package render + +import ( + "fmt" + "io" + "strings" + "unicode/utf8" +) + +// visibleLen returns the number of visible terminal columns a string takes, +// approximated as the rune count. For ASCII this equals byte length; for the +// box-drawing glyphs in tree branches (│ ├ └ ─) it correctly returns 1 per +// glyph instead of the 3 bytes their UTF-8 encoding occupies. +func visibleLen(s string) int { return utf8.RuneCountInString(s) } + +// columnGap is the number of spaces inserted between aligned columns. +// Two spaces is the kubectl convention; tight enough to feel dense, +// wide enough to read. +const columnGap = " " + +// Tree writes a styled ASCII workload tree to w. Pass PlainStyle() (or use +// AutoStyle on os.Stdout) to control whether ANSI color is emitted. 
+// +// Layout strategy: +// - Per-sibling-group widths for name / replicas / ready / podname / phase +// so siblings align without trailing whitespace bleeding across the tree +// when names vary in length between branches. +// - Global widths for the GPU and nodes/node columns so those right-side +// fields land at the same X coordinate on every row, including across +// component and pod rows. The renderer pads the leading section of each +// row to a globally-computed target before writing gpu. +// +// Padding is computed from plain-text widths and applied as trailing +// spaces, so ANSI escape sequences inside styled segments don't affect +// the alignment math. +func Tree(w io.Writer, view WorkloadView, s Style) error { + header := fmt.Sprintf("%s/%s", view.Kind, view.Name) + fmt.Fprintf(w, "%s [%s]\n", s.Header(header), s.Phases(view.Phases)) + + layout := computeLayout(view.Components, "") + widths := computeComponentWidths(view.Components) + for i, c := range view.Components { + writeComponentAt(w, c, "", i == len(view.Components)-1, widths, layout, s) + } + return nil +} + +// treeLayout captures the global X-coordinate targets that gpu and the +// trailing column should land on. maxLeading is the visible width from +// column 0 up to (but not including) the gpu cell; maxGPU is the gpu cell +// width itself, so the trailing nodes/node column lands at the same X for +// every row. +type treeLayout struct { + maxLeading int + maxGPU int +} + +// computeLayout walks the tree once and returns the global X-coordinate +// targets needed to align the gpu and trailing columns across both +// component and pod rows. 
+func computeLayout(comps []ComponentView, parentPrefix string) treeLayout { + var l treeLayout + if len(comps) == 0 { + return l + } + cw := computeComponentWidths(comps) + + for i, c := range comps { + branch := "├─" + childPrefix := parentPrefix + "│ " + if i == len(comps)-1 { + branch = "└─" + childPrefix = parentPrefix + " " + } + + compLead := visibleLen(parentPrefix) + visibleLen(branch) + 1 + + cw.name + visibleLen(columnGap) + + cw.replicas + visibleLen(columnGap) + + cw.ready + if compLead > l.maxLeading { + l.maxLeading = compLead + } + if cw.gpu > l.maxGPU { + l.maxGPU = cw.gpu + } + + child := computeLayout(c.Children, childPrefix) + if child.maxLeading > l.maxLeading { + l.maxLeading = child.maxLeading + } + if child.maxGPU > l.maxGPU { + l.maxGPU = child.maxGPU + } + + if len(c.Pods) > 0 { + pw := computePodWidths(c.Pods) + for j := range c.Pods { + podBranch := "├─" + if j == len(c.Pods)-1 { + podBranch = "└─" + } + podLead := visibleLen(childPrefix) + visibleLen(podBranch) + 1 + + pw.name + visibleLen(columnGap) + pw.phase + if podLead > l.maxLeading { + l.maxLeading = podLead + } + } + if pw.gpu > l.maxGPU { + l.maxGPU = pw.gpu + } + } + } + return l +} + +// componentColWidths captures the maximum plain-text length of each +// component-row column across a sibling set. +type componentColWidths struct { + name, replicas, ready, gpu int +} + +// podColWidths captures the maximum plain-text length of each pod-row +// column across a sibling set. 
+type podColWidths struct { + name, phase, gpu int +} + +func writeComponentAt(w io.Writer, c ComponentView, parentPrefix string, isLast bool, widths componentColWidths, layout treeLayout, s Style) { + branch := "├─" + childPrefix := parentPrefix + "│ " + if isLast { + branch = "└─" + childPrefix = parentPrefix + " " + } + + namePlain, repPlain, readyPlain, gpuPlain := componentFields(c) + nameStyled := padTo(s.Bold(s.Cyan(c.Name)), len(namePlain), widths.name) + repStyled := padTo("("+s.Ratio(c.CurrentReplicas, c.DesiredReplicas, "replicas")+")", len(repPlain), widths.replicas) + readyStyled := padTo(s.Ratio(c.ReadyCount, c.CurrentReplicas, "ready"), len(readyPlain), widths.ready) + gpuStyled := padTo(gpuLabel(c.GPUs, s), len(gpuPlain), layout.maxGPU) + + leadingPlain := visibleLen(parentPrefix) + visibleLen(branch) + 1 + + widths.name + visibleLen(columnGap) + + widths.replicas + visibleLen(columnGap) + + widths.ready + leadingPad := layout.maxLeading - leadingPlain + if leadingPad < 0 { + leadingPad = 0 + } + + fmt.Fprintf(w, "%s%s %s%s%s%s%s%s%s%s%s\n", + s.Dim(parentPrefix), + s.Dim(branch), + nameStyled, columnGap, + repStyled, columnGap, + readyStyled, + strings.Repeat(" ", leadingPad)+columnGap, + gpuStyled, columnGap, + nodeListDim(c.Nodes, s), + ) + + if len(c.Children) > 0 { + childWidths := computeComponentWidths(c.Children) + for j, child := range c.Children { + writeComponentAt(w, child, childPrefix, j == len(c.Children)-1, childWidths, layout, s) + } + } + + if len(c.Pods) > 0 { + podWidths := computePodWidths(c.Pods) + for j, p := range c.Pods { + writePod(w, p, childPrefix, j == len(c.Pods)-1, podWidths, layout, s) + } + } +} + +func writePod(w io.Writer, p PodView, parentPrefix string, isLast bool, widths podColWidths, layout treeLayout, s Style) { + branch := "├─" + if isLast { + branch = "└─" + } + namePlain, phasePlain, gpuPlain := podFields(p) + nameStyled := padTo(s.Dim("Pod/")+p.Name, len(namePlain), widths.name) + phaseStyled := 
padTo(s.Phase(p.Phase), len(phasePlain), widths.phase) + gpuStyled := padTo(gpuLabel(p.GPUs, s), len(gpuPlain), layout.maxGPU) + + node := p.Node + if node == "" { + node = "" + } + nodeStyled := s.Dim(node) + + leadingPlain := visibleLen(parentPrefix) + visibleLen(branch) + 1 + + widths.name + visibleLen(columnGap) + widths.phase + leadingPad := layout.maxLeading - leadingPlain + if leadingPad < 0 { + leadingPad = 0 + } + + fmt.Fprintf(w, "%s%s %s%s%s%s%s%s%s\n", + s.Dim(parentPrefix), + s.Dim(branch), + nameStyled, columnGap, + phaseStyled, + strings.Repeat(" ", leadingPad)+columnGap, + gpuStyled, columnGap, + nodeStyled, + ) +} + +func componentFields(c ComponentView) (name, replicas, ready, gpu string) { + name = c.Name + replicas = fmt.Sprintf("(%d/%d replicas)", c.CurrentReplicas, c.DesiredReplicas) + ready = fmt.Sprintf("%d/%d ready", c.ReadyCount, c.CurrentReplicas) + gpu = fmt.Sprintf("gpu: %d", c.GPUs) + return +} + +func podFields(p PodView) (name, phase, gpu string) { + name = "Pod/" + p.Name + phase = p.Phase + gpu = fmt.Sprintf("gpu: %d", p.GPUs) + return +} + +func computeComponentWidths(comps []ComponentView) componentColWidths { + var cw componentColWidths + for _, c := range comps { + n, r, rd, g := componentFields(c) + cw.name = maxInt(cw.name, len(n)) + cw.replicas = maxInt(cw.replicas, len(r)) + cw.ready = maxInt(cw.ready, len(rd)) + cw.gpu = maxInt(cw.gpu, len(g)) + } + return cw +} + +func computePodWidths(pods []PodView) podColWidths { + var pw podColWidths + for _, p := range pods { + n, ph, g := podFields(p) + pw.name = maxInt(pw.name, len(n)) + pw.phase = maxInt(pw.phase, len(ph)) + pw.gpu = maxInt(pw.gpu, len(g)) + } + return pw +} + +// padTo right-pads styled with spaces so its visible width reaches w. +// plainLen is the visible length of styled (i.e. excluding ANSI escapes). 
// padTo appends spaces to styled until its visible width is w. plainLen must
// be the visible length of styled (ANSI escapes excluded); when plainLen
// already meets or exceeds w the input is returned unchanged.
func padTo(styled string, plainLen, w int) string {
	missing := w - plainLen
	if missing <= 0 {
		return styled
	}
	return styled + strings.Repeat(" ", missing)
}

// maxInt returns the larger of a and b.
func maxInt(a, b int) int {
	if b > a {
		return b
	}
	return a
}
column (no "nodes:" label) + "├─ master (1/1 replicas) 1/1 ready gpu: 1 node-01", + "└─ worker (4/4 replicas) 3/4 ready gpu: 32 node-02,node-03,node-04", + "│ └─ Pod/demo-master-0 Running gpu: 1 node-01", + " └─ Pod/demo-worker-3 Pending gpu: 8 ", + } + for _, w := range wants { + if !strings.Contains(got, w) { + t.Errorf("expected output to contain %q, got:\n%s", w, got) + } + } +} + +func TestPhasesString(t *testing.T) { + if got := PhasesString(nil); got != "-" { + t.Errorf("empty phases want \"-\", got %q", got) + } + if got := PhasesString([]string{"Running"}); got != "Running" { + t.Errorf("single phase want \"Running\", got %q", got) + } + if got := PhasesString([]string{"Running", "Degraded"}); got != "Running,Degraded" { + t.Errorf("multi phase want \"Running,Degraded\", got %q", got) + } +} + +func TestFormatNodes(t *testing.T) { + if got := FormatNodes(nil); got != "" { + t.Errorf("empty nodes want , got %q", got) + } + if got := FormatNodes([]string{"a", "b"}); got != "a,b" { + t.Errorf("nodes want a,b, got %q", got) + } +} diff --git a/cmd/karta/internal/render/view.go b/cmd/karta/internal/render/view.go new file mode 100644 index 0000000..a4a1a46 --- /dev/null +++ b/cmd/karta/internal/render/view.go @@ -0,0 +1,370 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (c) 2026 NVIDIA Corporation + +// Package render computes the CLI display layer on top of pkg/tree's raw +// WorkloadTree. The split mirrors the HLD: pkg/tree is the shared data model +// every consumer can use; the display fields (ready counts, GPU sums, +// rendered tree text) are derived here on traversal. +package render + +import ( + "fmt" + "sort" + "strings" + + corev1 "k8s.io/api/core/v1" + + "github.com/run-ai/karta/pkg/tree" +) + +// WorkloadView is the display-shaped projection of a WorkloadTree, with all +// the aggregate fields the CLI needs to render. 
+type WorkloadView struct { + Kind string + Name string + Namespace string + Phases []string + Components []ComponentView +} + +// ComponentView holds the rendered fields for a tree component. +type ComponentView struct { + Name string + DesiredReplicas int32 + CurrentReplicas int32 + ReadyCount int32 + GPUs int64 + Nodes []string + Pods []PodView + Children []ComponentView +} + +// PodView holds the rendered fields for a single pod under a component. +type PodView struct { + Name string + Phase string + Ready bool + Node string + GPUs int64 +} + +// Build computes a WorkloadView from a raw WorkloadTree and the kind / name / +// namespace pulled off the workload object. +func Build(t *tree.WorkloadTree, kind, name, namespace string) WorkloadView { + wv := WorkloadView{ + Kind: kind, + Name: name, + Namespace: namespace, + Phases: append([]string(nil), t.Status.Phases...), + } + for _, c := range t.Children { + wv.Components = append(wv.Components, buildComponent(c)) + } + return wv +} + +func buildComponent(c tree.ComponentNode) ComponentView { + // Multi-instance components (Dynamo's "service" split into Frontend / + // PrefillWorker / DecodeWorker) render as a parent with one synthetic + // child per instance, so each instance gets its own replica count, GPU + // roll-up, and pod list — matching the HLD's example output. + if isMultiInstance(c) { + return buildMultiInstanceComponent(c) + } + + cv := ComponentView{Name: c.Name} + nodeSet := map[string]struct{}{} + + var podsAll []*corev1.Pod + for _, inst := range c.Instances { + podsAll = append(podsAll, inst.Pods...) 
+ if inst.Scale != nil && inst.Scale.Replicas != nil { + cv.DesiredReplicas += *inst.Scale.Replicas + } + for _, child := range inst.Children { + cv.Children = append(cv.Children, buildComponent(child)) + } + } + for _, p := range podsAll { + cv.CurrentReplicas++ + pv := podView(p) + cv.GPUs += pv.GPUs + if pv.Ready { + cv.ReadyCount++ + } + if pv.Node != "" { + nodeSet[pv.Node] = struct{}{} + } + cv.Pods = append(cv.Pods, pv) + } + + if cv.DesiredReplicas == 0 { + cv.DesiredReplicas = cv.CurrentReplicas + } + + for n := range nodeSet { + cv.Nodes = append(cv.Nodes, n) + } + sort.Strings(cv.Nodes) + sort.SliceStable(cv.Pods, func(i, j int) bool { return cv.Pods[i].Name < cv.Pods[j].Name }) + + // When children exist they own pod-level rendering; clear the per-component + // Pods list so we don't double-render under a parent that's just grouping. + if len(cv.Children) > 0 { + cv.Pods = nil + } + + return cv +} + +// isMultiInstance reports whether a ComponentNode carries the split shape: +// more than one InstanceNode, with at least one InstanceKey or ReplicaKey +// set. Both shapes render as a parent + per-key sub-components. +func isMultiInstance(c tree.ComponentNode) bool { + if len(c.Instances) <= 1 { + return false + } + for _, inst := range c.Instances { + if inst.InstanceKey != nil || inst.ReplicaKey != nil { + return true + } + } + return false +} + +// buildMultiInstanceComponent flattens the InstanceNodes of a split +// component into per-key ComponentViews, each rendered as if it were its +// own component. Counts, GPU sums, and node lists roll up to the parent +// so the table view sees an aggregate. +// +// Naming convention: instance-keyed splits (Dynamo Frontend / PrefillWorker) +// use the bare instance name; replica-keyed splits (LWS group-0 / group-1) +// use "[]" so the wrapper is obvious. 
+func buildMultiInstanceComponent(c tree.ComponentNode) ComponentView { + parent := ComponentView{Name: c.Name} + parentNodes := map[string]struct{}{} + + // Count how many replica wrappers we have; descendants' Karta-extracted + // DesiredReplicas counts are workload-global (`.spec.replicas // 1` + // returns LWS-level replicas, not per-replica), so we divide each + // grand-child's DesiredReplicas by this count when rendering inside a + // replica wrapper. + replicaCount := int32(0) + for _, inst := range c.Instances { + if inst.ReplicaKey != nil { + replicaCount++ + } + } + + for _, inst := range c.Instances { + var label string + isReplica := false + switch { + case inst.InstanceKey != nil: + label = *inst.InstanceKey + case inst.ReplicaKey != nil: + label = c.Name + "[" + *inst.ReplicaKey + "]" + isReplica = true + default: + continue + } + + child := ComponentView{Name: label} + childNodes := map[string]struct{}{} + + // Recurse into nested children (e.g. leader/worker under each LWS replica). + for _, gc := range inst.Children { + gcView := buildComponent(gc) + if isReplica && replicaCount > 0 { + gcView.DesiredReplicas = perReplicaDesired(gcView.DesiredReplicas, replicaCount) + } + child.Children = append(child.Children, gcView) + } + + // Counting: when this instance has children, the children own the + // pod ledger (their CurrentReplicas already covers every pod in the + // subtree). Counting inst.Pods on top would double-count, since + // inst.Pods carries the same set after the descendant-narrowing fix. + // Only when there are no children do we count inst.Pods directly. 
+ if len(child.Children) == 0 { + if inst.Scale != nil && inst.Scale.Replicas != nil { + child.DesiredReplicas = *inst.Scale.Replicas + } + for _, p := range inst.Pods { + child.CurrentReplicas++ + pv := podView(p) + child.GPUs += pv.GPUs + if pv.Ready { + child.ReadyCount++ + } + if pv.Node != "" { + childNodes[pv.Node] = struct{}{} + parentNodes[pv.Node] = struct{}{} + } + child.Pods = append(child.Pods, pv) + } + } else { + for _, gc := range child.Children { + child.GPUs += gc.GPUs + child.ReadyCount += gc.ReadyCount + child.CurrentReplicas += gc.CurrentReplicas + child.DesiredReplicas += gc.DesiredReplicas + for _, n := range gc.Nodes { + if _, seen := childNodes[n]; !seen { + childNodes[n] = struct{}{} + parentNodes[n] = struct{}{} + } + } + } + } + + if child.DesiredReplicas == 0 { + child.DesiredReplicas = child.CurrentReplicas + } + for n := range childNodes { + child.Nodes = append(child.Nodes, n) + } + sort.Strings(child.Nodes) + sort.SliceStable(child.Pods, func(i, j int) bool { return child.Pods[i].Name < child.Pods[j].Name }) + + parent.CurrentReplicas += child.CurrentReplicas + parent.DesiredReplicas += child.DesiredReplicas + parent.ReadyCount += child.ReadyCount + parent.GPUs += child.GPUs + parent.Children = append(parent.Children, child) + } + for n := range parentNodes { + parent.Nodes = append(parent.Nodes, n) + } + sort.Strings(parent.Nodes) + return parent +} + +func podView(p *corev1.Pod) PodView { + v := PodView{ + Name: p.Name, + Phase: string(p.Status.Phase), + Node: p.Spec.NodeName, + Ready: isReady(p), + GPUs: gpuSum(p), + } + return v +} + +func isReady(p *corev1.Pod) bool { + for _, c := range p.Status.Conditions { + if c.Type == corev1.PodReady { + return c.Status == corev1.ConditionTrue + } + } + return false +} + +func gpuSum(p *corev1.Pod) int64 { + var total int64 + for _, c := range p.Spec.Containers { + if q, ok := c.Resources.Limits["nvidia.com/gpu"]; ok { + total += q.Value() + continue + } + if q, ok := 
// perReplicaDesired splits a workload-global desired count evenly across
// replicas. The division happens only when replicas is positive, global is
// non-zero, and global is an exact multiple of replicas; in every other case
// global is returned as-is rather than rounded.
func perReplicaDesired(global, replicas int32) int32 {
	// Short-circuit order matters: the modulo is only evaluated once
	// replicas is known to be positive.
	evenSplit := replicas > 0 && global != 0 && global%replicas == 0
	if !evenSplit {
		return global
	}
	return global / replicas
}
+func SummarizeComponents(view WorkloadView) []ComponentSummary { + type idx struct { + pos int + } + order := []string{} + totals := map[string]int32{} + var walk func(c ComponentView) + walk = func(c ComponentView) { + if len(c.Children) > 0 { + for _, ch := range c.Children { + walk(ch) + } + return + } + if _, seen := totals[c.Name]; !seen { + order = append(order, c.Name) + } + totals[c.Name] += c.CurrentReplicas + } + for _, c := range view.Components { + walk(c) + } + out := make([]ComponentSummary, 0, len(order)) + for _, name := range order { + out = append(out, ComponentSummary{Name: name, CurrentReplicas: totals[name]}) + } + return out +} + +// TotalGPUs returns the sum of GPUs across the leaf components of a +// WorkloadView. Counting at leaves avoids double-counting when a parent +// aggregator already rolls its children up. +func TotalGPUs(view WorkloadView) int64 { + var total int64 + for _, c := range view.Components { + total += leafGPUSum(c) + } + return total +} + +func leafGPUSum(c ComponentView) int64 { + if len(c.Children) > 0 { + var sum int64 + for _, ch := range c.Children { + sum += leafGPUSum(ch) + } + return sum + } + return c.GPUs +} diff --git a/cmd/karta/main.go b/cmd/karta/main.go new file mode 100644 index 0000000..d8b1000 --- /dev/null +++ b/cmd/karta/main.go @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (c) 2026 NVIDIA Corporation + +package main + +import ( + "os" + + "github.com/run-ai/karta/cmd/karta/cmd" +) + +func main() { + if err := cmd.NewRootCmd().Execute(); err != nil { + os.Exit(1) + } +} diff --git a/go.mod b/go.mod index 62a347d..bb45d09 100644 --- a/go.mod +++ b/go.mod @@ -11,6 +11,7 @@ require ( k8s.io/api v0.35.1 k8s.io/apimachinery v0.35.1 k8s.io/utils v0.0.0-20260210185600-b8788abfbbc2 + sigs.k8s.io/yaml v1.6.0 ) require ( @@ -34,12 +35,10 @@ require ( golang.org/x/sys v0.41.0 // indirect golang.org/x/text v0.34.0 // indirect golang.org/x/tools v0.42.0 // indirect - 
google.golang.org/protobuf v1.36.11 // indirect gopkg.in/inf.v0 v0.9.1 // indirect - k8s.io/klog/v2 v2.130.1 // indirect - k8s.io/kube-openapi v0.0.0-20260127142750-a19766b6e2d4 // indirect + k8s.io/klog/v2 v2.140.0 // indirect + k8s.io/kube-openapi v0.0.0-20260317180543-43fb72c5454a // indirect sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect sigs.k8s.io/randfill v1.0.0 // indirect sigs.k8s.io/structured-merge-diff/v6 v6.3.2 // indirect - sigs.k8s.io/yaml v1.6.0 // indirect ) diff --git a/go.sum b/go.sum index bf3234e..5326e27 100644 --- a/go.sum +++ b/go.sum @@ -88,8 +88,8 @@ golang.org/x/text v0.34.0 h1:oL/Qq0Kdaqxa1KbNeMKwQq0reLCCaFtqu2eNuSeNHbk= golang.org/x/text v0.34.0/go.mod h1:homfLqTYRFyVYemLBFl5GgL/DWEiH5wcsQ5gSh1yziA= golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k= golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0= -google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= -google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= +google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= +google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= @@ -101,10 +101,10 @@ k8s.io/api v0.35.1 h1:0PO/1FhlK/EQNVK5+txc4FuhQibV25VLSdLMmGpDE/Q= k8s.io/api v0.35.1/go.mod h1:28uR9xlXWml9eT0uaGo6y71xK86JBELShLy4wR1XtxM= k8s.io/apimachinery v0.35.1 h1:yxO6gV555P1YV0SANtnTjXYfiivaTPvCTKX6w6qdDsU= k8s.io/apimachinery v0.35.1/go.mod h1:jQCgFZFR1F4Ik7hvr2g84RTJSZegBc8yHgFWKn//hns= -k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= -k8s.io/klog/v2 
v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= -k8s.io/kube-openapi v0.0.0-20260127142750-a19766b6e2d4 h1:HhDfevmPS+OalTjQRKbTHppRIz01AWi8s45TMXStgYY= -k8s.io/kube-openapi v0.0.0-20260127142750-a19766b6e2d4/go.mod h1:kdmbQkyfwUagLfXIad1y2TdrjPFWp2Q89B3qkRwf/pQ= +k8s.io/klog/v2 v2.140.0 h1:Tf+J3AH7xnUzZyVVXhTgGhEKnFqye14aadWv7bzXdzc= +k8s.io/klog/v2 v2.140.0/go.mod h1:o+/RWfJ6PwpnFn7OyAG3QnO47BFsymfEfrz6XyYSSp0= +k8s.io/kube-openapi v0.0.0-20260317180543-43fb72c5454a h1:xCeOEAOoGYl2jnJoHkC3hkbPJgdATINPMAxaynU2Ovg= +k8s.io/kube-openapi v0.0.0-20260317180543-43fb72c5454a/go.mod h1:uGBT7iTA6c6MvqUvSXIaYZo9ukscABYi2btjhvgKGZ0= k8s.io/utils v0.0.0-20260210185600-b8788abfbbc2 h1:AZYQSJemyQB5eRxqcPky+/7EdBj0xi3g0ZcxxJ7vbWU= k8s.io/utils v0.0.0-20260210185600-b8788abfbbc2/go.mod h1:xDxuJ0whA3d0I4mf/C4ppKHxXynQ+fxnkmQH0vTHnuk= sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= diff --git a/pkg/tree/builder.go b/pkg/tree/builder.go new file mode 100644 index 0000000..f09d97e --- /dev/null +++ b/pkg/tree/builder.go @@ -0,0 +1,395 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (c) 2026 NVIDIA Corporation + +package tree + +import ( + "context" + "fmt" + "sort" + + corev1 "k8s.io/api/core/v1" + + "github.com/run-ai/karta/pkg/api/runai/v1alpha1" + "github.com/run-ai/karta/pkg/jq/execution" + "github.com/run-ai/karta/pkg/resource" +) + +// Build constructs a WorkloadTree by walking the Karta definition's +// component hierarchy against a workload object and a list of candidate pods. +// +// The walk is top-down: the root component sets the workload status; each +// direct child of the root becomes a top-level ComponentNode in the tree. +// At each component the matcher is asked which pods belong to it, and the +// claimed pods are attached to the corresponding instance. +// +// Scope note (PoC): single-instance components are fully supported. 
Multi- +// instance (ComponentInstanceSelector) and ReplicaSelector components are +// split into one InstanceNode per extracted instance or per replica key; +// splitting replicas within an instance is a follow-up. +func Build(ctx context.Context, karta *v1alpha1.Karta, workload resource.KubernetesObject, pods []corev1.Pod, matcher PodMatcher) (*WorkloadTree, error) { + if karta == nil { + return nil, fmt.Errorf("karta definition must not be nil") + } + if workload == nil { + return nil, fmt.Errorf("workload object must not be nil") + } + if matcher == nil { + matcher = JQMatcher{} + } + + runner := execution.NewDefaultRunner(workload) + accessor := resource.NewAccessor(runner) + factory := resource.NewComponentFactory(karta, accessor) + + root, err := factory.GetRootComponent() + if err != nil { + return nil, fmt.Errorf("get root component: %w", err) + } + + tree := &WorkloadTree{} + + if status, err := root.GetStatus(ctx); err == nil && status != nil { + tree.Status = workloadStatusFromResource(status) + } + + rootDef := root.Definition() + rootChildren, err := childDefinitionsOf(karta, rootDef.Name) + if err != nil { + return nil, err + } + + for _, childDef := range rootChildren { + comp, err := factory.GetComponent(childDef.Name) + if err != nil { + return nil, fmt.Errorf("get component %q: %w", childDef.Name, err) + } + node, err := buildComponentNode(ctx, factory, karta, comp, childDef, pods, matcher) + if err != nil { + return nil, err + } + tree.Children = append(tree.Children, node) + } + + return tree, nil +} + +// buildComponentNode produces a ComponentNode for a single component +// definition, attaching its extracted instances, claiming pods the matcher +// associates with this component, and recursing into any child components +// the Karta definition declares under this component. +// +// Three instance-shaping paths exist, evaluated in order: +// +// 1. 
ReplicaSelector — split pods into one InstanceNode per replica index +// and recurse children per-replica so descendants stay replica-scoped +// (LWS `group` does this). +// 2. ComponentInstanceSelector — one InstanceNode per extracted instance, +// routing pods by the selector's idPath (Dynamo `service` does this). +// 3. Single instance — attach every matched pod and the (shared) recursed +// children to one InstanceNode. +func buildComponentNode(ctx context.Context, factory *resource.ComponentFactory, karta *v1alpha1.Karta, comp *resource.Component, def v1alpha1.ComponentDefinition, pods []corev1.Pod, matcher PodMatcher) (ComponentNode, error) { + node := ComponentNode{ + Name: def.Name, + Kind: def.Kind, + } + + matched := make([]*corev1.Pod, 0, len(pods)) + for i := range pods { + ok, err := matcher.Matches(ctx, &pods[i], &def) + if err != nil { + return ComponentNode{}, fmt.Errorf("match pod %q against component %q: %w", pods[i].Name, def.Name, err) + } + if ok { + matched = append(matched, &pods[i]) + } + } + + childDefs, err := childDefinitionsOf(karta, def.Name) + if err != nil { + return ComponentNode{}, err + } + + instances, err := extractedInstancesOrEmpty(ctx, comp) + if err != nil { + return ComponentNode{}, err + } + + // Path 1: ReplicaSelector splits the component into per-replica subtrees. + // Each replica's children are rebuilt against only that replica's pods, + // keeping leaf attribution scoped (LWS group-0's leader doesn't see + // group-1's pods). When componentInstanceSelector is also present (the + // Dynamo case where replicaSelector is wired through grove for future + // use), we prefer the instance split — replica-within-instance is a + // follow-up. 
+ if repSel := replicaSelector(def); repSel != nil && componentInstanceSelector(def) == nil { + node.Instances, err = buildReplicaScoped(ctx, factory, karta, def, matched, childDefs, instances, repSel, matcher) + if err != nil { + return ComponentNode{}, err + } + return node, nil + } + + children, err := buildChildren(ctx, factory, karta, childDefs, matched, matcher) + if err != nil { + return ComponentNode{}, err + } + + // For non-leaf components with no componentTypeSelector, the matcher's + // permissive fallback would over-claim pods that belong to unrelated + // workloads sharing the same namespace. Re-narrow to the union of pods + // any descendant claimed: a parent only owns what its children own. + if len(children) > 0 && !hasComponentTypeSelector(def) { + matched = collectDescendantPods(children) + } + + if len(instances) == 0 { + node.Instances = []InstanceNode{{Pods: matched, Children: children}} + return node, nil + } + + // Path 2: ComponentInstanceSelector — one InstanceNode per extracted + // instance, routed by the selector's idPath. + if instSel := componentInstanceSelector(def); instSel != nil && len(instances) > 1 { + node.Instances, err = buildMultiInstance(ctx, instances, matched, instSel) + if err != nil { + return ComponentNode{}, err + } + return node, nil + } + + // Path 3: Single instance. + first := pickFirstInstance(instances) + scaleCopy := first.Scale + instCopy := first + node.Instances = []InstanceNode{{ + Scale: scaleCopy, + ExtractedInstance: &instCopy, + Pods: matched, + Children: children, + }} + + return node, nil +} + +// replicaSelector returns the selector responsible for splitting pods across +// replicas of a component, when the definition declares one. 
+func replicaSelector(def v1alpha1.ComponentDefinition) *v1alpha1.ReplicaSelector { + if def.PodSelector == nil { + return nil + } + return def.PodSelector.ReplicaSelector +} + +// buildReplicaScoped groups the parent's matched pods by replica key and +// rebuilds the children subtree per replica, so descendant pod-attribution +// stays scoped within each replica. The result is one InstanceNode per +// replica with ReplicaKey set, its own children, and any pods that didn't +// fall into a child. +func buildReplicaScoped(ctx context.Context, factory *resource.ComponentFactory, karta *v1alpha1.Karta, def v1alpha1.ComponentDefinition, matched []*corev1.Pod, childDefs []v1alpha1.ComponentDefinition, instances map[string]resource.ExtractedInstance, repSel *v1alpha1.ReplicaSelector, matcher PodMatcher) ([]InstanceNode, error) { + podsByReplica := make(map[string][]*corev1.Pod) + for _, p := range matched { + key, found, err := resource.NewPodQuerier(p).ExtractReplicaKey(ctx, repSel) + if err != nil || !found { + // Pods missing the replica-key label belong to no replica; skip. + continue + } + podsByReplica[key] = append(podsByReplica[key], p) + } + + keys := make([]string, 0, len(podsByReplica)) + for k := range podsByReplica { + keys = append(keys, k) + } + sort.Strings(keys) + + // Pick a single shared template instance (LWS group has one template + // regardless of replica count). When the Karta definition exposes + // per-replica instances by key, prefer those. 
+ var sharedInst *resource.ExtractedInstance + if len(instances) == 1 { + for _, v := range instances { + vCopy := v + sharedInst = &vCopy + break + } + } + + out := make([]InstanceNode, 0, len(keys)) + for _, k := range keys { + replicaPods := podsByReplica[k] + children, err := buildChildren(ctx, factory, karta, childDefs, replicaPods, matcher) + if err != nil { + return nil, err + } + if len(children) > 0 && !hasComponentTypeSelector(def) { + replicaPods = collectDescendantPods(children) + } + + var inst *resource.ExtractedInstance + var scale *resource.Scale + if perKey, ok := instances[k]; ok { + perKeyCopy := perKey + inst = &perKeyCopy + scale = perKey.Scale + } else if sharedInst != nil { + inst = sharedInst + scale = sharedInst.Scale + } + + keyCopy := k + out = append(out, InstanceNode{ + ReplicaKey: &keyCopy, + Scale: scale, + ExtractedInstance: inst, + Pods: replicaPods, + Children: children, + }) + } + return out, nil +} + +// componentInstanceSelector returns the selector responsible for splitting +// pods across instances of a single component, when the definition declares +// one. +func componentInstanceSelector(def v1alpha1.ComponentDefinition) *v1alpha1.ComponentInstanceSelector { + if def.PodSelector == nil { + return nil + } + return def.PodSelector.ComponentInstanceSelector +} + +// buildMultiInstance produces one InstanceNode per extracted instance, +// routing pods by the result of the ComponentInstanceSelector. Pods whose +// instance ID doesn't match any extracted instance are dropped silently — +// they belong to a deleted or transitioning instance. 
+func buildMultiInstance(ctx context.Context, instances map[string]resource.ExtractedInstance, pods []*corev1.Pod, instSel *v1alpha1.ComponentInstanceSelector) ([]InstanceNode, error) { + podsByID := make(map[string][]*corev1.Pod, len(instances)) + for _, p := range pods { + id, found, err := resource.NewPodQuerier(p).ExtractInstanceId(ctx, instSel) + if err != nil || !found { + // Pods that don't carry the instance-id label belong to no + // extracted instance: skip silently. + continue + } + podsByID[id] = append(podsByID[id], p) + } + + keys := make([]string, 0, len(instances)) + for k := range instances { + keys = append(keys, k) + } + sort.Strings(keys) + + out := make([]InstanceNode, 0, len(keys)) + for _, k := range keys { + inst := instances[k] + instCopy := inst + keyCopy := k + out = append(out, InstanceNode{ + InstanceKey: &keyCopy, + Scale: inst.Scale, + ExtractedInstance: &instCopy, + Pods: podsByID[k], + }) + } + return out, nil +} + +// buildChildren recurses into the child definitions of a component, narrowing +// the candidate pod set so each child only sees pods its parent already +// claimed. 
+func buildChildren(ctx context.Context, factory *resource.ComponentFactory, karta *v1alpha1.Karta, childDefs []v1alpha1.ComponentDefinition, parentPods []*corev1.Pod, matcher PodMatcher) ([]ComponentNode, error) { + if len(childDefs) == 0 { + return nil, nil + } + candidatePods := make([]corev1.Pod, len(parentPods)) + for i, p := range parentPods { + candidatePods[i] = *p + } + out := make([]ComponentNode, 0, len(childDefs)) + for _, def := range childDefs { + comp, err := factory.GetComponent(def.Name) + if err != nil { + return nil, fmt.Errorf("get component %q: %w", def.Name, err) + } + child, err := buildComponentNode(ctx, factory, karta, comp, def, candidatePods, matcher) + if err != nil { + return nil, err + } + out = append(out, child) + } + return out, nil +} + +// childDefinitionsOf returns the direct children of the named component +// from the Karta structure definition, in declaration order. +func childDefinitionsOf(karta *v1alpha1.Karta, parentName string) ([]v1alpha1.ComponentDefinition, error) { + if karta == nil { + return nil, fmt.Errorf("karta definition must not be nil") + } + var children []v1alpha1.ComponentDefinition + for _, c := range karta.Spec.StructureDefinition.ChildComponents { + if c.OwnerRef != nil && *c.OwnerRef == parentName { + children = append(children, c) + } + } + return children, nil +} + +// extractedInstancesOrEmpty returns the extracted instances for a component, +// or a nil map when the component has no pod definition (in which case +// extraction is undefined and not an error). 
+func extractedInstancesOrEmpty(ctx context.Context, comp *resource.Component) (map[string]resource.ExtractedInstance, error) { + if !comp.HasPodDefinition() { + return nil, nil + } + return comp.GetExtractedInstances(ctx) +} + +func pickFirstInstance(instances map[string]resource.ExtractedInstance) resource.ExtractedInstance { + for _, v := range instances { + return v + } + return resource.ExtractedInstance{} +} + +// hasComponentTypeSelector reports whether the definition has an explicit +// pod-level discriminator. Components without one are treated as logical +// groupings whose pod set is the union of their descendants'. +func hasComponentTypeSelector(def v1alpha1.ComponentDefinition) bool { + return def.PodSelector != nil && def.PodSelector.ComponentTypeSelector != nil +} + +// collectDescendantPods returns every pod claimed anywhere in the subtree, +// deduplicated by namespace/name: buildChildren copies pods at every level, +// so one pod is reachable through different pointers at different depths. +func collectDescendantPods(nodes []ComponentNode) []*corev1.Pod { + seen := make(map[string]struct{}) + var out []*corev1.Pod + var walk func([]ComponentNode) + walk = func(ns []ComponentNode) { + for _, n := range ns { + for _, inst := range n.Instances { + for _, p := range inst.Pods { + key := p.Namespace + "/" + p.Name + if _, ok := seen[key]; !ok { + seen[key] = struct{}{} + out = append(out, p) + } + } + walk(inst.Children) + } + } + } + walk(nodes) + return out +} + +func workloadStatusFromResource(s *resource.Status) WorkloadStatus { + out := WorkloadStatus{} + for _, m := range s.MatchedStatuses { + out.Phases = append(out.Phases, string(m)) + } + return out +} diff --git a/pkg/tree/builder_test.go b/pkg/tree/builder_test.go new file mode 100644 index 0000000..02117f8 --- /dev/null +++ b/pkg/tree/builder_test.go @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (c) 2026 NVIDIA Corporation + +package tree_test + +import ( + "context" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/run-ai/karta/pkg/api/runai/v1alpha1" + "github.com/run-ai/karta/pkg/tree" + "github.com/run-ai/karta/test/types" +) + +func newPod(name, role string) corev1.Pod { + return corev1.Pod{ + TypeMeta: metav1.TypeMeta{Kind: "Pod", APIVersion: "v1"}, + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: "default", + Labels: map[string]string{"role": role}, + }, + Status: corev1.PodStatus{Phase: corev1.PodRunning}, + } +} + +var _ = Describe("Build", func() { + var ( + ctx context.Context + karta *v1alpha1.Karta + matcher tree.PodMatcher + ) + + BeforeEach(func() { + ctx = context.Background() + karta = types.PyFlowKarta() + matcher = tree.JQMatcher{} + }) + + It("rejects a nil karta definition", func() { + _, err := tree.Build(ctx, nil, types.NewPyFlowObject(), nil, matcher) + Expect(err).To(HaveOccurred()) + }) + + It("rejects a nil workload", func() { + _, err := tree.Build(ctx, karta, nil, nil, matcher) + Expect(err).To(HaveOccurred()) + }) + + It("returns root-level child components for a single-instance workload", func() { + workload := types.NewPyFlowObject() + pods := []corev1.Pod{ + newPod("master-0", "master"), + newPod("worker-0", "worker"), + newPod("worker-1", "worker"), + } + + got, err := tree.Build(ctx, karta, workload, pods, matcher) + Expect(err).NotTo(HaveOccurred()) + Expect(got).NotTo(BeNil()) + Expect(got.Children).To(HaveLen(2)) + + byName := map[string]tree.ComponentNode{} + for _, c := range got.Children { + byName[c.Name] = c + } + + Expect(byName).To(HaveKey("master")) + Expect(byName).To(HaveKey("worker")) + + master := byName["master"] + Expect(master.Instances).To(HaveLen(1)) + Expect(master.Instances[0].Pods).To(HaveLen(1)) + Expect(master.Instances[0].Pods[0].Name).To(Equal("master-0")) + + worker := byName["worker"] + Expect(worker.Instances).To(HaveLen(1)) + Expect(worker.Instances[0].Pods).To(HaveLen(2)) + }) + + 
It("attaches no pods when none match the component selectors", func() { + workload := types.NewPyFlowObject() + pods := []corev1.Pod{newPod("orphan-0", "stranger")} + + got, err := tree.Build(ctx, karta, workload, pods, matcher) + Expect(err).NotTo(HaveOccurred()) + for _, comp := range got.Children { + for _, inst := range comp.Instances { + Expect(inst.Pods).To(BeEmpty()) + } + } + }) + + It("preserves the declared component name in tree order", func() { + workload := types.NewPyFlowObject() + got, err := tree.Build(ctx, karta, workload, nil, matcher) + Expect(err).NotTo(HaveOccurred()) + Expect(got.Children).To(HaveLen(2)) + Expect(got.Children[0].Name).To(Equal("master")) + Expect(got.Children[1].Name).To(Equal("worker")) + }) + + It("uses JQMatcher by default when matcher is nil", func() { + workload := types.NewPyFlowObject() + pods := []corev1.Pod{newPod("master-0", "master")} + + got, err := tree.Build(ctx, karta, workload, pods, nil) + Expect(err).NotTo(HaveOccurred()) + Expect(got).NotTo(BeNil()) + }) +}) diff --git a/pkg/tree/matcher.go b/pkg/tree/matcher.go new file mode 100644 index 0000000..b49357d --- /dev/null +++ b/pkg/tree/matcher.go @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (c) 2026 NVIDIA Corporation + +package tree + +import ( + "context" + + corev1 "k8s.io/api/core/v1" + + "github.com/run-ai/karta/pkg/api/runai/v1alpha1" + "github.com/run-ai/karta/pkg/resource" +) + +// PodMatcher decouples tree construction from the rule that decides which +// pods belong to which component. The builder walks the component hierarchy +// top-down and asks the matcher whether a given pod belongs to (or beneath) +// a given component; the matcher's strategy is opaque to the builder. +type PodMatcher interface { + // Matches reports whether pod belongs to component or any of its + // descendants. 
Returning true on a non-leaf component does not yet pin +// the pod to a specific child; the builder narrows the candidate set as +// it descends. + Matches(ctx context.Context, pod *corev1.Pod, component *v1alpha1.ComponentDefinition) (bool, error) +} + +// JQMatcher matches pods to components using the Karta definition's +// ComponentTypeSelector — a JQ path on the pod plus an optional expected +// value. This mirrors how the existing pkg/resource layer interprets pod +// selectors and is the right default for any Karta definition. +// +// Components with no PodSelector, or with neither a ComponentTypeSelector +// nor a ComponentInstanceSelector, are treated as matching every pod, which +// matches the resource layer's behavior for container or grouping components. +type JQMatcher struct{} + +// Matches implements PodMatcher. +// +// Resolution order: +// 1. ComponentTypeSelector — explicit pod-type discriminator wins. +// 2. ComponentInstanceSelector — when present, a pod matches if the +// idPath returns a non-empty value (the multi-instance components, like +// Dynamo's `service`, identify their pods by the label the selector +// points at; absence of the label means the pod doesn't belong here). +// 3. Permissive fallback — return true. Logical groupings without any +// selector get pre-filtered later in the builder via the descendant +// union, so this fallback only over-claims at leaves with no +// discriminator at all (rare, and surfaces as a Karta-def issue). 
+func (JQMatcher) Matches(ctx context.Context, pod *corev1.Pod, component *v1alpha1.ComponentDefinition) (bool, error) { + if component == nil || component.PodSelector == nil { + return true, nil + } + sel := component.PodSelector + if sel.ComponentTypeSelector != nil { + return resource.NewPodQuerier(pod).MatchesComponentType(ctx, sel.ComponentTypeSelector) + } + if sel.ComponentInstanceSelector != nil { + _, found, err := resource.NewPodQuerier(pod).ExtractInstanceId(ctx, sel.ComponentInstanceSelector) + if err != nil { + // Empty-result errors here mean "this pod doesn't carry the + // instance-id label" — that's a non-match, not a failure. + return false, nil + } + return found, nil + } + return true, nil +} diff --git a/pkg/tree/suite_test.go b/pkg/tree/suite_test.go new file mode 100644 index 0000000..67a484e --- /dev/null +++ b/pkg/tree/suite_test.go @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (c) 2026 NVIDIA Corporation + +package tree_test + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestTree(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Tree Suite") +} diff --git a/pkg/tree/types.go b/pkg/tree/types.go new file mode 100644 index 0000000..3bdf931 --- /dev/null +++ b/pkg/tree/types.go @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (c) 2026 NVIDIA Corporation + +// Package tree builds and represents a workload as a hierarchical tree of +// components, instances, and pods. The tree is the shared data model the +// CLI, web, and MCP layers all consume so each consumer can render its own +// view without re-deriving structure from raw Karta extraction calls. +package tree + +import ( + corev1 "k8s.io/api/core/v1" + + "github.com/run-ai/karta/pkg/api/runai/v1alpha1" + "github.com/run-ai/karta/pkg/resource" +) + +// WorkloadTree is the raw workload tree produced by Build. 
It carries the +// structure, status, and matched pods extracted from a workload object via +// its Karta definition. Display-time concerns (ready counts, GPU sums, +// rendered tables) live above this layer in the consumer. +type WorkloadTree struct { + // Status is the normalized workload-level status, taken from the root + // component's Karta StatusDefinition. + Status WorkloadStatus + + // Children are the root-level components of the workload. + Children []ComponentNode +} + +// WorkloadStatus captures a workload's normalized phase set. A workload may +// match multiple phases simultaneously (for example "Running" + "Degraded"), +// which is why this is a slice rather than a single string. +type WorkloadStatus struct { + Phases []string +} + +// ComponentNode is one component in the workload tree. A component may have +// several instances; components that are neither multi-instance nor split +// by replica have exactly one InstanceNode whose InstanceKey is nil. +type ComponentNode struct { + // Name is the component's logical name as declared in the Karta definition. + Name string + + // Kind is the GroupVersionKind of the underlying Kubernetes object backing + // this component, when one exists. Logical-grouping components (no backing + // object) leave this nil. + Kind *v1alpha1.GroupVersionKind + + // Instances has at least one entry unless a ReplicaSelector split finds no pods. + Instances []InstanceNode +} + +// InstanceNode is one instance of a component. Multi-instance components +// (for example Dynamo's service component split into Frontend / PrefillWorker +// / DecodeWorker) carry one InstanceNode per instance, each with its own +// InstanceKey, scale, extracted spec, and pods (no Children are set yet). +type InstanceNode struct { + // InstanceKey identifies this instance among siblings under the same + // component. Nil means the component is not multi-instance. 
+ InstanceKey *string + + // ReplicaKey identifies this instance among ordered replicas (when the + // component supports replication beyond a simple replica count). Nil + // otherwise. + ReplicaKey *string + + // Scale carries the replicas / min / max extracted by the Karta definition. + Scale *resource.Scale + + // ExtractedInstance carries the pod spec, metadata, and scale that the + // Karta library extracted for this instance. + ExtractedInstance *resource.ExtractedInstance + + // Pods are the live pods claimed for this instance by the PodMatcher. + Pods []*corev1.Pod + + // Children are component nodes nested under this instance, when the + // Karta definition declares a deeper hierarchy. Empty for leaf components. + Children []ComponentNode +}