Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,13 @@ REDIS_NO_SSL=false
FLASK_CONFIG=development
INSECURE_REQUESTS=false

# Feature Flags
# Enable the deploy/uptime health probe at GET /rest/v1/health.
# Set to one of 1, true, yes (case-insensitive) to enable; any other value
# (including unset or false) leaves it off and the endpoint returns 404.

CRE_ENABLE_HEALTH=false

# Embeddings

NO_GEN_EMBEDDINGS=false
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,7 @@ Then edit `.env` and provide values appropriate for your environment.
* Google Auth: `GOOGLE_CLIENT_ID`, `GOOGLE_CLIENT_SECRET`, `GOOGLE_SECRET_JSON`, `LOGIN_ALLOWED_DOMAINS`
* GCP: `GCP_NATIVE`
* Spreadsheet Auth: `OpenCRE_gspread_Auth`
* Feature flags: `CRE_ENABLE_HEALTH` (enables the `GET /rest/v1/health` deploy/uptime probe; off by default — the endpoint returns 404 unless the value is `1`, `true`, or `yes`, case-insensitive)

See `.env.example` for full list and defaults.

Expand Down
48 changes: 48 additions & 0 deletions application/database/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -2264,6 +2264,54 @@ def get_root_cres(self):
)
return self._hydrate_cres_batch(list(cres))

def health_check(self) -> Dict[str, Any]:
    """Lightweight liveness/readiness probe for the serving database.

    Intended for use by a deploy/uptime health endpoint, NOT for deep
    operational checks (GA completeness, mapping coverage, etc.) which are
    slow and belong in ops tooling. Performs two cheap COUNT queries and
    never raises: failures are logged and reported as ``ok=False`` so the
    caller can return an appropriate status code.

    Returns:
        A dict with:
        - ``ok``: True only if the DB is reachable AND holds a non-empty
          dataset (at least one CRE row and one Node row).
        - ``db_reachable``: True if the COUNT queries executed.
        - ``cre_count`` / ``standards_count``: present only when reachable.
        - ``reason``: short human-readable explanation, present only when
          ``ok`` is False.
    """
    # Function-scope import so this method does not depend on a
    # module-level logger being defined elsewhere in the file.
    import logging

    log = logging.getLogger(__name__)

    try:
        cre_count = self.session.query(func.count(CRE.id)).scalar() or 0
        standards_count = self.session.query(func.count(Node.id)).scalar() or 0
    except OperationalError:
        # Expected failure mode (DB down / connection refused): keep the
        # traceback in the logs so a 503 from the probe can be diagnosed.
        log.warning("health_check: database unreachable", exc_info=True)
        return {
            "ok": False,
            "db_reachable": False,
            "reason": "database unreachable",
        }
    except Exception:  # pragma: no cover - defensive, never fail open
        # Anything else is unexpected; still report unhealthy rather than
        # raising, but do not swallow the error silently.
        log.exception("health_check: database health query failed")
        return {
            "ok": False,
            "db_reachable": False,
            "reason": "database health query failed",
        }

    counts = {"cre_count": cre_count, "standards_count": standards_count}

    # A reachable but empty database is treated as unhealthy.
    if cre_count == 0 or standards_count == 0:
        return {
            "ok": False,
            "db_reachable": True,
            **counts,
            "reason": "empty dataset",
        }

    return {"ok": True, "db_reachable": True, **counts}

def get_embeddings_by_doc_type(self, doc_type: str) -> Dict[str, List[float]]:
res = {}
embeddings = (
Expand Down
11 changes: 11 additions & 0 deletions application/feature_flags.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,18 @@
import os

try:
from dotenv import load_dotenv # type: ignore

load_dotenv()
except ImportError:
pass

# Environment-variable values (after strip + lowercase) that switch a flag on.
TRUE_VALUES = {"1", "true", "yes"}


def _is_env_flag_enabled(name: str) -> bool:
    """Return True when environment variable *name* holds a truthy value.

    The value is stripped and lower-cased before being compared against
    ``TRUE_VALUES``; an unset variable is treated as the empty string and
    therefore as disabled.
    """
    return os.getenv(name, "").strip().lower() in TRUE_VALUES


def is_cre_import_allowed() -> bool:
    """Return True when ``CRE_ALLOW_IMPORT`` is set to 1/true/yes."""
    return _is_env_flag_enabled("CRE_ALLOW_IMPORT")


def is_health_endpoint_enabled() -> bool:
    """Return True when ``CRE_ENABLE_HEALTH`` is set to 1/true/yes."""
    return _is_env_flag_enabled("CRE_ENABLE_HEALTH")
63 changes: 63 additions & 0 deletions application/tests/web_main_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -1360,3 +1360,66 @@ def test_get_cre_csv(self) -> None:
data.getvalue(),
response.data.decode(),
)

def test_health_disabled_by_default_returns_404(self) -> None:
    """With CRE_ENABLE_HEALTH unset, the health route answers 404."""
    # patch.dict snapshots os.environ and restores it on exit, so removing
    # the flag here cannot permanently clobber a value that was set before
    # the test ran.
    with patch.dict(os.environ):
        os.environ.pop("CRE_ENABLE_HEALTH", None)
        with self.app.test_client() as client:
            response = client.get("/rest/v1/health")
            self.assertEqual(404, response.status_code)

def test_health_enabled_empty_dataset_returns_503(self) -> None:
    """With the flag on and no data added by this test, the probe reports 503."""
    # patch.dict restores the previous environment on exit (including any
    # pre-existing CRE_ENABLE_HEALTH value), unlike a pop() in ``finally``.
    with patch.dict(os.environ, {"CRE_ENABLE_HEALTH": "1"}):
        with self.app.test_client() as client:
            response = client.get("/rest/v1/health")
            self.assertEqual(503, response.status_code)
            body = json.loads(response.data.decode())
            self.assertFalse(body["ok"])
            # The DB answered the COUNT queries; it is just empty.
            self.assertTrue(body["db_reachable"])
            self.assertEqual("empty dataset", body["reason"])

def test_health_enabled_populated_returns_200(self) -> None:
    """With the flag on and one CRE plus one standard stored, expect 200."""
    # patch.dict restores the previous environment on exit (including any
    # pre-existing CRE_ENABLE_HEALTH value), unlike a pop() in ``finally``.
    with patch.dict(os.environ, {"CRE_ENABLE_HEALTH": "1"}):
        # Seed the minimum dataset the probe requires: one CRE, one node.
        collection = db.Node_collection()
        collection.add_cre(
            defs.CRE(id="111-115", description="CA", name="CA", tags=["ta"])
        )
        collection.add_node(
            defs.Standard(
                name="s1", section="s11", subsection="s111", version="1.1.1"
            )
        )
        with self.app.test_client() as client:
            response = client.get("/rest/v1/health")
            self.assertEqual(200, response.status_code)
            body = json.loads(response.data.decode())
            self.assertTrue(body["ok"])
            self.assertTrue(body["db_reachable"])
            self.assertGreaterEqual(body["cre_count"], 1)
            self.assertGreaterEqual(body["standards_count"], 1)

def test_health_db_unreachable_returns_503(self) -> None:
    """When health_check reports an unreachable DB, the route answers 503."""
    unreachable = {
        "ok": False,
        "db_reachable": False,
        "reason": "database unreachable",
    }
    # patch.dict restores the previous environment on exit (including any
    # pre-existing CRE_ENABLE_HEALTH value), unlike a pop() in ``finally``.
    with patch.dict(os.environ, {"CRE_ENABLE_HEALTH": "1"}):
        # Stub the DB probe so the route's status mapping is tested without
        # needing a genuinely broken database connection.
        with patch.object(
            db.Node_collection,
            "health_check",
            return_value=unreachable,
        ):
            with self.app.test_client() as client:
                response = client.get("/rest/v1/health")
                self.assertEqual(503, response.status_code)
                body = json.loads(response.data.decode())
                self.assertFalse(body["ok"])
                self.assertFalse(body["db_reachable"])
23 changes: 22 additions & 1 deletion application/web/web_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from application.cmd import cre_main
from application.defs import cre_defs as defs
from application.defs import cre_exceptions
from application.feature_flags import is_cre_import_allowed
from application.feature_flags import is_cre_import_allowed, is_health_endpoint_enabled

from application.utils import spreadsheet as sheet_utils
from application.utils import mdutils, redirectors, gap_analysis
Expand Down Expand Up @@ -584,6 +584,27 @@ def text_search() -> Any:
abort(404, "No object matches the given search terms")


@app.route("/rest/v1/health", methods=["GET"])
def health() -> Any:
    """Feature-flagged deploy/uptime probe; disabled unless CRE_ENABLE_HEALTH is truthy.

    The check is kept deliberately small and fast so it can gate deploys:
    - 404: the feature flag is off.
    - 200: the database health check reported ``ok``.
    - 503: the database health check reported a problem (unreachable DB or
      empty dataset).
    The JSON body is the dict returned by ``Node_collection.health_check``.
    Deeper checks (gap-analysis completeness, mapping coverage, Neo4j/Redis)
    are intentionally out of scope here.
    """
    if not is_health_endpoint_enabled():
        abort(404)

    report = db.Node_collection().health_check()
    if report.get("ok"):
        return jsonify(report), 200
    return jsonify(report), 503


@app.route("/rest/v1/root_cres", methods=["GET"])
def find_root_cres() -> Any:
"""
Expand Down
Loading