From 3290700485086f94204afacf0b865188cbf9f42a Mon Sep 17 00:00:00 2001 From: Lars George Date: Fri, 19 Jun 2026 17:54:42 +0200 Subject: [PATCH] feat(search): index business terms via SemanticModelsManager Wire SemanticModelsManager into the global search index as a SearchableAsset so ontology concepts / glossary terms appear in main search results. - Decorate manager with @searchable_asset and inherit SearchableAsset. - Add get_search_index_items() emitting type=glossary-term entries with link, tags, and synonyms in extra_data so the existing search config (search_config.yaml: glossary-term with synonyms boost) is honored. - Link uses the direct /concepts/browser/{url-encoded IRI} path (route 'browser/:iri') to land straight on ConceptDetailView, matching concept-detail.tsx convention and avoiding the deprecated ?concept= redirect hop documented in business-terms.tsx. Incremental updates (no full rebuild needed on each edit): - _make_glossary_search_item: single factory shared by the bulk indexer and the per-mutation upsert path so live edits stay shape-consistent with the startup snapshot. - _upsert_concept_in_search / _remove_concept_from_search: live-update helpers wired into create_concept, update_concept, update_concept_status, and delete_concept. - delete_collection captures concept IRIs before triple wipe, then evicts each from the search index. - import_rdf_to_collection upserts every typed (rdf:type), non-blank-node subject from the parsed graph. - _reindex_concepts_in_search hooks into _build_persistent_caches_atomic to cover all whole-graph rebuild paths (taxonomy enable/disable, semantic model file CRUD) without per-route plumbing. Rationale on memory / SPARQL alternative: a SearchIndexItem is small relative to the OntologyConcept and rdflib.ConjunctiveGraph already in memory, so per-item duplication is negligible at typical scales. A delegated per-query SPARQL search would only pay off at 50k+ concepts and would be a generic SearchableAsset redesign, not a glossary-specific shortcut. 
Verified end-to-end via Playwright: - 502 items in index after startup (was 2; +500 glossary terms). - POST/PATCH/DELETE of a concept reflects live in /api/search without triggering a SearchManager.build_index(). - Search-result click navigates straight to ConceptDetailView for both urn: and http://...#fragment IRIs (no ?concept= intermediate). --- .../src/controller/semantic_models_manager.py | 208 ++++++++++++++++-- 1 file changed, 190 insertions(+), 18 deletions(-) diff --git a/src/backend/src/controller/semantic_models_manager.py b/src/backend/src/controller/semantic_models_manager.py index 2b73b41c..5796163a 100644 --- a/src/backend/src/controller/semantic_models_manager.py +++ b/src/backend/src/controller/semantic_models_manager.py @@ -1192,6 +1192,11 @@ def _build_persistent_caches_atomic(self) -> None: shutil.rmtree(temp_dir) raise + # Outside the FileLock: refresh the live search index from the freshly + # rebuilt _cached_concepts. No-op when SearchManager isn't wired yet + # (startup path — SearchManager.build_index() runs once afterwards). + self._reindex_concepts_in_search() + def _compute_taxonomies(self) -> List: """Compute taxonomies without caching - used for building persistent cache""" from src.models.ontology import SemanticModel as SemanticModelOntology @@ -2563,6 +2568,47 @@ def get_grouped_concepts(self) -> Dict[str, List[OntologyConcept]]: return grouped + # ------------------------------------------------------------------ + # Global search integration (SearchableAsset) + # ------------------------------------------------------------------ + + @staticmethod + def _make_glossary_search_item( + iri: str, + label: Optional[str], + comment: Optional[str], + concept_type: Optional[str], + synonyms: Optional[List[str]], + source_context_label: Optional[str], + ) -> Optional[SearchIndexItem]: + """Single source of truth for mapping a concept to a SearchIndexItem. 
+ + Shared by the bulk path (``get_search_index_items``) and the per-mutation + path (``_upsert_concept_in_search``) so live edits stay shape-consistent + with the startup snapshot. + """ + if not iri: + return None + title = label or iri.rsplit('#', 1)[-1].rsplit('/', 1)[-1] + tags: List[str] = [] + if source_context_label and source_context_label != "Unassigned": + tags.append(source_context_label) + if concept_type: + tags.append(concept_type) + return SearchIndexItem( + id=f"glossary-term::{iri}", + type="glossary-term", + title=title, + description=comment, + # Direct path to ConceptDetailView (route 'browser/:iri'). Avoids + # the BusinessTermsView ?concept= redirect hop and matches the + # convention used by concept-detail.tsx / business-terms.tsx. + link=f"/concepts/browser/{urllib.parse.quote(iri, safe='')}", + tags=tags, + feature_id="semantic-models", + extra_data={"synonyms": " ".join(synonyms or [])}, + ) + def get_search_index_items(self) -> List[SearchIndexItem]: """Build SearchIndexItem entries for every indexed ontology concept. 
@@ -2576,29 +2622,127 @@ def get_search_index_items(self) -> List[SearchIndexItem]: grouped = self.get_grouped_concepts() for source_context, concepts in grouped.items(): for c in concepts: - if not c.iri: - continue - title = c.label or c.iri.rsplit('#', 1)[-1].rsplit('/', 1)[-1] - tags: List[str] = [] - if source_context and source_context != "Unassigned": - tags.append(source_context) - if c.concept_type: - tags.append(c.concept_type) - items.append(SearchIndexItem( - id=f"glossary-term::{c.iri}", - type="glossary-term", - title=title, - description=c.comment, - link=f"/concepts/browser?concept={urllib.parse.quote(c.iri, safe='')}", - tags=tags, - feature_id="semantic-models", - extra_data={"synonyms": " ".join(c.synonyms or [])}, - )) + item = self._make_glossary_search_item( + iri=c.iri, + label=c.label, + comment=c.comment, + concept_type=c.concept_type, + synonyms=c.synonyms, + source_context_label=source_context, + ) + if item: + items.append(item) logger.info(f"Prepared {len(items)} glossary terms for search index.") except Exception as e: logger.error(f"Failed to build glossary-term search index: {e}", exc_info=True) return items + def _upsert_concept_in_search(self, concept_iri: str) -> None: + """Push a single concept into the live search index. + + Safe no-op until ``SearchManager`` is wired (e.g., during startup). + Reads the concept via ``get_concept`` so it works even when + ``_cached_concepts`` is None (post-invalidation). 
+ """ + if self._search_manager is None or not concept_iri: + return + try: + concept = self.get_concept(concept_iri) + if not concept: + return + raw_source = concept.get("source_context") + friendly_source = ( + self._extract_source_context(raw_source) if raw_source else None + ) or "Unassigned" + item = self._make_glossary_search_item( + iri=concept["iri"], + label=concept.get("label"), + comment=concept.get("comment"), + concept_type=concept.get("concept_type"), + synonyms=concept.get("synonyms"), + source_context_label=friendly_source, + ) + if item: + self._notify_index_upsert(item) + except Exception as e: + logger.warning( + f"Failed to upsert concept {concept_iri} in search index: {e}" + ) + + def _remove_concept_from_search(self, concept_iri: str) -> None: + """Remove a single concept from the live search index. Safe no-op.""" + if self._search_manager is None or not concept_iri: + return + try: + self._notify_index_remove(f"glossary-term::{concept_iri}") + except Exception as e: + logger.warning( + f"Failed to remove concept {concept_iri} from search index: {e}" + ) + + def _reindex_concepts_in_search(self) -> None: + """Purge all glossary-term entries and re-add from ``_cached_concepts``. + + Used after whole-graph rebuilds (taxonomy enable/disable, semantic model + upload/delete, etc.) to keep the search index consistent without forcing + a full ``SearchManager.build_index()``. No-op when the SearchManager is + not yet wired (startup) or when ``_cached_concepts`` is unavailable. 
+ """ + if self._search_manager is None: + return + if self._cached_concepts is None: + return + try: + # Purge stale glossary-term entries (handles deletions / disables) + for existing in list(self._search_manager.index): + if getattr(existing, "type", None) == "glossary-term": + self._notify_index_remove(existing.id) + count = 0 + for c in self._cached_concepts: + source_label = c.source_context or "Unassigned" + item = self._make_glossary_search_item( + iri=c.iri, + label=c.label, + comment=c.comment, + concept_type=c.concept_type, + synonyms=c.synonyms, + source_context_label=source_label, + ) + if item: + self._notify_index_upsert(item) + count += 1 + logger.info( + f"Reindexed {count} glossary terms in search after bulk rebuild" + ) + except Exception as e: + logger.error( + f"Failed to reindex glossary terms in search: {e}", exc_info=True + ) + + def _collect_concept_iris_in_context(self, context_iri: str) -> List[str]: + """Return all concept-like subject IRIs in a given named-graph context. + + Used by ``delete_collection`` to capture what to evict from the search + index before the underlying triples are wiped. + """ + iris: List[str] = [] + try: + ctx = self._graph.get_context(URIRef(context_iri)) + seen: set = set() + for subj in ctx.subjects(RDF.type, None): + s = str(subj) + if isinstance(subj, BNode): + continue + if s in seen: + continue + seen.add(s) + iris.append(s) + except Exception as e: + logger.debug( + f"Failed to enumerate concept IRIs in context {context_iri}: {e}" + ) + return iris + def get_properties_grouped(self) -> Dict[str, List[Dict[str, Any]]]: """Return all RDF/OWL properties grouped by their source context name. @@ -3054,6 +3198,11 @@ def delete_collection(self, collection_iri: str, deleted_by: Optional[str] = Non if existing.get("source_type") == "imported": raise ValueError("Cannot delete imported collections. 
Disable editing instead.") + # Snapshot concept IRIs in this collection BEFORE we wipe the context, + # so we can evict each from the live search index. Must happen before + # the in-memory graph removal a few lines below. + concept_iris_to_evict = self._collect_concept_iris_in_context(collection_iri) + # Remove collection metadata from meta context rdf_triples_repo.remove_by_subject(self._db, collection_iri, META_CONTEXT) @@ -3076,6 +3225,10 @@ def delete_collection(self, collection_iri: str, deleted_by: Optional[str] = Non self._db.commit() self._invalidate_cache() + # Evict every concept that lived under this collection from search. + for iri in concept_iris_to_evict: + self._remove_concept_from_search(iri) + return True # ======================================================================== @@ -3340,6 +3493,7 @@ def create_concept( logger.warning(f"Failed to add owner {owner_user} to concept: {e}") self._invalidate_cache() + self._upsert_concept_in_search(concept_iri) return self.get_concept(concept_iri) @@ -3723,6 +3877,7 @@ def update_concept( self._db.commit() self._invalidate_cache() + self._upsert_concept_in_search(concept_iri) return self.get_concept(concept_iri) @@ -3766,6 +3921,7 @@ def delete_concept(self, concept_iri: str, deleted_by: Optional[str] = None) -> self._db.commit() self._invalidate_cache() + self._remove_concept_from_search(concept_iri) return True @@ -4019,6 +4175,7 @@ def update_concept_status( self._db.commit() self._invalidate_cache() + self._upsert_concept_in_search(concept_iri) return self.get_concept(concept_iri) @@ -4147,6 +4304,21 @@ def import_rdf_to_collection( self._invalidate_cache() + # Upsert each imported concept into the live search index. We pull + # subjects from the just-parsed temp_graph (rather than re-querying + # the whole context) so a partial import doesn't reindex unrelated + # pre-existing concepts. 
+ if self._search_manager is not None: + seen: set = set() + for subj in temp_graph.subjects(RDF.type, None): + if isinstance(subj, BNode): + continue + s = str(subj) + if s in seen: + continue + seen.add(s) + self._upsert_concept_in_search(s) + return count # ========================================================================