From ade7c1870b9fb3d6ee0e836710d7f38da94b96e8 Mon Sep 17 00:00:00 2001 From: Abhijeet Saharan <abhijeetsaharan2236@gmail.com> Date: Sun, 31 May 2026 07:01:14 +0530 Subject: [PATCH 1/8] feat: implement structured extraction checkpoints B1 and B2 Signed-off-by: Abhijeet Saharan <abhijeetsaharan2236@gmail.com> --- .../external_project_parsers/parsers/cheatsheet_extractor.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py index 384afe932..41b013bad 100644 --- a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py +++ b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py @@ -1,11 +1,9 @@ import os import re - from application.defs.cheatsheet_defs import CheatsheetRecord PARSER_VERSION = "v1" FALLBACK_USED = "false" - CANONICAL_BASE_URL = "https://cheatsheetseries.owasp.org/cheatsheets/" _TITLE_RE = re.compile(r"^#\s+(?P<title>.+)$", re.MULTILINE) @@ -53,7 +51,6 @@ def _extract_summary(markdown: str) -> str: if body: return body - break for match in all_heading_matches: From a9e54a3d41e54c3a97d205ecfdd7c2358be1fdd1 Mon Sep 17 00:00:00 2001 From: Abhijeet Saharan <abhijeetsaharan2236@gmail.com> Date: Fri, 5 Jun 2026 14:00:57 +0530 Subject: [PATCH 2/8] docs: add docstrings Signed-off-by: Abhijeet Saharan <abhijeetsaharan2236@gmail.com> --- application/defs/cheatsheet_defs.py | 2 +- .../parsers/cheatsheet_extractor.py | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/application/defs/cheatsheet_defs.py b/application/defs/cheatsheet_defs.py index 0e1dc4267..162c4a068 100644 --- a/application/defs/cheatsheet_defs.py +++ b/application/defs/cheatsheet_defs.py @@ -78,4 +78,4 @@ def __post_init__(self): raise ValueError( "CheatsheetRecord: metadata keys and values must be strings, " f"got {key!r}: {value!r}" - ) + ) \ No newline at end of file diff --git 
a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py index 41b013bad..78ef319ea 100644 --- a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py +++ b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py @@ -1,9 +1,11 @@ import os import re + from application.defs.cheatsheet_defs import CheatsheetRecord PARSER_VERSION = "v1" FALLBACK_USED = "false" + CANONICAL_BASE_URL = "https://cheatsheetseries.owasp.org/cheatsheets/" _TITLE_RE = re.compile(r"^#\s+(?P<title>.+)$", re.MULTILINE) @@ -51,6 +53,7 @@ def _extract_summary(markdown: str) -> str: if body: return body + break for match in all_heading_matches: @@ -59,7 +62,9 @@ def _extract_summary(markdown: str) -> str: if body: return body - raise ValueError("_extract_summary: no summary could be extracted from markdown.") + raise ValueError( + "_extract_summary: no summary could be extracted from markdown." 
+ ) def extract_cheatsheet_record( @@ -90,4 +95,4 @@ def extract_cheatsheet_record( "parser_version": PARSER_VERSION, "fallback_used": FALLBACK_USED, }, - ) + ) \ No newline at end of file From 26d7e92773574916ae8b734dc8b255e0c4e94a2a Mon Sep 17 00:00:00 2001 From: Abhijeet Saharan <abhijeetsaharan2236@gmail.com> Date: Fri, 5 Jun 2026 14:43:30 +0530 Subject: [PATCH 3/8] fix: validate normalized string field values correctly Signed-off-by: Abhijeet Saharan <abhijeetsaharan2236@gmail.com> --- application/defs/cheatsheet_defs.py | 2 +- .../parsers/cheatsheet_extractor.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/application/defs/cheatsheet_defs.py b/application/defs/cheatsheet_defs.py index 162c4a068..0e1dc4267 100644 --- a/application/defs/cheatsheet_defs.py +++ b/application/defs/cheatsheet_defs.py @@ -78,4 +78,4 @@ def __post_init__(self): raise ValueError( "CheatsheetRecord: metadata keys and values must be strings, " f"got {key!r}: {value!r}" - ) \ No newline at end of file + ) diff --git a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py index 78ef319ea..384afe932 100644 --- a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py +++ b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py @@ -62,9 +62,7 @@ def _extract_summary(markdown: str) -> str: if body: return body - raise ValueError( - "_extract_summary: no summary could be extracted from markdown." 
- ) + raise ValueError("_extract_summary: no summary could be extracted from markdown.") def extract_cheatsheet_record( @@ -95,4 +93,4 @@ def extract_cheatsheet_record( "parser_version": PARSER_VERSION, "fallback_used": FALLBACK_USED, }, - ) \ No newline at end of file + ) From dc9d2d0f39fff76136a7f1285eb04d78f50dec21 Mon Sep 17 00:00:00 2001 From: Abhijeet Saharan <abhijeetsaharan2236@gmail.com> Date: Fri, 5 Jun 2026 14:43:30 +0530 Subject: [PATCH 4/8] fix: validate normalized string field values correctly Signed-off-by: Abhijeet Saharan <abhijeetsaharan2236@gmail.com> --- application/defs/cheatsheet_defs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/application/defs/cheatsheet_defs.py b/application/defs/cheatsheet_defs.py index 0e1dc4267..91a9a3b1e 100644 --- a/application/defs/cheatsheet_defs.py +++ b/application/defs/cheatsheet_defs.py @@ -43,7 +43,7 @@ def __post_init__(self): } # Validate fields which require string values. - for field_name in required_str_fields: + for field_name in required_str_fields: value = getattr(self, field_name) if not isinstance(value, str) or not value: From a76be6138161d2ba7c8ebc46b3a489372e3e0992 Mon Sep 17 00:00:00 2001 From: Abhijeet Saharan <abhijeetsaharan2236@gmail.com> Date: Mon, 8 Jun 2026 13:31:35 +0530 Subject: [PATCH 5/8] feat: implement structured extraction checkpoint B3 Signed-off-by: Abhijeet Saharan <abhijeetsaharan2236@gmail.com> --- application/defs/cheatsheet_defs.py | 2 +- .../parsers/cheatsheet_extractor.py | 62 ++++++++++++++----- 2 files changed, 46 insertions(+), 18 deletions(-) diff --git a/application/defs/cheatsheet_defs.py b/application/defs/cheatsheet_defs.py index 91a9a3b1e..0e1dc4267 100644 --- a/application/defs/cheatsheet_defs.py +++ b/application/defs/cheatsheet_defs.py @@ -43,7 +43,7 @@ def __post_init__(self): } # Validate fields which require string values. 
- for field_name in required_str_fields: + for field_name in required_str_fields: value = getattr(self, field_name) if not isinstance(value, str) or not value: diff --git a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py index 384afe932..6b360fecc 100644 --- a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py +++ b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py @@ -1,10 +1,10 @@ +import logging import os import re from application.defs.cheatsheet_defs import CheatsheetRecord PARSER_VERSION = "v1" -FALLBACK_USED = "false" CANONICAL_BASE_URL = "https://cheatsheetseries.owasp.org/cheatsheets/" @@ -41,28 +41,44 @@ def _extract_body_after_heading(markdown: str, heading_match: re.Match) -> str: def _extract_summary(markdown: str) -> str: - """Extract a summary section from cheatsheet markdown.""" + """Extract summary from Introduction section in cheatsheet markdown.""" - all_heading_matches = list(_ANY_HEADING_RE.finditer(markdown)) - - for match in all_heading_matches: - heading_text = match.group().lstrip("#").strip() - - if heading_text.lower() == "introduction": + for match in _ANY_HEADING_RE.finditer(markdown): + if match.group().lstrip("#").strip().lower() == "introduction": body = _extract_body_after_heading(markdown, match) - if body: return body - break + raise ValueError( + "_extract_summary: no suitable summary section could be extracted from markdown." 
+ ) + - for match in all_heading_matches: - body = _extract_body_after_heading(markdown, match) +def _extract_title(markdown: str) -> str: + """Extract H1 title from cheatsheet markdown.""" + + match = _TITLE_RE.search(markdown) + if not match: + raise ValueError("_extract_title: no title found in markdown.") + + return match.group("title").strip() + +def _fallback_title() -> str: + """Return fallback title for malformed markdown.""" + + return "No title found." + + +def _fallback_summary(markdown: str) -> str: + """Return first non-empty paragraph after any heading, or 'No summary found.'""" + + for match in _ANY_HEADING_RE.finditer(markdown): + body = _extract_body_after_heading(markdown, match) if body: return body - raise ValueError("_extract_summary: no summary could be extracted from markdown.") + return "No summary found." def extract_cheatsheet_record( @@ -71,12 +87,24 @@ def extract_cheatsheet_record( ) -> CheatsheetRecord: """Extract a structured CheatsheetRecord from markdown content.""" - title_match = _TITLE_RE.search(markdown) - title = title_match.group("title").strip() + fallback_used = "false" + + try: + title = _extract_title(markdown) + except ValueError as e: + logging.warning(str(e)) + title = _fallback_title() + fallback_used = "true" + # Headings can be empty. 
headings = [m.group("heading").strip() for m in _HEADING_RE.finditer(markdown)] - summary = _extract_summary(markdown) + try: + summary = _extract_summary(markdown) + except ValueError as e: + logging.warning(str(e)) + summary = _fallback_summary(markdown) + fallback_used = "true" source_id = _derive_source_id(source_path) hyperlink = _derive_hyperlink(source_path) @@ -91,6 +119,6 @@ def extract_cheatsheet_record( category_hints=[], metadata={ "parser_version": PARSER_VERSION, - "fallback_used": FALLBACK_USED, + "fallback_used": fallback_used, }, ) From a5cd582b49ad6b433a51d7b2f812e8885a3fb5b2 Mon Sep 17 00:00:00 2001 From: Abhijeet Saharan <abhijeetsaharan2236@gmail.com> Date: Thu, 11 Jun 2026 13:11:37 +0530 Subject: [PATCH 6/8] feat: add B4 tests for cheatsheet extractor Signed-off-by: Abhijeet Saharan <abhijeetsaharan2236@gmail.com> --- .../tests/cheatsheet_extractor_test.py | 168 ++++++++++++++++++ .../parsers/cheatsheet_extractor.py | 19 +- 2 files changed, 183 insertions(+), 4 deletions(-) create mode 100644 application/tests/cheatsheet_extractor_test.py diff --git a/application/tests/cheatsheet_extractor_test.py b/application/tests/cheatsheet_extractor_test.py new file mode 100644 index 000000000..81ede71f3 --- /dev/null +++ b/application/tests/cheatsheet_extractor_test.py @@ -0,0 +1,168 @@ +import unittest +from application.utils.external_project_parsers.parsers.cheatsheet_extractor import ( + extract_cheatsheet_record, +) +from application.defs.cheatsheet_defs import SUMMARY_MAX_LENGTH + +SOURCE_PATH = "cheatsheets/Secrets_Management_Cheat_Sheet.md" +EXPECTED_SOURCE_ID = "Secrets_Management_Cheat_Sheet" +EXPECTED_HYPERLINK = ( + "https://cheatsheetseries.owasp.org/cheatsheets/Secrets_Management_Cheat_Sheet.html" +) + +NORMAL_MD = """\ +# Secrets Management Cheat Sheet + +## Introduction +Storage guidance. + +## Architectural Patterns +Use vaults and environment isolation. +""" + +MISSING_H1_MD = """\ +## Introduction +No H1 present. 
+ +## Details +More content. +""" + +EMPTY_MD = "" + +# No ## headings — _extract_summary raises, _fallback_summary matches # via _ANY_HEADING_RE +# and extracts body until len(markdown) since there is no next heading. +BODY_UNDER_H1_MD = """\ +# Single Heading Cheat Sheet + +Body text directly under H1, no subheadings at all. +""" + +# Leading spaces before # and ##malformed (no space) — both handled by \s* and (?!#) regex +MALFORMED_MD = """\ + # Malformed Title + +##malformed + +## Introduction +Some intro text. + +## Valid Heading +""" + + +class TestNormal(unittest.TestCase): + def setUp(self): + self.record = extract_cheatsheet_record(NORMAL_MD, SOURCE_PATH) + + # source, source_id, hyperlink, raw_markdown_path are derived from SOURCE_PATH + # and are independent of markdown content — verified once here for all cases + def test_source(self): + self.assertEqual(self.record.source, "owasp_cheatsheets") + + def test_source_id(self): + self.assertEqual(self.record.source_id, EXPECTED_SOURCE_ID) + + def test_hyperlink(self): + self.assertEqual(self.record.hyperlink, EXPECTED_HYPERLINK) + + def test_raw_markdown_path(self): + self.assertEqual(self.record.raw_markdown_path, SOURCE_PATH) + + def test_title(self): + self.assertEqual(self.record.title, "Secrets Management Cheat Sheet") + + def test_summary(self): + self.assertEqual(self.record.summary, "Storage guidance.") + + def test_summary_bounded(self): + # SUMMARY_MAX_LENGTH truncation happens in CheatsheetRecord.__post_init__ + # for every record — testing once here covers all cases + self.assertLessEqual(len(self.record.summary), SUMMARY_MAX_LENGTH) + + def test_headings(self): + self.assertIn("Introduction", self.record.headings) + self.assertIn("Architectural Patterns", self.record.headings) + + def test_fallback_not_used(self): + self.assertEqual(self.record.metadata["fallback_used"], "false") + + +class TestMissingH1(unittest.TestCase): + def setUp(self): + self.record = 
extract_cheatsheet_record(MISSING_H1_MD, SOURCE_PATH) + + def test_title_is_fallback(self): + self.assertEqual(self.record.title, "No title found.") + + def test_summary_from_introduction(self): + self.assertIn("no h1", self.record.summary.lower()) + + def test_headings_extracted(self): + self.assertIn("Introduction", self.record.headings) + self.assertIn("Details", self.record.headings) + + def test_fallback_used(self): + self.assertEqual(self.record.metadata["fallback_used"], "true") + + +class TestEmptyMarkdown(unittest.TestCase): + def setUp(self): + self.record = extract_cheatsheet_record(EMPTY_MD, SOURCE_PATH) + + def test_title_is_fallback(self): + self.assertEqual(self.record.title, "No title found.") + + def test_summary_no_summary_found(self): + # No headings at all — _fallback_summary returns this literal string + self.assertEqual(self.record.summary, "No summary found.") + + def test_headings_empty(self): + self.assertEqual(self.record.headings, []) + + def test_fallback_used(self): + self.assertEqual(self.record.metadata["fallback_used"], "true") + + +class TestBodyUnderH1(unittest.TestCase): + def setUp(self): + self.record = extract_cheatsheet_record(BODY_UNDER_H1_MD, SOURCE_PATH) + + def test_title(self): + self.assertEqual(self.record.title, "Single Heading Cheat Sheet") + + def test_summary_from_fallback_via_h1(self): + # _fallback_summary matches # heading, extracts body until len(markdown) + self.assertIn("body text", self.record.summary.lower()) + + def test_headings_empty(self): + # _HEADING_RE only matches ## — no ## present here + self.assertEqual(self.record.headings, []) + + def test_fallback_used(self): + self.assertEqual(self.record.metadata["fallback_used"], "true") + + +class TestMalformedHeadings(unittest.TestCase): + def setUp(self): + self.record = extract_cheatsheet_record(MALFORMED_MD, SOURCE_PATH) + + def test_malformed_h1_extracted(self): + self.assertEqual(self.record.title, "Malformed Title") + + def 
test_malformed_h2_in_headings(self): + self.assertIn("malformed", self.record.headings) + + def test_valid_headings_also_extracted(self): + self.assertIn("Introduction", self.record.headings) + self.assertIn("Valid Heading", self.record.headings) + + def test_summary_from_introduction(self): + self.assertIn("intro", self.record.summary.lower()) + + def test_fallback_not_used(self): + self.assertEqual(self.record.metadata["fallback_used"], "false") + + +if __name__ == "__main__": + unittest.main() diff --git a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py index 6b360fecc..f6e555207 100644 --- a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py +++ b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py @@ -8,9 +8,20 @@ CANONICAL_BASE_URL = "https://cheatsheetseries.owasp.org/cheatsheets/" -_TITLE_RE = re.compile(r"^#\s+(?P<title>.+)$", re.MULTILINE) -_HEADING_RE = re.compile(r"^##\s+(?P<heading>.+)$", re.MULTILINE) -_ANY_HEADING_RE = re.compile(r"^#{1,6}\s+.+$", re.MULTILINE) +_TITLE_RE = re.compile( + r"^\s*#(?!#)\s*(?P<title>.+?)$", + re.MULTILINE, +) + +_HEADING_RE = re.compile( + r"^\s*##(?!#)\s*(?P<heading>.+?)$", + re.MULTILINE, +) + +_ANY_HEADING_RE = re.compile( + r"^\s*#{1,6}(?!#)\s*.+?$", + re.MULTILINE, +) def _derive_source_id(source_path: str) -> str: @@ -44,7 +55,7 @@ def _extract_summary(markdown: str) -> str: """Extract summary from Introduction section in cheatsheet markdown.""" for match in _ANY_HEADING_RE.finditer(markdown): - if match.group().lstrip("#").strip().lower() == "introduction": + if match.group().strip().lstrip("#").strip().lower() == "introduction": body = _extract_body_after_heading(markdown, match) if body: return body From 27a7f44851487fd312ea8fbbfb47860a7cdb53d2 Mon Sep 17 00:00:00 2001 From: Abhijeet Saharan <abhijeetsaharan2236@gmail.com> Date: Sun, 14 Jun 2026 14:09:53 +0530 
Subject: [PATCH 7/8] docs: add checkpoint B5 documentation and refine test comments Signed-off-by: Abhijeet Saharan <abhijeetsaharan2236@gmail.com> --- .../tests/cheatsheet_extractor_test.py | 21 +- docs/rfc-structured-extraction.md | 258 ++++++++++++++++++ 2 files changed, 269 insertions(+), 10 deletions(-) create mode 100644 docs/rfc-structured-extraction.md diff --git a/application/tests/cheatsheet_extractor_test.py b/application/tests/cheatsheet_extractor_test.py index 81ede71f3..ea159257f 100644 --- a/application/tests/cheatsheet_extractor_test.py +++ b/application/tests/cheatsheet_extractor_test.py @@ -30,15 +30,16 @@ EMPTY_MD = "" -# No ## headings — _extract_summary raises, _fallback_summary matches # via _ANY_HEADING_RE -# and extracts body until len(markdown) since there is no next heading. +# No ## Introduction heading exists, so summary extraction +# falls back to the first available body content. BODY_UNDER_H1_MD = """\ # Single Heading Cheat Sheet Body text directly under H1, no subheadings at all. """ -# Leading spaces before # and ##malformed (no space) — both handled by \s* and (?!#) regex +# Leading whitespace before H1 and malformed ## headings +# should still be normalized and extracted correctly. MALFORMED_MD = """\ # Malformed Title @@ -55,8 +56,8 @@ class TestNormal(unittest.TestCase): def setUp(self): self.record = extract_cheatsheet_record(NORMAL_MD, SOURCE_PATH) - # source, source_id, hyperlink, raw_markdown_path are derived from SOURCE_PATH - # and are independent of markdown content — verified once here for all cases + # source-derived fields should remain deterministic and + # independent of markdown content across all extraction paths. 
def test_source(self): self.assertEqual(self.record.source, "owasp_cheatsheets") @@ -76,8 +77,8 @@ def test_summary(self): self.assertEqual(self.record.summary, "Storage guidance.") def test_summary_bounded(self): - # SUMMARY_MAX_LENGTH truncation happens in CheatsheetRecord.__post_init__ - # for every record — testing once here covers all cases + # Summary truncation is enforced centrally via + # CheatsheetRecord.__post_init__. self.assertLessEqual(len(self.record.summary), SUMMARY_MAX_LENGTH) def test_headings(self): @@ -114,7 +115,7 @@ def test_title_is_fallback(self): self.assertEqual(self.record.title, "No title found.") def test_summary_no_summary_found(self): - # No headings at all — _fallback_summary returns this literal string + # Empty markdown should trigger terminal summary fallback. self.assertEqual(self.record.summary, "No summary found.") def test_headings_empty(self): @@ -132,11 +133,11 @@ def test_title(self): self.assertEqual(self.record.title, "Single Heading Cheat Sheet") def test_summary_from_fallback_via_h1(self): - # _fallback_summary matches # heading, extracts body until len(markdown) + # Summary fallback should extract body content beneath the H1 section. self.assertIn("body text", self.record.summary.lower()) def test_headings_empty(self): - # _HEADING_RE only matches ## — no ## present here + # No valid ## headings should produce an empty headings list. self.assertEqual(self.record.headings, []) def test_fallback_used(self): diff --git a/docs/rfc-structured-extraction.md b/docs/rfc-structured-extraction.md new file mode 100644 index 000000000..cb2913b34 --- /dev/null +++ b/docs/rfc-structured-extraction.md @@ -0,0 +1,258 @@ +# RFC Workstream B — Structured Extraction + +This document explains the implementation and behavior of RFC Workstream B +(Structured Extraction) from Cheatsheet to CRE Mapping RFC. 
+ +The goal of this module is to convert OWASP Cheat Sheet markdown into a +deterministic structured object that downstream RFC workstreams can consume +for categorization, retrieval, reranking, and mapping generation. + +The implementation is primarily located in: + +* `cheatsheet_defs.py` +* `cheatsheet_extractor.py` + +--- + +## Sources for more context + +* RFC: + `docs/rfc/cheatsheets-llm-autonomous-mapping-rfc.md` + +* Checkpoints B1 & B2 implementation PR: + `https://github.com/OWASP/OpenCRE/pull/912` + +* Checkpoints B3 & B4 implementation PR: + `https://github.com/OWASP/OpenCRE/pull/921` + +--- + +## What Workstream B implements + +**The implementation strictly follows the RFC extraction contract and prioritizes deterministic extraction behavior.** + +It defines a typed dataclass named `CheatsheetRecord`. + +This object represents the structured extraction result returned from: + +```python +extract_cheatsheet_record(markdown, source_path) +``` + +The extractor parses OWASP Cheat Sheet markdown and returns normalized +structured information about a cheatsheet. + +`CheatsheetRecord` contains: + +* `source` +* `source_id` +* `title` +* `hyperlink` +* `summary` +* `headings` +* `raw_markdown_path` +* `category_hints` +* `metadata` + +--- + +## Fallback behavior + +The extractor contains fallback functions capable of handling incomplete or +malformed markdown containing: + +* missing titles, +* missing summary sources, +* malformed headings. + +These fallback paths ensure that extraction still returns a valid +`CheatsheetRecord` object instead of failing entirely. + +Fallback behavior is explicitly surfaced through: + +```json +"metadata": { + "fallback_used": "true" +} +``` + +This allows downstream workstreams to identify records that required fallback +logic during extraction and downstream normalization. 
+ +--- + +## Fallback decision tree + +```text +extract_cheatsheet_record(markdown, source_path) + +│ +├── _extract_title(markdown) +│ ├── H1 title exists +│ │ → extract and normalize title +│ │ +│ └── H1 title missing +│ → _fallback_title() +│ → "No title found." +│ → metadata["fallback_used"] = "true" +│ +└── _extract_summary(markdown) + ├── "Introduction" heading exists with body content + │ → body beneath "Introduction" extracted as summary + │ → summary normalized and truncated up to a specific length. + │ + └── Introduction section missing or invalid + → _fallback_summary(markdown) + │ + ├── first heading with body content exists + │ → its body returned as summary + │ + └── no usable heading/body content exists + → "No summary found." + → metadata["fallback_used"] = "true" +``` + +--- + +## Extraction examples + +The following examples demonstrate deterministic extractor behavior across +different markdown shapes. + +Notes: + +* Currently, `category_hints` is intentionally returned as an initial empty + list during v1. + +* `raw_markdown_path`, `hyperlink`, and `source_id` are derived from + `source_path` (Module A) and are independent of markdown content. + +--- + +## 1. Normal cheat sheet + +### Example Input + +```markdown +# Secrets Management Cheat Sheet + +## Introduction +Storage guidance. + +## Architectural Patterns +Use vaults and environment isolation. +``` + +### Output + +```json +{ + "source": "owasp_cheatsheets", + "source_id": "Secrets_Management_Cheat_Sheet", + "title": "Secrets Management Cheat Sheet", + "hyperlink": "https://cheatsheetseries.owasp.org/cheatsheets/Secrets_Management_Cheat_Sheet.html", + "summary": "Storage guidance.", + "headings": ["Introduction", "Architectural Patterns"], + "raw_markdown_path": "cheatsheets/Secrets_Management_Cheat_Sheet.md", + "category_hints": [], + "metadata": { + "parser_version": "v1", + "fallback_used": "false" + } +} +``` + +### Notes + +* No fallback logic was required. + +--- + +## 2. 
Missing H1 (fallback title) + +### Input + +```markdown +## Introduction +No H1 present. + +## Details +More content. +``` + +### Output + +```json +{ + "source": "owasp_cheatsheets", + "source_id": "Example_Cheat_Sheet", + "title": "No title found.", + "hyperlink": "https://cheatsheetseries.owasp.org/cheatsheets/Example_Cheat_Sheet.html", + "summary": "No H1 present.", + "headings": ["Introduction", "Details"], + "raw_markdown_path": "cheatsheets/Example_Cheat_Sheet.md", + "category_hints": [], + "metadata": { + "parser_version": "v1", + "fallback_used": "true" + } +} +``` + +### Notes + +* No H1 title exists, so the title defaults to `"No title found."` + +--- + +## 3. Missing Introduction section (summary fallback) + +### Input + +```markdown +# Single Heading Cheat Sheet + +## Authentication + +### Storage +Secrets should be encrypted. +``` + +### Output + +```json +{ + "source": "owasp_cheatsheets", + "source_id": "Single_Heading_Cheat_Sheet", + "title": "Single Heading Cheat Sheet", + "hyperlink": "https://cheatsheetseries.owasp.org/cheatsheets/Single_Heading_Cheat_Sheet.html", + "summary": "Secrets should be encrypted.", + "headings": ["Authentication"], + "raw_markdown_path": "cheatsheets/Single_Heading_Cheat_Sheet.md", + "category_hints": [], + "metadata": { + "parser_version": "v1", + "fallback_used": "true" + } +} +``` + +### Notes + +* No `Introduction` heading exists, so summary fallback logic is used. +* The fallback scans all headings and returns the first non-empty body it + finds — in this case the content beneath `### Storage`. +* Only `##`-level headings appear in `headings` — `### Storage` is excluded. + +--- + +## Additional behavior notes + +It is ensured that markdown files with malformed title/headings such as: + +* With leading whitespace (e.g. ` # My Title`, ` ## My Heading`) +* No space after the marker (e.g. `##Authentication`) + +are extracted correctly. 
+ +--- \ No newline at end of file From 8a3ac8feb0a90363472ffcd9177e68357ff2da5c Mon Sep 17 00:00:00 2001 From: Abhijeet <abhijeetsaharan2236@gmail.com> Date: Sun, 14 Jun 2026 14:49:05 +0530 Subject: [PATCH 8/8] docs: refine malformed heading behavior notes Signed-off-by: Abhijeet <abhijeetsaharan2236@gmail.com> --- docs/rfc-structured-extraction.md | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/docs/rfc-structured-extraction.md b/docs/rfc-structured-extraction.md index cb2913b34..06296f5ec 100644 --- a/docs/rfc-structured-extraction.md +++ b/docs/rfc-structured-extraction.md @@ -248,11 +248,9 @@ Secrets should be encrypted. ## Additional behavior notes -It is ensured that markdown files with malformed title/headings such as: +The extractor correctly handles markdown files with malformed titles/headings such as: -* With leading whitespace (e.g. ` # My Title`, ` ## My Heading`) +* Titles with leading whitespace * No space after the marker (e.g. `##Authentication`) -are extracted correctly. - ---- \ No newline at end of file +---