From ade7c1870b9fb3d6ee0e836710d7f38da94b96e8 Mon Sep 17 00:00:00 2001
From: Abhijeet Saharan <abhijeetsaharan2236@gmail.com>
Date: Sun, 31 May 2026 07:01:14 +0530
Subject: [PATCH 1/8] feat: implement structured extraction checkpoints B1 and
 B2

Signed-off-by: Abhijeet Saharan <abhijeetsaharan2236@gmail.com>
---
 .../external_project_parsers/parsers/cheatsheet_extractor.py   | 3 ---
 1 file changed, 3 deletions(-)
diff --git a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py
index 384afe932..41b013bad 100644
--- a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py
+++ b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py
@@ -1,11 +1,9 @@
 import os
 import re
-
 from application.defs.cheatsheet_defs import CheatsheetRecord
 
 PARSER_VERSION = "v1"
 FALLBACK_USED = "false"
-
 CANONICAL_BASE_URL = "https://cheatsheetseries.owasp.org/cheatsheets/"
 
 _TITLE_RE = re.compile(r"^#\s+(?P<title>.+)$", re.MULTILINE)
@@ -53,7 +51,6 @@ def _extract_summary(markdown: str) -> str:
 
             if body:
                 return body
-
             break
 
     for match in all_heading_matches:

From a9e54a3d41e54c3a97d205ecfdd7c2358be1fdd1 Mon Sep 17 00:00:00 2001
From: Abhijeet Saharan <abhijeetsaharan2236@gmail.com>
Date: Fri, 5 Jun 2026 14:00:57 +0530
Subject: [PATCH 2/8] docs: add docstrings

Signed-off-by: Abhijeet Saharan <abhijeetsaharan2236@gmail.com>
---
 application/defs/cheatsheet_defs.py                      | 2 +-
 .../parsers/cheatsheet_extractor.py                      | 9 +++++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/application/defs/cheatsheet_defs.py b/application/defs/cheatsheet_defs.py
index 0e1dc4267..162c4a068 100644
--- a/application/defs/cheatsheet_defs.py
+++ b/application/defs/cheatsheet_defs.py
@@ -78,4 +78,4 @@ def __post_init__(self):
                 raise ValueError(
                     "CheatsheetRecord: metadata keys and values must be strings, "
                     f"got {key!r}: {value!r}"
-                )
+                )
\ No newline at end of file
diff --git a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py
index 41b013bad..78ef319ea 100644
--- a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py
+++ b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py
@@ -1,9 +1,11 @@
 import os
 import re
+
 from application.defs.cheatsheet_defs import CheatsheetRecord
 
 PARSER_VERSION = "v1"
 FALLBACK_USED = "false"
+
 CANONICAL_BASE_URL = "https://cheatsheetseries.owasp.org/cheatsheets/"
 
 _TITLE_RE = re.compile(r"^#\s+(?P<title>.+)$", re.MULTILINE)
@@ -51,6 +53,7 @@ def _extract_summary(markdown: str) -> str:
 
             if body:
                 return body
+
             break
 
     for match in all_heading_matches:
@@ -59,7 +62,9 @@ def _extract_summary(markdown: str) -> str:
         if body:
             return body
 
-    raise ValueError("_extract_summary: no summary could be extracted from markdown.")
+    raise ValueError(
+        "_extract_summary: no summary could be extracted from markdown."
+    )
 
 
 def extract_cheatsheet_record(
@@ -90,4 +95,4 @@ def extract_cheatsheet_record(
             "parser_version": PARSER_VERSION,
             "fallback_used": FALLBACK_USED,
         },
-    )
+    )
\ No newline at end of file

From 26d7e92773574916ae8b734dc8b255e0c4e94a2a Mon Sep 17 00:00:00 2001
From: Abhijeet Saharan <abhijeetsaharan2236@gmail.com>
Date: Fri, 5 Jun 2026 14:43:30 +0530
Subject: [PATCH 3/8] fix: validate normalized string field values correctly

Signed-off-by: Abhijeet Saharan <abhijeetsaharan2236@gmail.com>
---
 application/defs/cheatsheet_defs.py                         | 2 +-
 .../parsers/cheatsheet_extractor.py                         | 6 ++----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/application/defs/cheatsheet_defs.py b/application/defs/cheatsheet_defs.py
index 162c4a068..0e1dc4267 100644
--- a/application/defs/cheatsheet_defs.py
+++ b/application/defs/cheatsheet_defs.py
@@ -78,4 +78,4 @@ def __post_init__(self):
                 raise ValueError(
                     "CheatsheetRecord: metadata keys and values must be strings, "
                     f"got {key!r}: {value!r}"
-                )
\ No newline at end of file
+                )
diff --git a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py
index 78ef319ea..384afe932 100644
--- a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py
+++ b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py
@@ -62,9 +62,7 @@ def _extract_summary(markdown: str) -> str:
         if body:
             return body
 
-    raise ValueError(
-        "_extract_summary: no summary could be extracted from markdown."
-    )
+    raise ValueError("_extract_summary: no summary could be extracted from markdown.")
 
 
 def extract_cheatsheet_record(
@@ -95,4 +93,4 @@ def extract_cheatsheet_record(
             "parser_version": PARSER_VERSION,
             "fallback_used": FALLBACK_USED,
         },
-    )
\ No newline at end of file
+    )

From dc9d2d0f39fff76136a7f1285eb04d78f50dec21 Mon Sep 17 00:00:00 2001
From: Abhijeet Saharan <abhijeetsaharan2236@gmail.com>
Date: Fri, 5 Jun 2026 14:43:30 +0530
Subject: [PATCH 4/8] fix: validate normalized string field values correctly

Signed-off-by: Abhijeet Saharan <abhijeetsaharan2236@gmail.com>
---
 application/defs/cheatsheet_defs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/application/defs/cheatsheet_defs.py b/application/defs/cheatsheet_defs.py
index 0e1dc4267..91a9a3b1e 100644
--- a/application/defs/cheatsheet_defs.py
+++ b/application/defs/cheatsheet_defs.py
@@ -43,7 +43,7 @@ def __post_init__(self):
         }
 
         # Validate fields which require string values.
-        for field_name in required_str_fields:
+        for field_name in required_str_fields:  
             value = getattr(self, field_name)
 
             if not isinstance(value, str) or not value:

From a76be6138161d2ba7c8ebc46b3a489372e3e0992 Mon Sep 17 00:00:00 2001
From: Abhijeet Saharan <abhijeetsaharan2236@gmail.com>
Date: Mon, 8 Jun 2026 13:31:35 +0530
Subject: [PATCH 5/8] feat: implement structured extraction checkpoint B3

Signed-off-by: Abhijeet Saharan <abhijeetsaharan2236@gmail.com>
---
 application/defs/cheatsheet_defs.py           |  2 +-
 .../parsers/cheatsheet_extractor.py           | 62 ++++++++++++++-----
 2 files changed, 46 insertions(+), 18 deletions(-)

diff --git a/application/defs/cheatsheet_defs.py b/application/defs/cheatsheet_defs.py
index 91a9a3b1e..0e1dc4267 100644
--- a/application/defs/cheatsheet_defs.py
+++ b/application/defs/cheatsheet_defs.py
@@ -43,7 +43,7 @@ def __post_init__(self):
         }
 
         # Validate fields which require string values.
-        for field_name in required_str_fields:  
+        for field_name in required_str_fields:
             value = getattr(self, field_name)
 
             if not isinstance(value, str) or not value:
diff --git a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py
index 384afe932..6b360fecc 100644
--- a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py
+++ b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py
@@ -1,10 +1,10 @@
+import logging
 import os
 import re
 
 from application.defs.cheatsheet_defs import CheatsheetRecord
 
 PARSER_VERSION = "v1"
-FALLBACK_USED = "false"
 
 CANONICAL_BASE_URL = "https://cheatsheetseries.owasp.org/cheatsheets/"
 
@@ -41,28 +41,44 @@ def _extract_body_after_heading(markdown: str, heading_match: re.Match) -> str:
 
 
 def _extract_summary(markdown: str) -> str:
-    """Extract a summary section from cheatsheet markdown."""
+    """Extract summary from Introduction section in cheatsheet markdown."""
 
-    all_heading_matches = list(_ANY_HEADING_RE.finditer(markdown))
-
-    for match in all_heading_matches:
-        heading_text = match.group().lstrip("#").strip()
-
-        if heading_text.lower() == "introduction":
+    for match in _ANY_HEADING_RE.finditer(markdown):
+        if match.group().lstrip("#").strip().lower() == "introduction":
             body = _extract_body_after_heading(markdown, match)
-
             if body:
                 return body
 
-            break
+    raise ValueError(
+        "_extract_summary: no suitable summary section could be extracted from markdown."
+    )
+
 
-    for match in all_heading_matches:
-        body = _extract_body_after_heading(markdown, match)
+def _extract_title(markdown: str) -> str:
+    """Extract H1 title from cheatsheet markdown."""
+
+    match = _TITLE_RE.search(markdown)
+    if not match:
+        raise ValueError("_extract_title: no title found in markdown.")
+
+    return match.group("title").strip()
 
+
+def _fallback_title() -> str:
+    """Return fallback title for malformed markdown."""
+
+    return "No title found."
+
+
+def _fallback_summary(markdown: str) -> str:
+    """Return first non-empty paragraph after any heading, or 'No summary found.'"""
+
+    for match in _ANY_HEADING_RE.finditer(markdown):
+        body = _extract_body_after_heading(markdown, match)
         if body:
             return body
 
-    raise ValueError("_extract_summary: no summary could be extracted from markdown.")
+    return "No summary found."
 
 
 def extract_cheatsheet_record(
@@ -71,12 +87,24 @@ def extract_cheatsheet_record(
 ) -> CheatsheetRecord:
     """Extract a structured CheatsheetRecord from markdown content."""
 
-    title_match = _TITLE_RE.search(markdown)
-    title = title_match.group("title").strip()
+    fallback_used = "false"
+
+    try:
+        title = _extract_title(markdown)
+    except ValueError as e:
+        logging.warning(str(e))
+        title = _fallback_title()
+        fallback_used = "true"
 
+    # Headings can be empty.
     headings = [m.group("heading").strip() for m in _HEADING_RE.finditer(markdown)]
 
-    summary = _extract_summary(markdown)
+    try:
+        summary = _extract_summary(markdown)
+    except ValueError as e:
+        logging.warning(str(e))
+        summary = _fallback_summary(markdown)
+        fallback_used = "true"
 
     source_id = _derive_source_id(source_path)
     hyperlink = _derive_hyperlink(source_path)
@@ -91,6 +119,6 @@ def extract_cheatsheet_record(
         category_hints=[],
         metadata={
             "parser_version": PARSER_VERSION,
-            "fallback_used": FALLBACK_USED,
+            "fallback_used": fallback_used,
         },
     )

From a5cd582b49ad6b433a51d7b2f812e8885a3fb5b2 Mon Sep 17 00:00:00 2001
From: Abhijeet Saharan <abhijeetsaharan2236@gmail.com>
Date: Thu, 11 Jun 2026 13:11:37 +0530
Subject: [PATCH 6/8] feat: add B4 tests for cheatsheet extractor

Signed-off-by: Abhijeet Saharan <abhijeetsaharan2236@gmail.com>
---
 .../tests/cheatsheet_extractor_test.py        | 168 ++++++++++++++++++
 .../parsers/cheatsheet_extractor.py           |  19 +-
 2 files changed, 183 insertions(+), 4 deletions(-)
 create mode 100644 application/tests/cheatsheet_extractor_test.py

diff --git a/application/tests/cheatsheet_extractor_test.py b/application/tests/cheatsheet_extractor_test.py
new file mode 100644
index 000000000..81ede71f3
--- /dev/null
+++ b/application/tests/cheatsheet_extractor_test.py
@@ -0,0 +1,168 @@
+import unittest
+from application.utils.external_project_parsers.parsers.cheatsheet_extractor import (
+    extract_cheatsheet_record,
+)
+from application.defs.cheatsheet_defs import SUMMARY_MAX_LENGTH
+
+SOURCE_PATH = "cheatsheets/Secrets_Management_Cheat_Sheet.md"
+EXPECTED_SOURCE_ID = "Secrets_Management_Cheat_Sheet"
+EXPECTED_HYPERLINK = (
+    "https://cheatsheetseries.owasp.org/cheatsheets/Secrets_Management_Cheat_Sheet.html"
+)
+
+NORMAL_MD = """\
+# Secrets Management Cheat Sheet
+
+## Introduction
+Storage guidance.
+
+## Architectural Patterns
+Use vaults and environment isolation.
+"""
+
+MISSING_H1_MD = """\
+## Introduction
+No H1 present.
+
+## Details
+More content.
+"""
+
+EMPTY_MD = ""
+
+# No ## headings — _extract_summary raises, _fallback_summary matches # via _ANY_HEADING_RE
+# and extracts body until len(markdown) since there is no next heading.
+BODY_UNDER_H1_MD = """\
+# Single Heading Cheat Sheet
+
+Body text directly under H1, no subheadings at all.
+"""
+
+# Leading spaces before # and ##malformed (no space) — both handled by \s* and (?!#) regex
+MALFORMED_MD = """\
+   # Malformed Title
+
+##malformed
+
+## Introduction
+Some intro text.
+
+## Valid Heading
+"""
+
+
+class TestNormal(unittest.TestCase):
+    def setUp(self):
+        self.record = extract_cheatsheet_record(NORMAL_MD, SOURCE_PATH)
+
+    # source, source_id, hyperlink, raw_markdown_path are derived from SOURCE_PATH
+    # and are independent of markdown content — verified once here for all cases
+    def test_source(self):
+        self.assertEqual(self.record.source, "owasp_cheatsheets")
+
+    def test_source_id(self):
+        self.assertEqual(self.record.source_id, EXPECTED_SOURCE_ID)
+
+    def test_hyperlink(self):
+        self.assertEqual(self.record.hyperlink, EXPECTED_HYPERLINK)
+
+    def test_raw_markdown_path(self):
+        self.assertEqual(self.record.raw_markdown_path, SOURCE_PATH)
+
+    def test_title(self):
+        self.assertEqual(self.record.title, "Secrets Management Cheat Sheet")
+
+    def test_summary(self):
+        self.assertEqual(self.record.summary, "Storage guidance.")
+
+    def test_summary_bounded(self):
+        # SUMMARY_MAX_LENGTH truncation happens in CheatsheetRecord.__post_init__
+        # for every record — testing once here covers all cases
+        self.assertLessEqual(len(self.record.summary), SUMMARY_MAX_LENGTH)
+
+    def test_headings(self):
+        self.assertIn("Introduction", self.record.headings)
+        self.assertIn("Architectural Patterns", self.record.headings)
+
+    def test_fallback_not_used(self):
+        self.assertEqual(self.record.metadata["fallback_used"], "false")
+
+
+class TestMissingH1(unittest.TestCase):
+    def setUp(self):
+        self.record = extract_cheatsheet_record(MISSING_H1_MD, SOURCE_PATH)
+
+    def test_title_is_fallback(self):
+        self.assertEqual(self.record.title, "No title found.")
+
+    def test_summary_from_introduction(self):
+        self.assertIn("no h1", self.record.summary.lower())
+
+    def test_headings_extracted(self):
+        self.assertIn("Introduction", self.record.headings)
+        self.assertIn("Details", self.record.headings)
+
+    def test_fallback_used(self):
+        self.assertEqual(self.record.metadata["fallback_used"], "true")
+
+
+class TestEmptyMarkdown(unittest.TestCase):
+    def setUp(self):
+        self.record = extract_cheatsheet_record(EMPTY_MD, SOURCE_PATH)
+
+    def test_title_is_fallback(self):
+        self.assertEqual(self.record.title, "No title found.")
+
+    def test_summary_no_summary_found(self):
+        # No headings at all — _fallback_summary returns this literal string
+        self.assertEqual(self.record.summary, "No summary found.")
+
+    def test_headings_empty(self):
+        self.assertEqual(self.record.headings, [])
+
+    def test_fallback_used(self):
+        self.assertEqual(self.record.metadata["fallback_used"], "true")
+
+
+class TestBodyUnderH1(unittest.TestCase):
+    def setUp(self):
+        self.record = extract_cheatsheet_record(BODY_UNDER_H1_MD, SOURCE_PATH)
+
+    def test_title(self):
+        self.assertEqual(self.record.title, "Single Heading Cheat Sheet")
+
+    def test_summary_from_fallback_via_h1(self):
+        # _fallback_summary matches # heading, extracts body until len(markdown)
+        self.assertIn("body text", self.record.summary.lower())
+
+    def test_headings_empty(self):
+        # _HEADING_RE only matches ## — no ## present here
+        self.assertEqual(self.record.headings, [])
+
+    def test_fallback_used(self):
+        self.assertEqual(self.record.metadata["fallback_used"], "true")
+
+
+class TestMalformedHeadings(unittest.TestCase):
+    def setUp(self):
+        self.record = extract_cheatsheet_record(MALFORMED_MD, SOURCE_PATH)
+
+    def test_malformed_h1_extracted(self):
+        self.assertEqual(self.record.title, "Malformed Title")
+
+    def test_malformed_h2_in_headings(self):
+        self.assertIn("malformed", self.record.headings)
+
+    def test_valid_headings_also_extracted(self):
+        self.assertIn("Introduction", self.record.headings)
+        self.assertIn("Valid Heading", self.record.headings)
+
+    def test_summary_from_introduction(self):
+        self.assertIn("intro", self.record.summary.lower())
+
+    def test_fallback_not_used(self):
+        self.assertEqual(self.record.metadata["fallback_used"], "false")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py
index 6b360fecc..f6e555207 100644
--- a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py
+++ b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py
@@ -8,9 +8,20 @@
 
 CANONICAL_BASE_URL = "https://cheatsheetseries.owasp.org/cheatsheets/"
 
-_TITLE_RE = re.compile(r"^#\s+(?P<title>.+)$", re.MULTILINE)
-_HEADING_RE = re.compile(r"^##\s+(?P<heading>.+)$", re.MULTILINE)
-_ANY_HEADING_RE = re.compile(r"^#{1,6}\s+.+$", re.MULTILINE)
+_TITLE_RE = re.compile(
+    r"^\s*#(?!#)\s*(?P<title>.+?)$",
+    re.MULTILINE,
+)
+
+_HEADING_RE = re.compile(
+    r"^\s*##(?!#)\s*(?P<heading>.+?)$",
+    re.MULTILINE,
+)
+
+_ANY_HEADING_RE = re.compile(
+    r"^\s*#{1,6}(?!#)\s*.+?$",
+    re.MULTILINE,
+)
 
 
 def _derive_source_id(source_path: str) -> str:
@@ -44,7 +55,7 @@ def _extract_summary(markdown: str) -> str:
     """Extract summary from Introduction section in cheatsheet markdown."""
 
     for match in _ANY_HEADING_RE.finditer(markdown):
-        if match.group().lstrip("#").strip().lower() == "introduction":
+        if match.group().strip().lstrip("#").strip().lower() == "introduction":
             body = _extract_body_after_heading(markdown, match)
             if body:
                 return body

From 27a7f44851487fd312ea8fbbfb47860a7cdb53d2 Mon Sep 17 00:00:00 2001
From: Abhijeet Saharan <abhijeetsaharan2236@gmail.com>
Date: Sun, 14 Jun 2026 14:09:53 +0530
Subject: [PATCH 7/8] docs: add checkpoint B5 documentation and refine test
 comments

Signed-off-by: Abhijeet Saharan <abhijeetsaharan2236@gmail.com>
---
 .../tests/cheatsheet_extractor_test.py        |  21 +-
 docs/rfc-structured-extraction.md             | 258 ++++++++++++++++++
 2 files changed, 269 insertions(+), 10 deletions(-)
 create mode 100644 docs/rfc-structured-extraction.md

diff --git a/application/tests/cheatsheet_extractor_test.py b/application/tests/cheatsheet_extractor_test.py
index 81ede71f3..ea159257f 100644
--- a/application/tests/cheatsheet_extractor_test.py
+++ b/application/tests/cheatsheet_extractor_test.py
@@ -30,15 +30,16 @@
 
 EMPTY_MD = ""
 
-# No ## headings — _extract_summary raises, _fallback_summary matches # via _ANY_HEADING_RE
-# and extracts body until len(markdown) since there is no next heading.
+# No ## Introduction heading exists, so summary extraction
+# falls back to the first available body content.
 BODY_UNDER_H1_MD = """\
 # Single Heading Cheat Sheet
 
 Body text directly under H1, no subheadings at all.
 """
 
-# Leading spaces before # and ##malformed (no space) — both handled by \s* and (?!#) regex
+# Leading whitespace before H1 and malformed ## headings
+# should still be normalized and extracted correctly.
 MALFORMED_MD = """\
    # Malformed Title
 
@@ -55,8 +56,8 @@ class TestNormal(unittest.TestCase):
     def setUp(self):
         self.record = extract_cheatsheet_record(NORMAL_MD, SOURCE_PATH)
 
-    # source, source_id, hyperlink, raw_markdown_path are derived from SOURCE_PATH
-    # and are independent of markdown content — verified once here for all cases
+    # source-derived fields should remain deterministic and
+    # independent of markdown content across all extraction paths.
     def test_source(self):
         self.assertEqual(self.record.source, "owasp_cheatsheets")
 
@@ -76,8 +77,8 @@ def test_summary(self):
         self.assertEqual(self.record.summary, "Storage guidance.")
 
     def test_summary_bounded(self):
-        # SUMMARY_MAX_LENGTH truncation happens in CheatsheetRecord.__post_init__
-        # for every record — testing once here covers all cases
+        # Summary truncation is enforced centrally via
+        # CheatsheetRecord.__post_init__.
         self.assertLessEqual(len(self.record.summary), SUMMARY_MAX_LENGTH)
 
     def test_headings(self):
@@ -114,7 +115,7 @@ def test_title_is_fallback(self):
         self.assertEqual(self.record.title, "No title found.")
 
     def test_summary_no_summary_found(self):
-        # No headings at all — _fallback_summary returns this literal string
+        # Empty markdown should trigger terminal summary fallback.
         self.assertEqual(self.record.summary, "No summary found.")
 
     def test_headings_empty(self):
@@ -132,11 +133,11 @@ def test_title(self):
         self.assertEqual(self.record.title, "Single Heading Cheat Sheet")
 
     def test_summary_from_fallback_via_h1(self):
-        # _fallback_summary matches # heading, extracts body until len(markdown)
+        # Summary fallback should extract body content beneath the H1 section.
         self.assertIn("body text", self.record.summary.lower())
 
     def test_headings_empty(self):
-        # _HEADING_RE only matches ## — no ## present here
+        # No valid ## headings should produce an empty headings list.
         self.assertEqual(self.record.headings, [])
 
     def test_fallback_used(self):
diff --git a/docs/rfc-structured-extraction.md b/docs/rfc-structured-extraction.md
new file mode 100644
index 000000000..cb2913b34
--- /dev/null
+++ b/docs/rfc-structured-extraction.md
@@ -0,0 +1,258 @@
+# RFC Workstream B — Structured Extraction
+
+This document explains the implementation and behavior of RFC Workstream B
+(Structured Extraction) from the Cheatsheet-to-CRE Mapping RFC.
+
+The goal of this module is to convert OWASP Cheat Sheet markdown into a
+deterministic structured object that downstream RFC workstreams can consume
+for categorization, retrieval, reranking, and mapping generation.
+
+The implementation is primarily located in:
+
+* `cheatsheet_defs.py`
+* `cheatsheet_extractor.py`
+
+---
+
+## Sources for more context
+
+* RFC:
+  `docs/rfc/cheatsheets-llm-autonomous-mapping-rfc.md`
+
+* Checkpoints B1 & B2 implementation PR:
+  `https://github.com/OWASP/OpenCRE/pull/912`
+
+* Checkpoints B3 & B4 implementation PR:
+  `https://github.com/OWASP/OpenCRE/pull/921`
+
+---
+
+## What Workstream B implements
+
+**The implementation strictly follows the RFC extraction contract and prioritizes deterministic extraction behavior.**
+
+It defines a typed dataclass named `CheatsheetRecord`.
+
+This object represents the structured extraction result returned from:
+
+```python
+extract_cheatsheet_record(markdown, source_path)
+```
+
+The extractor parses OWASP Cheat Sheet markdown and returns normalized
+structured information about a cheatsheet.
+
+`CheatsheetRecord` contains:
+
+* `source`
+* `source_id`
+* `title`
+* `hyperlink`
+* `summary`
+* `headings`
+* `raw_markdown_path`
+* `category_hints`
+* `metadata`
+
+---
+
+## Fallback behavior
+
+The extractor contains fallback functions capable of handling incomplete or
+malformed markdown, such as:
+
+* missing titles,
+* missing summary sources,
+* malformed headings.
+
+These fallback paths ensure that extraction still returns a valid
+`CheatsheetRecord` object instead of failing entirely.
+
+Fallback behavior is explicitly surfaced through:
+
+```json
+"metadata": {
+  "fallback_used": "true"
+}
+```
+
+This allows downstream workstreams to identify records that required fallback
+logic during extraction and downstream normalization.
+
+---
+
+## Fallback decision tree
+
+```text
+extract_cheatsheet_record(markdown, source_path)
+
+│
+├── _extract_title(markdown)
+│     ├── H1 title exists
+│     │      → extract and normalize title
+│     │
+│     └── H1 title missing
+│            → _fallback_title()
+│            → "No title found."
+│            → metadata["fallback_used"] = "true"
+│
+└── _extract_summary(markdown)
+      ├── "Introduction" heading exists with body content
+      │      → body beneath "Introduction" extracted as summary
+      │      → summary normalized and truncated up to a specific length.
+      │
+      └── Introduction section missing or invalid
+             → _fallback_summary(markdown)
+             │
+             ├── first heading with body content exists
+             │      → its body returned as summary
+             │
+             └── no usable heading/body content exists
+                    → "No summary found."
+             → metadata["fallback_used"] = "true"
+```
+
+---
+
+## Extraction examples
+
+The following examples demonstrate deterministic extractor behavior across
+different markdown shapes.
+
+Notes:
+
+* In v1, `category_hints` is intentionally always returned as an empty
+  list.
+
+* `raw_markdown_path`, `hyperlink`, and `source_id` are derived from
+  `source_path` (Module A) and are independent of markdown content.
+
+---
+
+## 1. Normal cheat sheet
+
+### Input
+
+```markdown
+# Secrets Management Cheat Sheet
+
+## Introduction
+Storage guidance.
+
+## Architectural Patterns
+Use vaults and environment isolation.
+```
+
+### Output
+
+```json
+{
+  "source": "owasp_cheatsheets",
+  "source_id": "Secrets_Management_Cheat_Sheet",
+  "title": "Secrets Management Cheat Sheet",
+  "hyperlink": "https://cheatsheetseries.owasp.org/cheatsheets/Secrets_Management_Cheat_Sheet.html",
+  "summary": "Storage guidance.",
+  "headings": ["Introduction", "Architectural Patterns"],
+  "raw_markdown_path": "cheatsheets/Secrets_Management_Cheat_Sheet.md",
+  "category_hints": [],
+  "metadata": {
+    "parser_version": "v1",
+    "fallback_used": "false"
+  }
+}
+```
+
+### Notes
+
+* No fallback logic was required.
+
+---
+
+## 2. Missing H1 (fallback title)
+
+### Input
+
+```markdown
+## Introduction
+No H1 present.
+
+## Details
+More content.
+```
+
+### Output
+
+```json
+{
+  "source": "owasp_cheatsheets",
+  "source_id": "Example_Cheat_Sheet",
+  "title": "No title found.",
+  "hyperlink": "https://cheatsheetseries.owasp.org/cheatsheets/Example_Cheat_Sheet.html",
+  "summary": "No H1 present.",
+  "headings": ["Introduction", "Details"],
+  "raw_markdown_path": "cheatsheets/Example_Cheat_Sheet.md",
+  "category_hints": [],
+  "metadata": {
+    "parser_version": "v1",
+    "fallback_used": "true"
+  }
+}
+```
+
+### Notes
+
+* No H1 title exists, so the title defaults to `"No title found."`
+
+---
+
+## 3. Missing Introduction section (summary fallback)
+
+### Input
+
+```markdown
+# Single Heading Cheat Sheet
+
+## Authentication
+
+### Storage
+Secrets should be encrypted.
+```
+
+### Output
+
+```json
+{
+  "source": "owasp_cheatsheets",
+  "source_id": "Single_Heading_Cheat_Sheet",
+  "title": "Single Heading Cheat Sheet",
+  "hyperlink": "https://cheatsheetseries.owasp.org/cheatsheets/Single_Heading_Cheat_Sheet.html",
+  "summary": "Secrets should be encrypted.",
+  "headings": ["Authentication"],
+  "raw_markdown_path": "cheatsheets/Single_Heading_Cheat_Sheet.md",
+  "category_hints": [],
+  "metadata": {
+    "parser_version": "v1",
+    "fallback_used": "true"
+  }
+}
+```
+
+### Notes
+
+* No `Introduction` heading exists, so summary fallback logic is used.
+* The fallback scans all headings and returns the first non-empty body it
+  finds — in this case the content beneath `### Storage`.
+* Only `##`-level headings appear in `headings` — `### Storage` is excluded.
+
+---
+
+## Additional behavior notes
+
+It is ensured that markdown files with malformed title/headings such as:
+
+* With leading whitespace (e.g. `   # My Title`, `   ## My Heading`)
+* No space after the marker (e.g. `##Authentication`)
+
+are extracted correctly.
+
+---
\ No newline at end of file

From 8a3ac8feb0a90363472ffcd9177e68357ff2da5c Mon Sep 17 00:00:00 2001
From: Abhijeet <abhijeetsaharan2236@gmail.com>
Date: Sun, 14 Jun 2026 14:49:05 +0530
Subject: [PATCH 8/8] docs: refine malformed heading behavior notes

Signed-off-by: Abhijeet <abhijeetsaharan2236@gmail.com>
---
 docs/rfc-structured-extraction.md | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/docs/rfc-structured-extraction.md b/docs/rfc-structured-extraction.md
index cb2913b34..06296f5ec 100644
--- a/docs/rfc-structured-extraction.md
+++ b/docs/rfc-structured-extraction.md
@@ -248,11 +248,9 @@ Secrets should be encrypted.
 
 ## Additional behavior notes
 
-It is ensured that markdown files with malformed title/headings such as:
+The extractor correctly handles markdown files with malformed titles/headings such as:
 
-* With leading whitespace (e.g. `   # My Title`, `   ## My Heading`)
+* Titles with leading whitespace
 * No space after the marker (e.g. `##Authentication`)
 
-are extracted correctly.
-
----
\ No newline at end of file
+---