Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
169 changes: 169 additions & 0 deletions application/tests/cheatsheet_extractor_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
import unittest
from application.utils.external_project_parsers.parsers.cheatsheet_extractor import (
extract_cheatsheet_record,
)
from application.defs.cheatsheet_defs import SUMMARY_MAX_LENGTH

# Path handed to the extractor in every test below; the tests expect the
# source id and the canonical hyperlink to be derived from it.
SOURCE_PATH = "cheatsheets/Secrets_Management_Cheat_Sheet.md"
# File name of SOURCE_PATH without directory or ".md" extension.
EXPECTED_SOURCE_ID = "Secrets_Management_Cheat_Sheet"
# Published page for the same file name, with an ".html" extension.
EXPECTED_HYPERLINK = (
"https://cheatsheetseries.owasp.org/cheatsheets/Secrets_Management_Cheat_Sheet.html"
)

# Well-formed fixture: an H1 title, an "Introduction" section with body text,
# and one further H2 section.
NORMAL_MD = """\
# Secrets Management Cheat Sheet

## Introduction
Storage guidance.

## Architectural Patterns
Use vaults and environment isolation.
"""

# Fixture without any H1 line; TestMissingH1 expects the fallback title while
# the summary still comes from the "Introduction" section.
MISSING_H1_MD = """\
## Introduction
No H1 present.

## Details
More content.
"""

# Empty input; TestEmptyMarkdown expects both the fallback title and the
# terminal "No summary found." summary.
EMPTY_MD = ""

# No ## Introduction heading exists, so summary extraction
# falls back to the first available body content.
BODY_UNDER_H1_MD = """\
# Single Heading Cheat Sheet

Body text directly under H1, no subheadings at all.
"""

# Leading whitespace before H1 and malformed ## headings
# should still be normalized and extracted correctly.
# "##malformed" has no space between the hashes and the heading text.
# NOTE(review): no leading whitespace is visible before the H1 in this
# fixture as shown here - confirm it was not lost.
MALFORMED_MD = """\
# Malformed Title

##malformed

## Introduction
Some intro text.

## Valid Heading
"""


class TestNormal(unittest.TestCase):
    """Extraction from a well-formed cheatsheet (H1 plus Introduction)."""

    def setUp(self):
        # Each test works on a freshly extracted record.
        self.record = extract_cheatsheet_record(NORMAL_MD, SOURCE_PATH)

    # The source-derived fields checked first are expected to be
    # deterministic and independent of the markdown content.
    def test_source(self):
        source = self.record.source
        self.assertEqual(source, "owasp_cheatsheets")

    def test_source_id(self):
        source_id = self.record.source_id
        self.assertEqual(source_id, EXPECTED_SOURCE_ID)

    def test_hyperlink(self):
        hyperlink = self.record.hyperlink
        self.assertEqual(hyperlink, EXPECTED_HYPERLINK)

    def test_raw_markdown_path(self):
        markdown_path = self.record.raw_markdown_path
        self.assertEqual(markdown_path, SOURCE_PATH)

    def test_title(self):
        title = self.record.title
        self.assertEqual(title, "Secrets Management Cheat Sheet")

    def test_summary(self):
        summary = self.record.summary
        self.assertEqual(summary, "Storage guidance.")

    def test_summary_bounded(self):
        # Truncation itself is enforced centrally via
        # CheatsheetRecord.__post_init__; only the bound is checked here.
        summary_length = len(self.record.summary)
        self.assertLessEqual(summary_length, SUMMARY_MAX_LENGTH)

    def test_headings(self):
        for expected in ("Introduction", "Architectural Patterns"):
            self.assertIn(expected, self.record.headings)

    def test_fallback_not_used(self):
        fallback_flag = self.record.metadata["fallback_used"]
        self.assertEqual(fallback_flag, "false")


class TestMissingH1(unittest.TestCase):
    """Extraction from markdown that has H2 sections but no H1 title."""

    def setUp(self):
        # Each test works on a freshly extracted record.
        self.record = extract_cheatsheet_record(MISSING_H1_MD, SOURCE_PATH)

    def test_title_is_fallback(self):
        title = self.record.title
        self.assertEqual(title, "No title found.")

    def test_summary_from_introduction(self):
        # Compared case-insensitively against the Introduction body text.
        lowered_summary = self.record.summary.lower()
        self.assertIn("no h1", lowered_summary)

    def test_headings_extracted(self):
        for expected in ("Introduction", "Details"):
            self.assertIn(expected, self.record.headings)

    def test_fallback_used(self):
        fallback_flag = self.record.metadata["fallback_used"]
        self.assertEqual(fallback_flag, "true")


class TestEmptyMarkdown(unittest.TestCase):
    """Extraction from an empty markdown string."""

    def setUp(self):
        # Each test works on a freshly extracted record.
        self.record = extract_cheatsheet_record(EMPTY_MD, SOURCE_PATH)

    def test_title_is_fallback(self):
        title = self.record.title
        self.assertEqual(title, "No title found.")

    def test_summary_no_summary_found(self):
        # With nothing to extract, the terminal summary fallback applies.
        summary = self.record.summary
        self.assertEqual(summary, "No summary found.")

    def test_headings_empty(self):
        headings = self.record.headings
        self.assertEqual(headings, [])

    def test_fallback_used(self):
        fallback_flag = self.record.metadata["fallback_used"]
        self.assertEqual(fallback_flag, "true")


class TestBodyUnderH1(unittest.TestCase):
    """Extraction from markdown with an H1, body text, and no H2 sections."""

    def setUp(self):
        # Each test works on a freshly extracted record.
        self.record = extract_cheatsheet_record(BODY_UNDER_H1_MD, SOURCE_PATH)

    def test_title(self):
        title = self.record.title
        self.assertEqual(title, "Single Heading Cheat Sheet")

    def test_summary_from_fallback_via_h1(self):
        # The summary fallback should pick up the body beneath the H1.
        lowered_summary = self.record.summary.lower()
        self.assertIn("body text", lowered_summary)

    def test_headings_empty(self):
        # Without any valid ## heading the headings list stays empty.
        headings = self.record.headings
        self.assertEqual(headings, [])

    def test_fallback_used(self):
        fallback_flag = self.record.metadata["fallback_used"]
        self.assertEqual(fallback_flag, "true")


class TestMalformedHeadings(unittest.TestCase):
    """Extraction from markdown containing a heading without a space after ##."""

    def setUp(self):
        # Each test works on a freshly extracted record.
        self.record = extract_cheatsheet_record(MALFORMED_MD, SOURCE_PATH)

    def test_malformed_h1_extracted(self):
        title = self.record.title
        self.assertEqual(title, "Malformed Title")

    def test_malformed_h2_in_headings(self):
        # "##malformed" must still be reported as the heading "malformed".
        headings = self.record.headings
        self.assertIn("malformed", headings)

    def test_valid_headings_also_extracted(self):
        for expected in ("Introduction", "Valid Heading"):
            self.assertIn(expected, self.record.headings)

    def test_summary_from_introduction(self):
        lowered_summary = self.record.summary.lower()
        self.assertIn("intro", lowered_summary)

    def test_fallback_not_used(self):
        fallback_flag = self.record.metadata["fallback_used"]
        self.assertEqual(fallback_flag, "false")


# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
Original file line number Diff line number Diff line change
@@ -1,16 +1,27 @@
import logging
import os
import re

from application.defs.cheatsheet_defs import CheatsheetRecord

PARSER_VERSION = "v1"
FALLBACK_USED = "false"

CANONICAL_BASE_URL = "https://cheatsheetseries.owasp.org/cheatsheets/"

_TITLE_RE = re.compile(r"^#\s+(?P<title>.+)$", re.MULTILINE)
_HEADING_RE = re.compile(r"^##\s+(?P<heading>.+)$", re.MULTILINE)
_ANY_HEADING_RE = re.compile(r"^#{1,6}\s+.+$", re.MULTILINE)
_TITLE_RE = re.compile(
r"^\s*#(?!#)\s*(?P<title>.+?)$",
re.MULTILINE,
)

_HEADING_RE = re.compile(
r"^\s*##(?!#)\s*(?P<heading>.+?)$",
re.MULTILINE,
)

_ANY_HEADING_RE = re.compile(
r"^\s*#{1,6}(?!#)\s*.+?$",
re.MULTILINE,
)


def _derive_source_id(source_path: str) -> str:
Expand Down Expand Up @@ -41,28 +52,44 @@ def _extract_body_after_heading(markdown: str, heading_match: re.Match) -> str:


def _extract_summary(markdown: str) -> str:
    """Extract the summary from the "Introduction" section of cheatsheet markdown.

    Args:
        markdown: Raw cheatsheet markdown.

    Returns:
        The body extracted under the first heading named "Introduction"
        (any heading level, compared case-insensitively) that yields a
        non-empty body.

    Raises:
        ValueError: If no "Introduction" heading yields a non-empty body.
            ``extract_cheatsheet_record`` catches this and switches to
            ``_fallback_summary``.
    """
    for match in _ANY_HEADING_RE.finditer(markdown):
        # The matched text may begin with whitespace consumed by the
        # pattern, so strip before removing the leading hashes.
        heading_text = match.group().strip().lstrip("#").strip()
        if heading_text.lower() != "introduction":
            continue

        body = _extract_body_after_heading(markdown, match)
        if body:
            return body

    raise ValueError(
        "_extract_summary: no suitable summary section could be extracted from markdown."
    )

def _extract_title(markdown: str) -> str:
    """Return the text of the first H1 heading in the markdown.

    Raises:
        ValueError: If the markdown contains no H1 heading.
    """
    found = _TITLE_RE.search(markdown)
    if found is None:
        raise ValueError("_extract_title: no title found in markdown.")

    raw_title = found.group("title")
    return raw_title.strip()


def _fallback_title() -> str:
"""Return fallback title for malformed markdown."""

return "No title found."


def _fallback_summary(markdown: str) -> str:
    """Return a best-effort summary when no Introduction summary is available.

    Args:
        markdown: Raw cheatsheet markdown.

    Returns:
        The first non-empty body extracted after any heading, scanning
        headings in document order, or ``"No summary found."`` when no
        heading yields a body (for example, for empty markdown).
    """
    for match in _ANY_HEADING_RE.finditer(markdown):
        body = _extract_body_after_heading(markdown, match)
        if body:
            return body

    # Terminal fallback: return the placeholder instead of raising, so the
    # caller can still build a record.
    return "No summary found."


def extract_cheatsheet_record(
Expand All @@ -71,12 +98,24 @@ def extract_cheatsheet_record(
) -> CheatsheetRecord:
"""Extract a structured CheatsheetRecord from markdown content."""

title_match = _TITLE_RE.search(markdown)
title = title_match.group("title").strip()
fallback_used = "false"

try:
title = _extract_title(markdown)
except ValueError as e:
logging.warning(str(e))
title = _fallback_title()
fallback_used = "true"

# Headings can be empty.
headings = [m.group("heading").strip() for m in _HEADING_RE.finditer(markdown)]

summary = _extract_summary(markdown)
try:
summary = _extract_summary(markdown)
except ValueError as e:
logging.warning(str(e))
summary = _fallback_summary(markdown)
fallback_used = "true"

source_id = _derive_source_id(source_path)
hyperlink = _derive_hyperlink(source_path)
Expand All @@ -91,6 +130,6 @@ def extract_cheatsheet_record(
category_hints=[],
metadata={
"parser_version": PARSER_VERSION,
"fallback_used": FALLBACK_USED,
"fallback_used": fallback_used,
},
)
Loading
Loading