Skip to content

Commit c89c489

Browse files
Add scripts files
1 parent b31b117 commit c89c489

2 files changed

Lines changed: 819 additions & 0 deletions

File tree

scripts/analyze_codee_reports.py

Lines changed: 389 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,389 @@
1+
#!/usr/bin/env python3
2+
"""Analyze Codee JSON or HTML reports over time and generate time-series visualizations."""
3+
4+
import argparse
5+
import json
6+
import logging
7+
import re
8+
import sys
9+
import pandas as pd
10+
from collections import defaultdict
11+
from datetime import datetime
12+
from pathlib import Path
13+
14+
15+
def setup_logging() -> logging.Logger:
    """Configure and return the script's logger.

    Returns the "codee_analyzer" logger, set to INFO, writing to stderr.
    The handler is attached only when none exists yet, so calling this
    more than once no longer duplicates every log line (the original
    added a fresh handler on each call).
    """
    logger = logging.getLogger("codee_analyzer")
    logger.setLevel(logging.INFO)
    if not logger.handlers:  # avoid duplicate handlers on repeated calls
        handler = logging.StreamHandler(sys.stderr)
        handler.setFormatter(
            logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
        )
        logger.addHandler(handler)
    return logger
23+
24+
25+
def load_json_file(file_path: Path, logger: logging.Logger) -> dict | None:
26+
"""Load and parse a JSON file, returning None on failure."""
27+
try:
28+
with open(file_path, "r", encoding="utf-8") as f:
29+
return json.load(f)
30+
except json.JSONDecodeError as e:
31+
logger.warning(f"Malformed JSON in {file_path.name}: {e}")
32+
return None
33+
except OSError as e:
34+
logger.warning(f"Failed to read {file_path.name}: {e}")
35+
return None
36+
37+
38+
def load_html_report(report_dir: Path, logger: logging.Logger) -> dict | None:
39+
"""Load a Codee HTML report by parsing report.js file."""
40+
report_js_path = report_dir / "report.js"
41+
if not report_js_path.exists():
42+
logger.warning(f"report.js not found in {report_dir.name}")
43+
return None
44+
45+
try:
46+
content = report_js_path.read_text(encoding="utf-8")
47+
match = re.search(r"const\s+report\s*=\s*(\{.*\});", content, re.DOTALL)
48+
if not match:
49+
match = re.search(r"var\s+report\s*=\s*(\{.*\});", content, re.DOTALL)
50+
51+
if not match:
52+
logger.warning(f"Could not find report object in {report_js_path}")
53+
return None
54+
55+
json_str = match.group(1)
56+
return json.loads(json_str)
57+
except Exception as e:
58+
logger.warning(f"Failed to parse report.js in {report_dir.name}: {e}")
59+
return None
60+
61+
62+
def extract_timestamp(report: dict, logger: logging.Logger) -> int | None:
63+
"""Extract timestamp from report."""
64+
try:
65+
exec_info = report.get("CodeeExecutionInfo", {})
66+
ts_utc = exec_info.get("TimestampUTC")
67+
if ts_utc:
68+
dt = datetime.fromisoformat(ts_utc.replace("Z", "+00:00"))
69+
return int(dt.timestamp())
70+
return exec_info.get("TimestampEpochSeconds")
71+
except Exception as e:
72+
logger.warning(f"Failed to extract timestamp: {e}")
73+
return None
74+
75+
76+
def extract_checker_counts(report: dict, logger: logging.Logger) -> dict[str, int]:
    """Extract per-checker finding counts from the Screening rankings.

    Reads both the Quality and Optimization checker tables, skipping the
    synthetic "Total" row.  Rows whose "#" column is not an integer are
    logged and ignored.  Returns {checker name: count}.
    """
    counts: dict[str, int] = {}
    try:
        screening = report.get("Screening", {})
        # Both ranking tables share the same row schema, so walk them with
        # one loop instead of two copy-pasted blocks.
        for table_name in (
            "Ranking of Quality Checkers",
            "Ranking of Optimization Checkers",
        ):
            for row in screening.get(table_name, {}).get("DataTable", []):
                checker = row.get("Checker", "")
                if not checker or checker == "Total":
                    continue
                try:
                    counts[checker] = int(row.get("#", "0"))
                except ValueError:
                    logger.warning(f"Invalid count for checker {checker}")
    except Exception as e:
        logger.warning(f"Failed to extract checker counts: {e}")
    return counts
109+
110+
111+
def extract_checker_priorities(report: dict, logger: logging.Logger) -> dict[str, str]:
    """Extract the priority label for each checker in the Screening rankings.

    Reads both the Quality and Optimization checker tables, skipping the
    synthetic "Total" row and rows without a "Priority" value.  Returns
    {checker name: priority string, e.g. "P18 (L1)"}.
    """
    priorities: dict[str, str] = {}
    try:
        screening = report.get("Screening", {})
        # Same dedup as extract_checker_counts: both tables share one schema.
        for table_name in (
            "Ranking of Quality Checkers",
            "Ranking of Optimization Checkers",
        ):
            for row in screening.get(table_name, {}).get("DataTable", []):
                checker = row.get("Checker", "")
                if not checker or checker == "Total":
                    continue
                priority = row.get("Priority", "")
                if priority:
                    priorities[checker] = priority
    except Exception as e:
        logger.warning(f"Failed to extract checker priorities: {e}")
    return priorities
142+
143+
144+
def extract_l_level(priority_str: str) -> str:
    """Map a priority string like 'P18 (L1)' to its L-level label.

    Levels are matched by substring, L1 through L4 in order; empty input
    or an unrecognized string yields "Unknown".
    """
    if priority_str:
        for level in ("L1", "L2", "L3", "L4"):
            if level in priority_str:
                return level
    return "Unknown"
157+
158+
159+
def detect_input_type(input_dir: Path) -> str:
    """Classify *input_dir* as "html", "json", or "unknown" (recursive scan).

    HTML takes precedence over JSON when both kinds of files are present,
    matching the original behavior.  Uses ``next()`` on the recursive
    glob iterators so the directory walk stops at the first match instead
    of materializing every path in the tree.
    """
    if next(input_dir.rglob("report.js"), None) is not None:
        return "html"
    if next(input_dir.rglob("*.json"), None) is not None:
        return "json"
    return "unknown"
169+
170+
171+
def load_reports(
    input_dir: Path, logger: logging.Logger
) -> list[tuple[int, dict, Path | None]]:
    """Load all reports from *input_dir* recursively, sorted by timestamp.

    Returns a list of (timestamp, report_dict, path) tuples, where *path*
    is the JSON file or the HTML report directory (used for linking back
    to the original report).  Reports that fail to parse or lack a valid
    timestamp are skipped with a warning.

    The per-report validation (parse check, timestamp check, append) was
    duplicated across the HTML and JSON branches in the original; here
    each branch only gathers (path, parsed-report) pairs and one shared
    loop does the rest.
    """
    reports: list[tuple[int, dict, Path | None]] = []
    input_type = detect_input_type(input_dir)

    if input_type == "html":
        logger.info("Detected HTML format (recursive search for report.js)")
        # One entry per report.js; the enclosing directory is what we link to.
        sources = [
            (js.parent, load_html_report(js.parent, logger))
            for js in sorted(input_dir.rglob("report.js"))
        ]
    elif input_type == "json":
        logger.info("Detected JSON format (recursive search for *.json)")
        json_files = sorted(input_dir.rglob("*.json"))
        if not json_files:
            logger.warning(f"No JSON files found in {input_dir}")
            return reports
        sources = [(p, load_json_file(p, logger)) for p in json_files]
    else:
        logger.error(f"No valid reports found in {input_dir}")
        return reports

    for path, report in sources:
        if report is None:
            continue
        timestamp = extract_timestamp(report, logger)
        if timestamp is None:
            logger.warning(f"Skipping {path.name}: no valid timestamp")
            continue
        reports.append((timestamp, report, path))

    reports.sort(key=lambda item: item[0])
    return reports
223+
224+
225+
def build_checker_dataframe(
    reports: list[tuple[int, dict, Path | None]], logger: logging.Logger
) -> pd.DataFrame:
    """Build a time-indexed DataFrame of per-checker finding counts.

    Index: report timestamps converted to datetime (named "timestamp").
    Columns: every checker seen in any report, sorted by name; a checker
    absent from a given report contributes 0 for that row.
    """
    # First pass: discover the full column set across all reports.
    seen: set[str] = set()
    for _, report, _ in reports:
        seen.update(extract_checker_counts(report, logger))
    columns = sorted(seen)

    # Second pass: one row dict per report, zero-filling absent checkers.
    timestamps = [ts for ts, _, _ in reports]
    rows: list[dict[str, int]] = []
    for _, report, _ in reports:
        counts = extract_checker_counts(report, logger)
        rows.append({name: counts.get(name, 0) for name in columns})

    df = pd.DataFrame(
        rows, columns=columns, index=pd.to_datetime(timestamps, unit="s")
    )
    df.index.name = "timestamp"
    return df
248+
249+
250+
def get_checker_priorities_for_df(
    df: pd.DataFrame,
    reports: list[tuple[int, dict, Path | None]],
    logger: logging.Logger,
) -> dict[str, str]:
    """Merge priority labels from every report into one mapping.

    Later reports win on conflicting checkers, matching the original
    update order.  *df* is accepted for interface compatibility but the
    mapping is derived entirely from *reports*.
    """
    merged: dict[str, str] = {}
    for _, report, _ in reports:
        merged.update(extract_checker_priorities(report, logger))
    return merged
263+
264+
265+
def generate_html_report(
    df: pd.DataFrame,
    reports: list[tuple[int, dict, Path | None]],
    input_dir: Path,
    output_dir: Path,
    logger: logging.Logger,
) -> None:
    """Render the interactive Chart.js dashboard to output_dir/codee_analysis.html.

    Fills the ``templates/codee_report.html`` template (located next to
    this script) with JSON chart data: total findings per report date,
    one series per checker, per-L-level stacked counts, and links back
    to the originating reports.
    """
    logger.info("Generating interactive HTML report")

    priorities = get_checker_priorities_for_df(df, reports, logger)
    labels = [ts.strftime("%Y-%m-%d") for ts in df.index]
    total_data = df.sum(axis=1).tolist()
    checker_data = {name: df[name].tolist() for name in df.columns}

    priority_order = ["L1", "L2", "L3", "L4", "Unknown"]
    priority_colors = {
        "L1": "#DC143C",
        "L2": "#FF8C00",
        "L3": "#228B22",
        "L4": "#90EE90",
        "Unknown": "#D3D3D3",
    }

    # Aggregate each row's checker counts into L-level buckets so the
    # chart can stack findings by priority level.
    l_level_data: dict[str, list[int]] = {lvl: [] for lvl in priority_order}
    for ts in df.index:
        bucket = {lvl: 0 for lvl in priority_order}
        for checker, count in df.loc[ts].items():
            bucket[extract_l_level(priorities.get(checker, ""))] += count
        for lvl in priority_order:
            l_level_data[lvl].append(bucket[lvl])

    # Link each data point back to its source: the JSON file itself, or
    # the HTML report's index.html when present (directory otherwise).
    report_links: list[dict | None] = []
    for _, _, report_path in reports:
        if not report_path:
            report_links.append(None)
            continue
        resolved = report_path.resolve()
        if report_path.suffix == ".json":
            report_links.append({"link": str(resolved), "type": "json"})
            continue
        index_html = resolved / "index.html"
        target = index_html if index_html.exists() else resolved
        report_links.append({"link": str(target), "type": "html"})

    chart_data = {
        "labels": labels,
        "total": total_data,
        "checkers": checker_data,
        "priorities": l_level_data,
        "priorityColors": priority_colors,
        "reportLinks": report_links,
        "hasLinks": any(link is not None for link in report_links),
    }

    replacements = {
        "${REPORT_COUNT}": str(len(reports)),
        "${CHECKER_COUNT}": str(len(checker_data)),
        "${TOTAL_FINDINGS}": str(int(total_data[-1])),
        "${INPUT_DIR}": str(input_dir),
        "${CHART_DATA}": json.dumps(chart_data, indent=2),
    }

    template_path = Path(__file__).parent / "templates" / "codee_report.html"
    page = template_path.read_text(encoding="utf-8")
    for placeholder, value in replacements.items():
        page = page.replace(placeholder, value)

    html_path = output_dir / "codee_analysis.html"
    html_path.write_text(page, encoding="utf-8")
    logger.info(f"HTML report saved to {html_path}")
345+
346+
347+
def main() -> int:
    """CLI entry point: parse arguments, load reports, emit the dashboard.

    Returns a process exit code: 1 when the input directory does not
    exist; 0 otherwise, including the no-reports case (which only warns).
    """
    parser = argparse.ArgumentParser(
        description="Analyze Codee JSON or HTML reports and generate HTML visualizations."
    )
    positional = (
        (
            "input_dir",
            "Directory containing Codee report files (JSON or HTML with report.js)",
        ),
        (
            "output_dir",
            "Directory where HTML report will be saved",
        ),
    )
    for name, help_text in positional:
        parser.add_argument(name, type=Path, help=help_text)
    args = parser.parse_args()

    logger = setup_logging()

    # Fail fast on a missing input directory; the output dir is created.
    if not args.input_dir.is_dir():
        logger.error(f"Input directory does not exist: {args.input_dir}")
        return 1
    args.output_dir.mkdir(parents=True, exist_ok=True)

    logger.info(f"Loading reports from {args.input_dir}")
    reports = load_reports(args.input_dir, logger)
    if not reports:
        logger.warning("No valid reports loaded. Exiting.")
        return 0
    logger.info(f"Loaded {len(reports)} reports")

    logger.info("Building checker DataFrame")
    df = build_checker_dataframe(reports, logger)

    generate_html_report(df, reports, args.input_dir, args.output_dir, logger)
    return 0
386+
387+
388+
# Run as a script: propagate main()'s return value as the exit code.
if __name__ == "__main__":
    sys.exit(main())

0 commit comments

Comments
 (0)