sql-tutorial/compile_exercises.py at main · manijang2/sql-tutorial · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
#!/usr/bin/env python3
"""Compile exercise YAML files into mkdocs markdown and exercise.db.

Usage:
    # Compile all exercises
    python compile_exercises.py

    # Compile and generate expected results from tutorial DB
    python compile_exercises.py --tutorial-db output/ecommerce.db

    # Validate only (no output)
    python compile_exercises.py --validate-only

    # Compile single file
    python compile_exercises.py --file exercises/beginner/01-select.yaml
"""

import argparse
import hashlib
import json
import os
import sqlite3
import sys
from pathlib import Path

import yaml


# Default locations, all relative to the current working directory.
_DOCS_ROOT = Path("docs")

# Tree that is searched recursively for exercise *.yaml files.
EXERCISES_DIR = Path("exercises")
# Destination folders for the generated Korean / English markdown pages.
DOCS_KO_DIR = _DOCS_ROOT / "ko" / "exercises"
DOCS_EN_DIR = _DOCS_ROOT / "en" / "exercises"
# Default value of the --output-db option.
OUTPUT_DB = Path("output") / "exercise.db"


def load_yaml(path: Path) -> dict:
    """Parse a YAML exercise file and return its top-level mapping.

    Args:
        path: Location of the YAML file; it is read as UTF-8.

    Returns:
        The mapping found at the top level of the document.

    Raises:
        ValueError: If the document is empty or its top level is not a
            mapping. Errors from opening or parsing the file propagate
            unchanged.
    """
    with open(path, encoding="utf-8") as f:
        data = yaml.safe_load(f)

    # safe_load yields None for an empty document and a list/scalar for
    # non-mapping documents. Callers immediately use .get() on the result, so
    # fail here with a message that names the offending file.
    if not isinstance(data, dict):
        raise ValueError(
            f"{path}: expected a YAML mapping at the top level, "
            f"got {type(data).__name__}"
        )
    return data


def create_exercise_db(db_path: Path):
    """Build a fresh exercise.db at ``db_path`` and return the open connection.

    Missing parent directories are created and any existing file at
    ``db_path`` is removed first, so the returned database contains only the
    empty schema defined below.
    """
    schema = """
        CREATE TABLE exercise_sets (
            id              TEXT PRIMARY KEY,
            title           TEXT NOT NULL,
            title_en        TEXT,
            difficulty      TEXT NOT NULL,
            concepts        TEXT NOT NULL,
            prerequisites   TEXT,
            estimated_minutes INTEGER,
            sort_order      INTEGER NOT NULL,
            created_at      TEXT NOT NULL DEFAULT (datetime('now'))
        );

        CREATE TABLE problems (
            id              TEXT PRIMARY KEY,
            exercise_id     TEXT NOT NULL REFERENCES exercise_sets(id),
            question        TEXT NOT NULL,
            question_en     TEXT,
            level           INTEGER DEFAULT 3,
            type            TEXT DEFAULT 'SELECT',
            reference_sql_common TEXT,
            reference_sql_sqlite TEXT,
            reference_sql_mysql TEXT,
            reference_sql_postgresql TEXT,
            supported_db    TEXT NOT NULL DEFAULT '["sqlite","mysql","postgresql"]',
            validation_json TEXT NOT NULL,
            hints_json      TEXT,
            rubric          TEXT,
            rubric_en       TEXT,
            max_score       INTEGER DEFAULT 10,
            tags_json       TEXT,
            sort_order      INTEGER NOT NULL,
            expected_columns TEXT,
            expected_row_count INTEGER,
            expected_hash   TEXT
        );

        CREATE TABLE exercise_tags (
            tag             TEXT PRIMARY KEY,
            category        TEXT NOT NULL
        );

        CREATE TABLE problem_tags (
            problem_id      TEXT NOT NULL REFERENCES problems(id),
            tag             TEXT NOT NULL,
            PRIMARY KEY (problem_id, tag)
        );

        CREATE TABLE attempts (
            id              INTEGER PRIMARY KEY AUTOINCREMENT,
            problem_id      TEXT NOT NULL REFERENCES problems(id),
            user_sql        TEXT NOT NULL,
            syntax_valid    INTEGER NOT NULL,
            columns_match   INTEGER NOT NULL,
            row_count_match INTEGER NOT NULL,
            data_match      INTEGER NOT NULL,
            result_hash     TEXT,
            det_score       INTEGER NOT NULL,
            ai_score        INTEGER,
            ai_feedback     TEXT,
            total_score     INTEGER NOT NULL,
            execution_ms    INTEGER,
            row_count       INTEGER,
            attempted_at    TEXT NOT NULL DEFAULT (datetime('now'))
        );

        CREATE TABLE progress (
            problem_id      TEXT PRIMARY KEY REFERENCES problems(id),
            best_score      INTEGER NOT NULL DEFAULT 0,
            attempt_count   INTEGER NOT NULL DEFAULT 0,
            completed       INTEGER NOT NULL DEFAULT 0,
            first_solved_at TEXT,
            last_attempt_at TEXT NOT NULL DEFAULT (datetime('now'))
        );

        CREATE TABLE badges (
            id              TEXT PRIMARY KEY,
            name            TEXT NOT NULL,
            name_en         TEXT,
            description     TEXT NOT NULL,
            description_en  TEXT,
            icon            TEXT,
            condition_sql   TEXT NOT NULL,
            earned_at       TEXT
        );

        CREATE INDEX idx_problems_exercise_id ON problems(exercise_id);
        CREATE INDEX idx_attempts_problem_id ON attempts(problem_id);
    """

    # Start from a clean slate: ensure the folder exists, drop any old file.
    db_path.parent.mkdir(parents=True, exist_ok=True)
    if db_path.exists():
        db_path.unlink()

    connection = sqlite3.connect(str(db_path))
    connection.executescript(schema)
    return connection


def compute_expected(conn_tutorial, sql: str) -> tuple:
    """Run reference SQL and summarise the result set it produces.

    Returns a ``(columns_json, row_count, result_hash)`` tuple, where
    ``columns_json`` is the JSON-encoded list of column names and
    ``result_hash`` is a SHA-256 hex digest of the stringified rows in sorted
    order. If anything raises, a warning is printed and
    ``(None, None, None)`` is returned.
    """
    try:
        cur = conn_tutorial.execute(sql)
        description = cur.description
        column_names = [] if not description else [col[0] for col in description]
        fetched = cur.fetchall()

        # Sort the stringified rows so the digest ignores row order.
        canonical = [str(row) for row in fetched]
        canonical.sort()
        digest = hashlib.sha256("\n".join(canonical).encode()).hexdigest()

        return json.dumps(column_names), len(fetched), digest
    except Exception as e:
        print(f"    WARNING: SQL execution failed: {e}")
        return None, None, None


# Code inside a content tab is nested two levels deep (admonition + tab), so
# continuation lines need eight spaces instead of _indent's default four.
_TAB_CODE_INDENT = " " * 8


def _resolve_reference_sql(prob: dict) -> tuple:
    """Return ``(common, sqlite, mysql, postgresql)`` reference SQL for a problem."""
    ref_sql = prob.get("reference_sql", {})
    if ref_sql is None:
        # "reference_sql:" with no value parses as None; treat it as absent.
        ref_sql = {}
    if isinstance(ref_sql, str):
        return ref_sql, None, None, None
    return (
        ref_sql.get("common") or ref_sql.get("all"),
        ref_sql.get("sqlite"),
        ref_sql.get("mysql"),
        ref_sql.get("postgresql"),
    )


def _answer_markdown(label: str, ref_common, ref_sqlite, ref_mysql, ref_pg) -> list:
    """Return the markdown fragments for the collapsible answer block."""
    if ref_sqlite and ref_mysql and ref_pg:
        # One tab per dialect when all three variants are provided.
        parts = [f'\n??? success "{label}"\n']
        for tab, sql in (("SQLite", ref_sqlite), ("MySQL", ref_mysql), ("PostgreSQL", ref_pg)):
            parts.append(
                f'    === "{tab}"\n        ```sql\n'
                f'        {_indent(sql, _TAB_CODE_INDENT)}\n        ```\n'
            )
        return parts

    answer_sql = ref_common or ref_sqlite or ""
    return [f'\n??? success "{label}"\n    ```sql\n    {_indent(answer_sql)}\n    ```\n']


def compile_yaml_file(yaml_path: Path, conn_db, conn_tutorial, sort_base: int) -> dict:
    """Compile a single YAML file into exercise.db rows and mkdocs markdown.

    Args:
        yaml_path: Exercise YAML file to load.
        conn_db: Open connection to exercise.db. Rows are inserted here but
            not committed.
        conn_tutorial: Connection used to execute the reference SQL for the
            expected columns / row count / hash, or a falsy value to skip
            that step.
        sort_base: Ordering value stored for the exercise set. Each problem
            gets ``sort_base * 100 + position`` (1-based) as its sort order.

    Returns:
        Dict with ``exercise_id``, ``md_ko`` and ``md_en`` (the rendered
        markdown documents) and ``problem_count``.

    Raises:
        KeyError: If the metadata or a problem has no ``id``. Errors raised
            by the database inserts propagate unchanged.
    """
    data = load_yaml(yaml_path)
    meta = data.get("metadata", {})
    problems = data.get("problems", [])

    exercise_id = meta["id"]
    print(f"  [{exercise_id}] {meta.get('title', '')} ({len(problems)} problems)")

    # Insert exercise set
    conn_db.execute(
        "INSERT INTO exercise_sets (id, title, title_en, difficulty, concepts, prerequisites, estimated_minutes, sort_order) VALUES (?,?,?,?,?,?,?,?)",
        (
            exercise_id,
            meta.get("title", ""),
            meta.get("title_en", ""),
            meta.get("difficulty", "beginner"),
            json.dumps(meta.get("concepts", []), ensure_ascii=False),
            json.dumps(meta.get("prerequisites", []), ensure_ascii=False),
            meta.get("estimated_minutes"),
            sort_base,
        ),
    )

    # Build mkdocs markdown (ko + en)
    md_ko_lines = [f"# {meta.get('title', exercise_id)}\n"]
    md_en_lines = [f"# {meta.get('title_en', exercise_id)}\n"]

    desc_ko = meta.get("description", "")
    desc_en = meta.get("description_en", "")
    if desc_ko:
        md_ko_lines.append(f"{desc_ko}\n\n---\n")
    if desc_en:
        md_en_lines.append(f"{desc_en}\n\n---\n")

    for i, prob in enumerate(problems):
        pid = prob["id"]
        num = i + 1
        sort_order = sort_base * 100 + num

        ref_common, ref_sqlite, ref_mysql, ref_pg = _resolve_reference_sql(prob)
        supported = prob.get("supported_db", ["sqlite", "mysql", "postgresql"])

        # Expected results come from the SQLite variant when there is one,
        # because the tutorial DB is opened through sqlite3.
        exec_sql = ref_sqlite or ref_common
        exp_cols, exp_rows, exp_hash = None, None, None
        if conn_tutorial and exec_sql:
            exp_cols, exp_rows, exp_hash = compute_expected(conn_tutorial, exec_sql.strip())

        # Hints: accept a "hints" list or single "hint"/"hint_en" strings.
        # "or []" also covers an explicit null, which would break iteration.
        hints = prob.get("hints") or []
        if not hints:
            hint_ko = prob.get("hint", "")
            hint_en = prob.get("hint_en", "")
            if hint_ko:
                hints = [{"ko": hint_ko, "en": hint_en or hint_ko}]
        hints_json = json.dumps(hints, ensure_ascii=False) if hints else None

        validation = prob.get("validation", {"type": "result_match"})

        # Problem-level settings win over the exercise-set defaults.
        prob_level = prob.get("level", meta.get("level", 3))
        prob_type = prob.get("type", meta.get("type", "SELECT"))
        prob_tags = prob.get("tags") or []

        # Support both "question" and "body" keys for the problem text.
        question_ko = prob.get("question", "") or prob.get("body", "")
        question_en = prob.get("question_en", "") or prob.get("body_en", "")

        # Insert problem
        conn_db.execute(
            """INSERT INTO problems (id, exercise_id, question, question_en,
               level, type,
               reference_sql_common, reference_sql_sqlite, reference_sql_mysql, reference_sql_postgresql,
               supported_db, validation_json, hints_json, rubric, rubric_en,
               max_score, tags_json, sort_order, expected_columns, expected_row_count, expected_hash)
               VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""",
            (
                pid, exercise_id,
                question_ko,
                question_en,
                prob_level, prob_type,
                ref_common, ref_sqlite, ref_mysql, ref_pg,
                json.dumps(supported),
                json.dumps(validation, ensure_ascii=False),
                hints_json,
                _to_str(prob.get("rubric", "")),
                _to_str(prob.get("rubric_en", "")),
                prob.get("max_score", 10),
                json.dumps(prob_tags, ensure_ascii=False),
                sort_order,
                exp_cols, exp_rows, exp_hash,
            ),
        )

        # Insert problem-tag mappings
        for tag in prob_tags:
            conn_db.execute(
                "INSERT OR IGNORE INTO problem_tags (problem_id, tag) VALUES (?,?)",
                (pid, tag),
            )

        # Generate markdown; the English page falls back to the Korean text.
        body_ko = question_ko
        body_en = question_en or body_ko
        title_ko = prob.get("title", "")
        title_en = prob.get("title_en", title_ko)

        # Heading: use title if available, otherwise first line of body
        heading_ko = title_ko or body_ko.strip().split("\n")[0][:60]
        heading_en = title_en or body_en.strip().split("\n")[0][:60]

        md_ko_lines.append(f"\n### {num}. {heading_ko}\n")
        md_ko_lines.append(f"\n{body_ko.strip()}\n")
        md_en_lines.append(f"\n### {num}. {heading_en}\n")
        md_en_lines.append(f"\n{body_en.strip()}\n")

        # Hints
        for hi, hint in enumerate(hints):
            if isinstance(hint, dict):
                hint_ko = hint.get("ko", "")
                hint_en = hint.get("en", hint_ko)
            else:
                hint_ko = hint_en = str(hint)
            md_ko_lines.append(f"\n**힌트 {hi+1}:** {hint_ko}\n")
            md_en_lines.append(f"\n**Hint {hi+1}:** {hint_en}\n")

        # Answer (collapsible)
        md_ko_lines.extend(_answer_markdown("정답", ref_common, ref_sqlite, ref_mysql, ref_pg))
        md_en_lines.extend(_answer_markdown("Answer", ref_common, ref_sqlite, ref_mysql, ref_pg))

        md_ko_lines.append("\n---\n")
        md_en_lines.append("\n---\n")

    return {
        "exercise_id": exercise_id,
        "md_ko": "\n".join(md_ko_lines),
        "md_en": "\n".join(md_en_lines),
        "problem_count": len(problems),
    }


def _to_str(val) -> str:
    """Return a text form of ``val``.

    ``None`` becomes the empty string, dicts and lists are serialised as JSON
    with non-ASCII characters kept as-is, and anything else goes through
    ``str()``.
    """
    if isinstance(val, (dict, list)):
        return json.dumps(val, ensure_ascii=False)
    return "" if val is None else str(val)


def _indent(sql: str, prefix: str = "    ") -> str:
    """Prefix every line after the first of the stripped SQL with ``prefix``.

    The first line is left bare because the caller's template already
    supplies its indentation.
    """
    return sql.strip().replace("\n", "\n" + prefix)


def main():
    """Command-line entry point.

    Parses the options, collects the exercise YAML files (a single ``--file``
    or every ``*.yaml`` under ``EXERCISES_DIR``) and then either validates
    them (``--validate-only``) or compiles them into the output database and
    the Korean/English markdown folders. A file that fails is reported and
    skipped; the remaining files are still processed.
    """
    parser = argparse.ArgumentParser(description="Compile exercise YAML to mkdocs + exercise.db")
    parser.add_argument("--tutorial-db", type=str, default="output/ecommerce.db",
                        help="Tutorial DB for computing expected results")
    parser.add_argument("--output-db", type=str, default=str(OUTPUT_DB),
                        help="Output exercise.db path")
    parser.add_argument("--validate-only", action="store_true", help="Validate only, no output")
    parser.add_argument("--file", type=str, help="Compile a single YAML file")
    args = parser.parse_args()

    # Find YAML files
    if args.file:
        yaml_files = [Path(args.file)]
    else:
        yaml_files = sorted(EXERCISES_DIR.rglob("*.yaml"))

    if not yaml_files:
        print("No YAML exercise files found in exercises/")
        return

    print(f"Found {len(yaml_files)} exercise files")

    # Connect to tutorial DB for expected result computation. When the file
    # is missing, expected results are simply not computed.
    conn_tutorial = None
    if os.path.exists(args.tutorial_db):
        conn_tutorial = sqlite3.connect(args.tutorial_db)
        print(f"Using tutorial DB: {args.tutorial_db}")

    conn_db = None
    total_problems = 0
    failed_files = []
    try:
        if args.validate_only:
            # Just parse and validate
            for yf in yaml_files:
                try:
                    data = load_yaml(yf)
                    meta = data.get("metadata", {})
                    problems = data.get("problems", [])
                    print(f"  OK  {yf} -{meta.get('id', '?')}: {len(problems)} problems")
                except Exception as e:
                    print(f"  ERR {yf} -{e}")
            return

        # Create exercise.db
        conn_db = create_exercise_db(Path(args.output_db))
        os.makedirs(DOCS_KO_DIR, exist_ok=True)
        os.makedirs(DOCS_EN_DIR, exist_ok=True)

        for i, yf in enumerate(yaml_files):
            try:
                result = compile_yaml_file(yf, conn_db, conn_tutorial, sort_base=i + 1)
                total_problems += result["problem_count"]

                # Write mkdocs markdown
                md_filename = f"{result['exercise_id']}.md"
                ko_path = DOCS_KO_DIR / md_filename
                en_path = DOCS_EN_DIR / md_filename

                # NOTE: the generated pages are written unconditionally, so an
                # existing file at either path is overwritten.
                ko_path.write_text(result["md_ko"], encoding="utf-8")
                en_path.write_text(result["md_en"], encoding="utf-8")

            except Exception as e:
                # Best effort: report the broken file and keep going.
                failed_files.append(yf)
                print(f"  ERR {yf}: {e}")
                import traceback
                traceback.print_exc()

        conn_db.commit()
    finally:
        # Release both connections on every exit path, including the
        # validate-only return and unexpected errors.
        if conn_db is not None:
            conn_db.close()
        if conn_tutorial is not None:
            conn_tutorial.close()

    compiled = len(yaml_files) - len(failed_files)
    print(f"\nCompiled {compiled} files, {total_problems} problems")
    if failed_files:
        print(f"  failed: {len(failed_files)} file(s)")
    print(f"  exercise.db: {args.output_db}")
    print(f"  mkdocs (ko): {DOCS_KO_DIR}/")
    print(f"  mkdocs (en): {DOCS_EN_DIR}/")


# Run the compiler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()