[vnotex] support inline formula

tamlok · tamlok · commit 3d68f77383ca · 2025-03-14T22:39:47.000+08:00
diff --git a/parser_test/CMakeLists.txt b/parser_test/CMakeLists.txt
@@ -3,7 +3,9 @@ set(TEST_TARGETS
     strikethrough_tests
     image_tests
     block_quote_tests
-    mark_tests)
+    mark_tests
+    formula_inline_tests
+    code_tests)
 
 foreach(TARGET ${TEST_TARGETS})
     add_executable(${TARGET} ${TARGET}.c test_utils.c)
diff --git a/parser_test/code_tests.c b/parser_test/code_tests.c
@@ -0,0 +1,87 @@
+#include <cmark.h>
+
+#include "test_utils.h"
+
+// A single-backtick code span rendered with CMARK_OPT_SOURCEPOS: the
+// expected <code> sourcepos spans columns 1-6, i.e. it includes both
+// backticks.
+int test_code_simple() {
+  const char *markdown = "`code`";
+  const char *expected =
+      "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
+      "<!DOCTYPE document SYSTEM \"CommonMark.dtd\">\n"
+      "<document sourcepos=\"1:1-1:6\" xmlns=\"http://commonmark.org/xml/1.0\">\n"
+      "  <paragraph sourcepos=\"1:1-1:6\">\n"
+      "    <code sourcepos=\"1:1-1:6\" xml:space=\"preserve\">code</code>\n"
+      "  </paragraph>\n"
+      "</document>\n";
+
+  return test_xml(markdown, expected, CMARK_OPT_SOURCEPOS);
+}
+
+// Two code spans separated by plain text: each span is expected as its own
+// <code> node with the text between them kept as a <text> node.
+int test_code_multiple() {
+  const char *markdown = "`first` and `second`";
+  const char *expected =
+      "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
+      "<!DOCTYPE document SYSTEM \"CommonMark.dtd\">\n"
+      "<document xmlns=\"http://commonmark.org/xml/1.0\">\n"
+      "  <paragraph>\n"
+      "    <code xml:space=\"preserve\">first</code>\n"
+      "    <text xml:space=\"preserve\"> and </text>\n"
+      "    <code xml:space=\"preserve\">second</code>\n"
+      "  </paragraph>\n"
+      "</document>\n";
+
+  return test_xml(markdown, expected, CMARK_OPT_DEFAULT);
+}
+
+// A code span padded with two spaces on each side: the expected output has
+// one leading and one trailing space removed, interior runs untouched.
+int test_code_with_spaces() {
+  const char *markdown = "`  code  with  spaces  `";
+  const char *expected =
+      "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
+      "<!DOCTYPE document SYSTEM \"CommonMark.dtd\">\n"
+      "<document xmlns=\"http://commonmark.org/xml/1.0\">\n"
+      "  <paragraph>\n"
+      "    <code xml:space=\"preserve\"> code  with  spaces </code>\n"
+      "  </paragraph>\n"
+      "</document>\n";
+
+  return test_xml(markdown, expected, CMARK_OPT_DEFAULT);
+}
+
+// A double-backtick span that contains single backticks: the inner
+// backticks are expected to survive verbatim inside the <code> node.
+int test_code_double_backticks() {
+  const char *markdown = "``code with `backticks` inside``";
+  const char *expected =
+      "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
+      "<!DOCTYPE document SYSTEM \"CommonMark.dtd\">\n"
+      "<document xmlns=\"http://commonmark.org/xml/1.0\">\n"
+      "  <paragraph>\n"
+      "    <code xml:space=\"preserve\">code with `backticks` inside</code>\n"
+      "  </paragraph>\n"
+      "</document>\n";
+
+  return test_xml(markdown, expected, CMARK_OPT_DEFAULT);
+}
+
+// An opening backtick with no closing partner: the whole input is expected
+// to come out as a single literal <text> node.
+int test_code_not_closed() {
+  const char *markdown = "`not closed";
+  const char *expected =
+      "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
+      "<!DOCTYPE document SYSTEM \"CommonMark.dtd\">\n"
+      "<document xmlns=\"http://commonmark.org/xml/1.0\">\n"
+      "  <paragraph>\n"
+      "    <text xml:space=\"preserve\">`not closed</text>\n"
+      "  </paragraph>\n"
+      "</document>\n";
+
+  return test_xml(markdown, expected, CMARK_OPT_DEFAULT);
+}
+
+// Two adjacent backticks with nothing between them: expected to be kept as
+// the literal text "``" rather than producing an empty <code> node.
+int test_code_empty() {
+  const char *markdown = "``";
+  const char *expected =
+      "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
+      "<!DOCTYPE document SYSTEM \"CommonMark.dtd\">\n"
+      "<document xmlns=\"http://commonmark.org/xml/1.0\">\n"
+      "  <paragraph>\n"
+      "    <text xml:space=\"preserve\">``</text>\n"
+      "  </paragraph>\n"
+      "</document>\n";
+
+  return test_xml(markdown, expected, CMARK_OPT_DEFAULT);
+}
+
+// Test driver for the code-span cases: hands each test function to the
+// CASE macro in turn and returns 0 once all of them have been executed.
+// NOTE(review): CASE is not defined in this file (presumably it comes from
+// test_utils.h); how it reports or aborts on a failing case should be
+// confirmed there.
+int main() {
+  CASE(test_code_simple);
+  CASE(test_code_multiple);
+  CASE(test_code_with_spaces);
+  CASE(test_code_double_backticks);
+  CASE(test_code_not_closed);
+  CASE(test_code_empty);
+  return 0;
+}
diff --git a/parser_test/formula_inline_tests.c b/parser_test/formula_inline_tests.c
@@ -0,0 +1,74 @@
+#include <cmark.h>
+
+#include "test_utils.h"
+
+// A single dollar-delimited formula: the expected <formula_inline> node
+// carries the text between the dollars, without the delimiters.
+int test_formula_inline_simple() {
+  const char *markdown = "$E=mc^2$";
+  const char *expected =
+      "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
+      "<!DOCTYPE document SYSTEM \"CommonMark.dtd\">\n"
+      "<document xmlns=\"http://commonmark.org/xml/1.0\">\n"
+      "  <paragraph>\n"
+      "    <formula_inline xml:space=\"preserve\">E=mc^2</formula_inline>\n"
+      "  </paragraph>\n"
+      "</document>\n";
+
+  return test_xml(markdown, expected, CMARK_OPT_DEFAULT);
+}
+
+// Two formulas separated by plain text: each is expected as its own
+// <formula_inline> node with the text between them kept as <text>.
+int test_formula_inline_multiple() {
+  const char *markdown = "$a+b$ and $c+d$";
+  const char *expected =
+      "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
+      "<!DOCTYPE document SYSTEM \"CommonMark.dtd\">\n"
+      "<document xmlns=\"http://commonmark.org/xml/1.0\">\n"
+      "  <paragraph>\n"
+      "    <formula_inline xml:space=\"preserve\">a+b</formula_inline>\n"
+      "    <text xml:space=\"preserve\"> and </text>\n"
+      "    <formula_inline xml:space=\"preserve\">c+d</formula_inline>\n"
+      "  </paragraph>\n"
+      "</document>\n";
+
+  return test_xml(markdown, expected, CMARK_OPT_DEFAULT);
+}
+
+// A formula containing a backslash-escaped dollar (the markdown text is
+// `$a\$b$`): the escaped dollar must not close the formula, and the
+// expected content has the backslash removed ("a$b").
+int test_formula_inline_with_escape() {
+  const char *markdown = "$a\\$b$";
+  const char *expected =
+      "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
+      "<!DOCTYPE document SYSTEM \"CommonMark.dtd\">\n"
+      "<document xmlns=\"http://commonmark.org/xml/1.0\">\n"
+      "  <paragraph>\n"
+      "    <formula_inline xml:space=\"preserve\">a$b</formula_inline>\n"
+      "  </paragraph>\n"
+      "</document>\n";
+
+  return test_xml(markdown, expected, CMARK_OPT_DEFAULT);
+}
+
+// An opening dollar with no closing partner: the whole input is expected
+// to come out as a single literal <text> node.
+int test_formula_inline_not_closed() {
+  const char *markdown = "$formula";
+  const char *expected =
+      "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
+      "<!DOCTYPE document SYSTEM \"CommonMark.dtd\">\n"
+      "<document xmlns=\"http://commonmark.org/xml/1.0\">\n"
+      "  <paragraph>\n"
+      "    <text xml:space=\"preserve\">$formula</text>\n"
+      "  </paragraph>\n"
+      "</document>\n";
+
+  return test_xml(markdown, expected, CMARK_OPT_DEFAULT);
+}
+
+// Two adjacent dollars with nothing between them: expected to be kept as
+// the literal text "$$" rather than producing an empty formula node.
+int test_formula_inline_empty() {
+  const char *markdown = "$$";
+  const char *expected =
+      "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
+      "<!DOCTYPE document SYSTEM \"CommonMark.dtd\">\n"
+      "<document xmlns=\"http://commonmark.org/xml/1.0\">\n"
+      "  <paragraph>\n"
+      "    <text xml:space=\"preserve\">$$</text>\n"
+      "  </paragraph>\n"
+      "</document>\n";
+
+  return test_xml(markdown, expected, CMARK_OPT_DEFAULT);
+}
+
+// Test driver for the inline-formula cases: hands each test function to
+// the CASE macro in turn and returns 0 once all of them have been executed.
+// NOTE(review): CASE is not defined in this file (presumably it comes from
+// test_utils.h); how it reports or aborts on a failing case should be
+// confirmed there.
+int main() {
+  CASE(test_formula_inline_simple);
+  CASE(test_formula_inline_multiple);
+  CASE(test_formula_inline_with_escape);
+  CASE(test_formula_inline_not_closed);
+  CASE(test_formula_inline_empty);
+  return 0;
+}
diff --git a/parser_test/image_tests.c b/parser_test/image_tests.c
@@ -5,14 +5,14 @@ static int test_basic_image() {
     const char *markdown = "![Alt text](image.png)\n";
     const char *expected = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
                            "<!DOCTYPE document SYSTEM \"CommonMark.dtd\">\n"
-                           "<document xmlns=\"http://commonmark.org/xml/1.0\">\n"
-                           "  <paragraph>\n"
-                           "    <image destination=\"image.png\">\n"
-                           "      <text xml:space=\"preserve\">Alt text</text>\n"
+                           "<document sourcepos=\"1:1-1:22\" xmlns=\"http://commonmark.org/xml/1.0\">\n"
+                           "  <paragraph sourcepos=\"1:1-1:22\">\n"
+                           "    <image sourcepos=\"1:1-1:22\" destination=\"image.png\">\n"
+                           "      <text sourcepos=\"1:3-1:10\" xml:space=\"preserve\">Alt text</text>\n"
                            "    </image>\n"
                            "  </paragraph>\n"
                            "</document>\n";
-    return test_xml(markdown, expected, CMARK_OPT_DEFAULT);
+    return test_xml(markdown, expected, CMARK_OPT_SOURCEPOS);
 }
 
 static int test_image_with_title() {
diff --git a/src/buffer.c b/src/buffer.c
@@ -207,3 +207,17 @@ void cmark_strbuf_unescape(cmark_strbuf *buf) {
 
   cmark_strbuf_truncate(buf, w);
 }
+
+// Destructively unescape a string: drop every backslash that immediately
+// precedes @c. The buffer is compacted in place and then truncated to the
+// new length. Backslashes followed by any other character (or sitting at
+// the very end of the buffer) are kept as they are.
+void cmark_strbuf_unescape_char(cmark_strbuf *buf, char c) {
+  bufsize_t r, w;
+
+  for (r = 0, w = 0; r < buf->size; ++r) {
+    // Look ahead only while the next byte is still inside the buffer, so a
+    // trailing backslash is never matched against (or replaced by) the
+    // byte past the logical end.
+    if (buf->ptr[r] == '\\' && r + 1 < buf->size && buf->ptr[r + 1] == c) {
+      r++;
+    }
+
+    buf->ptr[w++] = buf->ptr[r];
+  }
+
+  cmark_strbuf_truncate(buf, w);
+}
diff --git a/src/buffer.h b/src/buffer.h
@@ -66,6 +66,8 @@ void cmark_strbuf_rtrim(cmark_strbuf *buf);
 void cmark_strbuf_trim(cmark_strbuf *buf);
 void cmark_strbuf_normalize_whitespace(cmark_strbuf *s);
 void cmark_strbuf_unescape(cmark_strbuf *s);
+// Destructively remove each backslash that immediately precedes the
+// character @c in @s; all other bytes are left unchanged.
+void cmark_strbuf_unescape_char(cmark_strbuf *s, char c);
 
 #ifdef __cplusplus
 }
diff --git a/src/cmark.h b/src/cmark.h
@@ -61,6 +61,7 @@ typedef enum {
   CMARK_NODE_STRONG,
   CMARK_NODE_STRIKETHROUGH,
   CMARK_NODE_MARK,
+  CMARK_NODE_FORMULA_INLINE,
   CMARK_NODE_LINK,
   CMARK_NODE_IMAGE,
 
@@ -691,6 +692,9 @@ const char *cmark_version_string(void);
 #define NODE_CUSTOM_INLINE CMARK_NODE_CUSTOM_INLINE
 #define NODE_EMPH CMARK_NODE_EMPH
 #define NODE_STRONG CMARK_NODE_STRONG
+#define NODE_STRIKETHROUGH CMARK_NODE_STRIKETHROUGH
+#define NODE_MARK CMARK_NODE_MARK
+#define NODE_FORMULA_INLINE CMARK_NODE_FORMULA_INLINE
 #define NODE_LINK CMARK_NODE_LINK
 #define NODE_IMAGE CMARK_NODE_IMAGE
 #define BULLET_LIST CMARK_BULLET_LIST
diff --git a/src/inlines.c b/src/inlines.c
@@ -357,6 +357,41 @@ static bufsize_t scan_to_closing_backticks(subject *subj,
   return 0;
 }
 
+
+// Look for the dollar that closes an inline formula whose opening dollar
+// has already been consumed. A dollar preceded by an odd number of
+// consecutive backslashes counts as escaped and is skipped; a line break
+// ends the search unsuccessfully. On success the subject is left just
+// after the closing dollar and that position is returned. On failure the
+// subject is rewound to where the scan started and 0 is returned.
+static bufsize_t scan_to_closing_dollar(subject *subj) {
+  const bufsize_t origin = subj->pos;
+  // True while the run of backslashes directly before the current
+  // character has odd length, i.e. the current character is escaped.
+  bool escaped = false;
+  unsigned char ch;
+
+  for (;;) {
+    ch = peek_char(subj);
+    if (ch == 0) {
+      // Nothing left to read.
+      break;
+    }
+    if (ch == '$' && !escaped) {
+      // Unescaped dollar: candidate closer.
+      break;
+    }
+    if (ch == '\r' || ch == '\n') {
+      // Line break is not allowed.
+      break;
+    }
+    escaped = (ch == '\\') ? !escaped : false;
+    advance(subj);
+  }
+
+  if (is_eof(subj) || ch != '$') {
+    // No closer found: rewind.
+    subj->pos = origin;
+    return 0;
+  }
+
+  advance(subj);
+  return (subj->pos);
+}
+
+
 // Destructively modify string, converting newlines to
 // spaces, then removing a single leading + trailing space,
 // unless the code span consists entirely of space characters.
@@ -412,8 +447,9 @@ static cmark_node *handle_backticks(subject *subj, int options) {
                      endpos - startpos - openticks.len);
     S_normalize_code(&buf);
 
-    cmark_node *node = make_literal(subj, CMARK_NODE_CODE, startpos,
-                                    endpos - openticks.len - 1);
+    // VNoteX: let's fix it to include the ticks.
+    cmark_node *node = make_literal(subj, CMARK_NODE_CODE, startpos - openticks.len,
+                                    endpos - 1);
     node->len = buf.size;
     node->data = cmark_strbuf_detach(&buf);
     adjust_subj_node_newlines(subj, node, endpos - startpos, openticks.len, options);
@@ -422,6 +458,70 @@ static cmark_node *handle_backticks(subject *subj, int options) {
 }
 
 
+// Emit a plain "$" text node for the dollar at @dollar_pos and leave the
+// subject at @resume_pos, the position right after that dollar.
+static cmark_node *make_raw_dollar(subject *subj, bufsize_t dollar_pos,
+                                   bufsize_t resume_pos) {
+  subj->pos = resume_pos;
+  return make_str(subj, dollar_pos, dollar_pos, cmark_chunk_literal("$"));
+}
+
+// Parse a dollar-delimited inline formula, or fall back to literal dollar
+// text. Assumes that the subject has a dollar at the current position.
+//
+// The dollar is emitted as plain text when any of these holds:
+//   - the character before it is '$', a digit, or a backslash;
+//   - no closing dollar is found by scan_to_closing_dollar();
+//   - the character before the closing dollar is a space or a tab;
+//   - the character after the closing dollar is a digit.
+// An empty "$$" pair is emitted as the two-character text "$$".
+// Otherwise a CMARK_NODE_FORMULA_INLINE node is returned whose data is the
+// text between the dollars with every "\$" reduced to "$".
+static cmark_node *handle_dollar(subject *subj) {
+  const bufsize_t dollar_pos = subj->pos;
+  // Step over the opening dollar.
+  advance(subj);
+  const bufsize_t body_start = subj->pos;
+
+  // Opening rule: inspect the character just before the opening dollar.
+  if (body_start > 1) {
+    const unsigned char prev = peek_at(subj, body_start - 2);
+    if (prev == '$' || prev == '\\' || (prev >= '0' && prev <= '9')) {
+      return make_raw_dollar(subj, dollar_pos, body_start);
+    }
+  }
+
+  const bufsize_t close_end = scan_to_closing_dollar(subj);
+  if (close_end == 0) {
+    return make_raw_dollar(subj, dollar_pos, body_start);
+  }
+
+  // "$$" has no content; consume both dollars as literal text.
+  if (close_end - body_start == 1) {
+    return make_str(subj, dollar_pos, body_start, cmark_chunk_literal("$$"));
+  }
+
+  // Closing rule 1: no whitespace directly before the closing dollar.
+  const unsigned char last = peek_at(subj, close_end - 2);
+  if (last == ' ' || last == '\t') {
+    return make_raw_dollar(subj, dollar_pos, body_start);
+  }
+
+  // Closing rule 2: no digit directly after the closing dollar.
+  if (close_end < subj->input.len) {
+    const unsigned char next = peek_at(subj, close_end);
+    if (next >= '0' && next <= '9') {
+      return make_raw_dollar(subj, dollar_pos, body_start);
+    }
+  }
+
+  // Copy the text between the dollars and drop the escaping backslashes.
+  cmark_strbuf content = CMARK_BUF_INIT(subj->mem);
+  cmark_strbuf_set(&content, subj->input.data + body_start,
+                   close_end - body_start - 1);
+  cmark_strbuf_unescape_char(&content, '$');
+
+  cmark_node *formula = make_literal(subj, CMARK_NODE_FORMULA_INLINE,
+                                     body_start, close_end - 2);
+  formula->len = content.size;
+  formula->data = cmark_strbuf_detach(&content);
+  return formula;
+}
+
+
 // Scan ***, **, or * and return number scanned, or 0.
 // Advances position.
 static int scan_delims(subject *subj, unsigned char c, bool *can_open,
@@ -1330,10 +1430,10 @@ static cmark_node *handle_newline(subject *subj) {
 
 static bufsize_t subject_find_special_char(subject *subj, int options) {
   // "\r\n\\`&_*[]<!"
-  // Add '~', '='.
+  // Add '~', '=', '$'.
   static const int8_t SPECIAL_CHARS[256] = {
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -1391,6 +1491,9 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) {
   case '`':
     new_inl = handle_backticks(subj, options);
     break;
+  case '$':
+    new_inl = handle_dollar(subj);
+    break;
   case '\\':
     new_inl = handle_backslash(subj);
     break;
diff --git a/src/node.c b/src/node.c
diff --git a/src/xml.c b/src/xml.c

Original file line number	Diff line number	Diff line change
`@@ -66,6 +66,8 @@ void cmark_strbuf_rtrim(cmark_strbuf *buf);`
`66`	`66`	`void cmark_strbuf_trim(cmark_strbuf *buf);`
`67`	`67`	`void cmark_strbuf_normalize_whitespace(cmark_strbuf *s);`
`68`	`68`	`void cmark_strbuf_unescape(cmark_strbuf *s);`
	`69`	`+// Unescape the @c.`
	`70`	`+void cmark_strbuf_unescape_char(cmark_strbuf *s, char c);`
`69`	`71`
`70`	`72`	`#ifdef __cplusplus`
`71`	`73`	`}`