core: use upstream htmlparser2 instead of our in-house fork

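This is mostly because I am not 100% certain how stable our own optimized fork of htmlparser2 is. While the fork does perform a lot better, there is a significant risk that things might break in production. To avoid that, I am replacing it with the upstream version, which is much more heavily tested. As a consequence, `ontag` callbacks now receive attributes as a plain object instead of going through the `Attributes` helpers, and extracted images have their `src` attribute removed rather than set to an empty string.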
2025-12-16 11:47:54 +01:00 · 2022-11-23 14:58:34 +05:00
parent b1f5ffc2ec
commit 5e6caa7273
6 changed files with 3862 additions and 1497 deletions
--- a/packages/core/tests/notes.test.js
+++ b/packages/core/tests/notes.test.js
@@ -348,7 +348,7 @@ test("note content should not contain image base64 data after save", () =>
    const note = db.notes.note(id);
    const content = await note.content();
    expect(content).not.toContain(`src="data:image/png;`);
-    expect(content).toContain(`src=""`);
+    expect(content).not.toContain(`src=`);
  }));

 test("adding a note with an invalid tag should clean the tag array", () =>
--- a/packages/core/content-types/tests/tiptap.test.js
+++ b/packages/core/content-types/tests/tiptap.test.js
@@ -27,7 +27,7 @@ test("img src is empty after extract attachments", async () => {
  });
  expect(result.attachments).toHaveLength(1);
  expect(result.data).not.toContain(`src="data:image/png;`);
-  expect(result.data).toContain(`src=""`);
+  expect(result.data).not.toContain(`src=""`);
  expect(result.data).toContain(`data-hash="helloworld"`);
 });

--- a/packages/core/content-types/tiptap.js
+++ b/packages/core/content-types/tiptap.js
@@ -20,7 +20,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 import showdown from "@streetwriters/showdown";
 import dataurl from "../utils/dataurl";
 import { extractFirstParagraph, getDummyDocument } from "../utils/html-parser";
-import { Attributes, HTMLParser, HTMLRewriter } from "../utils/html-rewriter";
+import { HTMLParser, HTMLRewriter } from "../utils/html-rewriter";
 import { convert } from "html-to-text";
 import { list } from "html-to-text/lib/formatter";

@@ -101,7 +101,7 @@ export class Tiptap {
    let hashes = [];
    new HTMLParser({
      ontag: (name, attr) => {
-        const hash = Attributes.get(attr, ATTRIBUTES.hash);
+        const hash = attr[ATTRIBUTES.hash];
        if (name === "img" && hash) hashes.push(hash);
      }
    }).parse(this.data);
@@ -119,12 +119,11 @@ export class Tiptap {

    return new HTMLRewriter({
      ontag: (name, attr) => {
-        const hash = Attributes.get(attr, ATTRIBUTES.hash);
+        const hash = attr[ATTRIBUTES.hash];
        if (name === "img" && hash) {
-          const src = images[Attributes.get(attr, ATTRIBUTES.hash)];
+          const src = images[hash];
          if (!src) return;
-
-          return { name, attr: Attributes.set(attr, ATTRIBUTES.src, src) };
+          attr[ATTRIBUTES.src] = src;
        }
      }
    }).transform(this.data);
@@ -137,8 +136,7 @@ export class Tiptap {
  removeAttachments(hashes) {
    return new HTMLRewriter({
      ontag: (_name, attr) => {
-        if (hashes.includes(Attributes.get(attr, ATTRIBUTES.hash)))
-          return false;
+        if (hashes.includes(attr[ATTRIBUTES.hash])) return false;
      }
    }).transform(this.data);
  }
@@ -147,8 +145,8 @@ export class Tiptap {
    let sources = [];
    new HTMLParser({
      ontag: (name, attr, pos) => {
-        const hash = Attributes.get(attr, ATTRIBUTES.hash);
-        const src = Attributes.get(attr, ATTRIBUTES.src);
+        const hash = attr[ATTRIBUTES.hash];
+        const src = attr[ATTRIBUTES.src];
        if (name === "img" && !hash && src) {
          sources.push({
            src,
@@ -178,16 +176,13 @@ export class Tiptap {
      ontag: (name, attr, pos) => {
        switch (name) {
          case "img": {
-            const hash = Attributes.get(attr, ATTRIBUTES.hash);
+            const hash = attr[ATTRIBUTES.hash];

            if (hash) {
              attachments.push({
                hash
              });
-              return {
-                name,
-                attr: Attributes.set(attr, ATTRIBUTES.src, "")
-              };
+              delete attr[ATTRIBUTES.src];
            } else {
              const imageData = images[`${pos.start}${pos.end}`];
              if (!imageData) return imageData;
@@ -195,10 +190,8 @@ export class Tiptap {
              const { key, metadata, mime } = imageData;
              if (!metadata.hash) return;

-              const type =
-                Attributes.get(attr, ATTRIBUTES.mime) || mime || "image/jpeg";
-              const filename =
-                Attributes.get(attr, ATTRIBUTES.filename) || metadata.hash;
+              const type = attr[ATTRIBUTES.mime] || mime || "image/jpeg";
+              const filename = attr[ATTRIBUTES.filename] || metadata.hash;

              attachments.push({
                type,
@@ -207,18 +200,13 @@ export class Tiptap {
                key
              });

-              return {
-                name,
-                attr: Attributes.set(
-                  Attributes.set(attr, ATTRIBUTES.hash, metadata.hash),
-                  ATTRIBUTES.src,
-                  ""
-                )
-              };
+              attr[ATTRIBUTES.hash] = metadata.hash;
+              delete attr[ATTRIBUTES.src];
            }
+            break;
          }
          case "span": {
-            const hash = Attributes.get(attr, ATTRIBUTES.hash);
+            const hash = attr[ATTRIBUTES.hash];
            if (!hash) return;
            attachments.push({
              hash
--- a/packages/core/package-lock.json
+++ b/packages/core/package-lock.json
--- a/packages/core/package.json
+++ b/packages/core/package.json
@@ -40,13 +40,16 @@
    "async-mutex": "^0.3.2",
    "dayjs": "^1.11.3",
    "entities": "^4.3.1",
-    "htmlparser2": "github:thecodrr/htmlparser2",
    "html-to-text": "github:thecodrr/node-html-to-text",
+    "htmlparser2": "^8.0.1",
    "linkedom": "^0.14.17",
    "liqe": "^1.13.0",
    "qclone": "^1.2.0",
    "spark-md5": "^3.0.2"
  },
+  "overrides": {
+    "htmlparser2": "^8.0.1"
+  },
  "np": {
    "releaseDraft": false,
    "message": "chore: bump version to %s"
--- a/packages/core/utils/html-rewriter.js
+++ b/packages/core/utils/html-rewriter.js
@@ -17,7 +17,7 @@ You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

-import { Parser, Attributes } from "htmlparser2";
+import { Parser } from "htmlparser2";

 export class HTMLRewriter {
  /**
@@ -73,7 +73,12 @@ export class HTMLRewriter {
          }

          this.write(`<${name}`);
-          if (attr) this.write(` ${attr}`);
+          if (attr) {
+            for (const key in attr) {
+              if (!key) continue;
+              this.write(` ${key}="${attr[key]}"`);
+            }
+          }
          this.currentTag = name;
        },
        onclosetag: (name, isImplied) => {
@@ -111,8 +116,7 @@ export class HTMLRewriter {
        decodeEntities: false,
        lowerCaseAttributeNames: false,
        lowerCaseTags: false,
-        recognizeCDATA: false,
-        parseAttributes: false
+        recognizeCDATA: false
      }
    );
  }
@@ -171,8 +175,7 @@ export class HTMLParser {
        decodeEntities: false,
        lowerCaseAttributeNames: false,
        lowerCaseTags: false,
-        recognizeCDATA: false,
-        parseAttributes: false
+        recognizeCDATA: false
      }
    );
  }
@@ -182,5 +185,3 @@ export class HTMLParser {
    this.parser.reset();
  }
 }
-
-export { Attributes };