core: use htmlparser2 for html rewriting

This replaces DOMParser with htmlparser2, which is much, much faster. How much faster? 80%. This new implementation can parse at 50mb/s, which is insane! The old one could only do 5-10mb/s. We still haven't gotten rid of the DOMParser, though, since HTML-to-MD conversion still needs it. This will be done soon by using `dr-sax`. This uses a custom implementation of htmlparser2, which is 50% faster than the default one.
2025-12-23 15:09:33 +01:00 · 2022-11-10 15:16:13 +05:00
parent 349cd27650
commit 205373dca3
9 changed files with 378 additions and 108 deletions
--- a/packages/core/utils/html-diff.js
+++ b/packages/core/utils/html-diff.js
@@ -19,7 +19,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.

 import { Parser } from "htmlparser2";

-const ALLOWED_ATTRIBUTES = ["href", "src"];
+const ALLOWED_ATTRIBUTES = ["href", "src", "data-hash"];

 export function isHTMLEqual(one, two) {
  if (typeof one !== "string" || typeof two !== "string") return false;
@@ -29,13 +29,22 @@ export function isHTMLEqual(one, two) {

 function toDiffable(html) {
  let text = "";
-  const parser = new Parser({
-    ontext: (data) => (text += data.trim()),
-    onattribute: (name, value) => {
-      if (ALLOWED_ATTRIBUTES.includes(name)) text += value.trim();
+  const parser = new Parser(
+    {
+      ontext: (data) => (text += data.trim()),
+      onopentag: (_name, attr) => {
+        for (const key of ALLOWED_ATTRIBUTES) {
+          const value = attr[key];
+          if (!value) continue;
+          text += value.trim();
+        }
+      }
+    },
+    {
+      lowerCaseTags: false,
+      parseAttributes: true
    }
-  });
-  parser.write(html);
-  parser.end();
+  );
+  parser.end(html);
  return text;
 }