core: use htmlparser2 for html rewriting

This replaces DOMParser with htmlparser2, which is much, much faster.
How much faster? 80%. This new implementation can parse at 50 MB/s,
which is insane! The old one could only do 5-10 MB/s.

We still haven't gotten rid of the DOMParser though since HTML-to-MD
conversion still needs it. This will be done soon though by using `dr-sax`.

This uses a custom implementation (fork) of htmlparser2, which is 50%
faster than the default one.
This commit is contained in:
Abdullah Atta
2022-11-10 15:16:13 +05:00
parent 349cd27650
commit 205373dca3
9 changed files with 378 additions and 108 deletions

View File

@@ -106,7 +106,7 @@ test("note should not get headline if there is no p tag", () =>
}
}).then(async ({ db, id }) => {
let note = db.notes.note(id);
expect(note.headline).toBeUndefined();
expect(note.headline).toBe("");
}));
test("note title should allow trailing space", () =>
@@ -132,7 +132,7 @@ test("update note", () =>
title: "I am a new title",
content: {
type: TEST_NOTE.content.type,
data: "<p><br></p>"
data: "<p><br/></p>"
},
pinned: true,
favorite: true
@@ -348,7 +348,7 @@ test("note content should not contain image base64 data after save", () =>
const note = db.notes.note(id);
const content = await note.content();
expect(content).not.toContain(`src="data:image/png;`);
expect(content).not.toContain(`src=`);
expect(content).toContain(`src=""`);
}));
test("adding a note with an invalid tag should clean the tag array", () =>

View File

@@ -49,7 +49,7 @@ const notebookTest = (notebook = TEST_NOTEBOOK) =>
var TEST_NOTE = {
content: {
type: "tiptap",
data: `<p>Hello<br><span style="color:#f00">This is colorful</span></p>`
data: `<p>Hello<br/><span style="color:#f00">This is colorful</span></p>`
}
};

View File

@@ -27,7 +27,7 @@ test("img src is empty after extract attachments", async () => {
});
expect(result.attachments).toHaveLength(1);
expect(result.data).not.toContain(`src="data:image/png;`);
expect(result.data).not.toContain(`src=`);
expect(result.data).toContain(`src=""`);
expect(result.data).toContain(`data-hash="helloworld"`);
});
@@ -41,6 +41,12 @@ test("img src is present after insert attachments", async () => {
expect(result2).toContain(`src="i am a data"`);
});
test("remove attachments with particular hash", async () => {
const tiptap = new Tiptap(IMG_CONTENT);
const result = tiptap.removeAttachments(["d3eab72e94e3cd35"]);
expect(result).not.toContain(`d3eab72e94e3cd35`);
});
const HTMLS = {
tables: `<table style="border-collapse: collapse; width: 811px;" border="1"><tbody><tr><td style="width: 81.375px;">Goal</td><td style="width: 708.625px;">To introduce various features of the app to the user and to convert a user on trial or basic plan to upgrade.</td></tr><tr><td style="width: 81.375px;">Frequency</td><td style="width: 708.625px;">1/week or 2/week</td></tr><tr><td style="width: 81.375px;">Types</td><td style="width: 708.625px;">Feature intro, upgrade promo, one time emails</td></tr><tr><td style="width: 81.375px;"></td><td style="width: 708.625px;"></td></tr></tbody></table><h2>Emails</h2><h3>Feature intro</h3><p>Features:</p><ol style="list-style-type: decimal;"><li>Web clipper on mobile</li><li>Pin any note to notification</li><li>Take notes from notifications</li><li>App lock</li><li>Importer</li><li>Encrypted attachments</li><li>Session history &amp; automatic backups</li><li>Note publishing</li><li>Note exports</li><li>Collapsible headers</li></ol><h3>Promos</h3><ol style="list-style-type: decimal;"><li>Trial about to end</li><li>Trial ending (with option to request an extension)</li><li>Try free for 14 days</li></ol><h3>One time</h3><ol style="list-style-type: decimal;"><li>End-of-month progress report</li><li>What's coming/roadmap</li><li>What we are working on</li><li>Join the community</li></ol>`,
tables2: `<h1>Note 8/6/22, 10:48 AM</h1>

View File

@@ -19,7 +19,19 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
import showdown from "@streetwriters/showdown";
import dataurl from "../utils/dataurl";
import { getDummyDocument, parseHTML } from "../utils/html-parser";
import {
extractFirstParagraph,
getDummyDocument,
parseHTML
} from "../utils/html-parser";
import { Attributes, HTMLParser, HTMLRewriter } from "../utils/html-rewriter";
const ATTRIBUTES = {
hash: "data-hash",
mime: "data-mime",
filename: "data-filename",
src: "src"
};
showdown.helper.document = getDummyDocument();
var converter = new showdown.Converter();
@@ -38,6 +50,7 @@ export class Tiptap {
}
toTXT() {
if (!this.document) this.document = parseHTML(this.data);
return this.document.body.innerText;
}
@@ -46,15 +59,12 @@ export class Tiptap {
}
toHeadline() {
const paragraph = this.document.querySelector("p");
if (!paragraph) return;
return paragraph.innerText;
return extractFirstParagraph(this.data);
}
isEmpty() {
return this.toTXT().trim().length <= 0;
}
// isEmpty() {
// return this.toTXT().trim().length <= 0;
// }
/**
* @returns {Boolean}
@@ -66,109 +76,140 @@ export class Tiptap {
}
async insertMedia(getData) {
const attachmentElements = this.document.querySelectorAll("img");
for (var i = 0; i < attachmentElements.length; ++i) {
const attachment = attachmentElements[i];
switch (attachment.tagName) {
case "IMG": {
const hash = getDatasetAttribute(attachment, "hash");
if (!hash) continue;
let hashes = [];
new HTMLParser({
ontag: (name, attr) => {
const hash = Attributes.get(attr, ATTRIBUTES.hash);
if (name === "img" && hash) hashes.push(hash);
}
}).parse(this.data);
const src = await getData(hash, {
total: attachmentElements.length,
current: i
});
if (!src) continue;
attachment.setAttribute("src", src);
break;
const images = {};
for (let i = 0; i < hashes.length; ++i) {
const hash = hashes[i];
const src = await getData(hash, {
total: hashes.length,
current: i
});
if (!src) continue;
images[hash] = src;
}
return new HTMLRewriter({
ontag: (name, attr) => {
const hash = Attributes.get(attr, ATTRIBUTES.hash);
if (name === "img" && hash) {
const src = images[Attributes.get(attr, ATTRIBUTES.hash)];
if (!src) return;
return { name, attr: Attributes.set(attr, ATTRIBUTES.src, src) };
}
}
}
return this.document.body.innerHTML;
}).transform(this.data);
}
/**
* @param {string[]} hashes
* @returns
*/
removeAttachments(hashes) {
const query = hashes.map((h) => `[data-hash="${h}"]`).join(",");
const attachmentElements = this.document.querySelectorAll(query);
for (var i = 0; i < attachmentElements.length; ++i) {
const attachment = attachmentElements[i];
attachment.remove();
}
return this.document.body.innerHTML;
return new HTMLRewriter({
ontag: (_name, attr) => {
if (hashes.includes(Attributes.get(attr, ATTRIBUTES.hash)))
return false;
}
}).transform(this.data);
}
async extractAttachments(store) {
const attachments = [];
const attachmentElements = this.document.querySelectorAll("img,span");
let sources = [];
new HTMLParser({
ontag: (name, attr, pos) => {
const hash = Attributes.get(attr, ATTRIBUTES.hash);
const src = Attributes.get(attr, ATTRIBUTES.src);
if (name === "img" && !hash && src) {
sources.push({
src,
id: `${pos.start}${pos.end}`
});
}
}
}).parse(this.data);
for (var i = 0; i < attachmentElements.length; ++i) {
const attachment = attachmentElements[i];
const images = {};
for (const image of sources) {
try {
switch (attachment.tagName) {
case "IMG": {
if (!getDatasetAttribute(attachment, "hash")) {
const src = attachment.getAttribute("src");
if (!src) continue;
const { data, mime } = dataurl.toObject(image.src);
if (!data) continue;
const storeResult = await store(data, "base64");
if (!storeResult) continue;
const { data, mime } = dataurl.toObject(src);
if (!data) continue;
images[image.id] = { ...storeResult, mime };
} catch (e) {
console.error(e);
images[image.id] = false;
}
}
let attachments = [];
const html = new HTMLRewriter({
ontag: (name, attr, pos) => {
switch (name) {
case "img": {
const hash = Attributes.get(attr, ATTRIBUTES.hash);
if (hash) {
attachments.push({
hash
});
return {
name,
attr: Attributes.set(attr, ATTRIBUTES.src, "")
};
} else {
const imageData = images[`${pos.start}${pos.end}`];
if (!imageData) return imageData;
const { key, metadata, mime } = imageData;
if (!metadata.hash) return;
const type =
getDatasetAttribute(attachment, "mime") || mime || "image/jpeg";
const storeResult = await store(data, "base64");
if (!storeResult) continue;
const { key, metadata } = storeResult;
if (!metadata.hash) continue;
setDatasetAttribute(attachment, "hash", metadata.hash);
Attributes.get(attr, ATTRIBUTES.mime) || mime || "image/jpeg";
const filename =
Attributes.get(attr, ATTRIBUTES.filename) || metadata.hash;
attachments.push({
type,
filename:
getDatasetAttribute(attachment, "filename") || metadata.hash,
filename,
...metadata,
key
});
} else {
attachments.push({
hash: getDatasetAttribute(attachment, "hash")
});
return {
name,
attr: Attributes.set(
Attributes.set(attr, ATTRIBUTES.hash, metadata.hash),
ATTRIBUTES.src,
""
)
};
}
attachment.removeAttribute("src");
break;
}
default: {
if (!getDatasetAttribute(attachment, "hash")) continue;
case "span": {
const hash = Attributes.get(attr, ATTRIBUTES.hash);
if (!hash) return;
attachments.push({
hash: getDatasetAttribute(attachment, "hash")
hash
});
break;
}
}
} catch (e) {
if (e.message === "bad base-64") {
attachment.remove();
console.error(e);
continue;
}
throw e;
}
}
}).transform(this.data);
return {
data: this.document.body.innerHTML,
data: html,
attachments
};
}
}
function getDatasetAttribute(element, attribute) {
return element.getAttribute(`data-${attribute}`);
}
function setDatasetAttribute(element, attribute, value) {
return element.setAttribute(`data-${attribute}`, value);
}

View File

@@ -17,7 +17,7 @@
"dayjs": "^1.11.3",
"entities": "^4.3.1",
"fflate": "^0.7.3",
"htmlparser2": "^8.0.1",
"htmlparser2": "github:thecodrr/htmlparser2",
"linkedom": "^0.14.17",
"liqe": "^1.13.0",
"qclone": "^1.2.0",
@@ -5316,7 +5316,8 @@
"license": "MIT"
},
"node_modules/htmlparser2": {
"version": "8.0.1",
"version": "8.1.1",
"resolved": "git+ssh://git@github.com/thecodrr/htmlparser2.git#fed70e52a067bd16b72f5ae71bc7020a34a2d8c9",
"funding": [
"https://github.com/fb55/htmlparser2?sponsor=1",
{
@@ -5327,9 +5328,9 @@
"license": "MIT",
"dependencies": {
"domelementtype": "^2.3.0",
"domhandler": "^5.0.2",
"domhandler": "^5.0.3",
"domutils": "^3.0.1",
"entities": "^4.3.0"
"entities": "^4.4.0"
}
},
"node_modules/http-proxy-agent": {
@@ -12530,12 +12531,13 @@
"dev": true
},
"htmlparser2": {
"version": "8.0.1",
"version": "git+ssh://git@github.com/thecodrr/htmlparser2.git#fed70e52a067bd16b72f5ae71bc7020a34a2d8c9",
"from": "htmlparser2@github:thecodrr/htmlparser2",
"requires": {
"domelementtype": "^2.3.0",
"domhandler": "^5.0.2",
"domhandler": "^5.0.3",
"domutils": "^3.0.1",
"entities": "^4.3.0"
"entities": "^4.4.0"
}
},
"http-proxy-agent": {
@@ -13202,8 +13204,7 @@
},
"jest-pnp-resolver": {
"version": "1.2.2",
"dev": true,
"requires": {}
"dev": true
},
"jest-regex-util": {
"version": "28.0.2",
@@ -13745,8 +13746,7 @@
"dependencies": {
"ws": {
"version": "8.9.0",
"dev": true,
"requires": {}
"dev": true
}
}
},
@@ -14657,8 +14657,7 @@
}
},
"ws": {
"version": "7.5.9",
"requires": {}
"version": "7.5.9"
},
"xml-name-validator": {
"version": "4.0.0",

View File

@@ -42,7 +42,7 @@
"dayjs": "^1.11.3",
"entities": "^4.3.1",
"fflate": "^0.7.3",
"htmlparser2": "^8.0.1",
"htmlparser2": "github:thecodrr/htmlparser2",
"linkedom": "^0.14.17",
"liqe": "^1.13.0",
"qclone": "^1.2.0",

View File

@@ -19,7 +19,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
import { Parser } from "htmlparser2";
const ALLOWED_ATTRIBUTES = ["href", "src"];
const ALLOWED_ATTRIBUTES = ["href", "src", "data-hash"];
export function isHTMLEqual(one, two) {
if (typeof one !== "string" || typeof two !== "string") return false;
@@ -29,13 +29,22 @@ export function isHTMLEqual(one, two) {
function toDiffable(html) {
let text = "";
const parser = new Parser({
ontext: (data) => (text += data.trim()),
onattribute: (name, value) => {
if (ALLOWED_ATTRIBUTES.includes(name)) text += value.trim();
const parser = new Parser(
{
ontext: (data) => (text += data.trim()),
onopentag: (_name, attr) => {
for (const key of ALLOWED_ATTRIBUTES) {
const value = attr[key];
if (!value) continue;
text += value.trim();
}
}
},
{
lowerCaseTags: false,
parseAttributes: true
}
});
parser.write(html);
parser.end();
);
parser.end(html);
return text;
}

View File

@@ -18,6 +18,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
import { decodeHTML5 } from "entities";
import { Parser } from "htmlparser2";
export const parseHTML = (input) =>
new globalThis.DOMParser().parseFromString(
@@ -40,3 +41,31 @@ function wrapIntoHTMLDocument(input) {
return `<!doctype html><html lang="en"><head><title>Document Fragment</title></head><body>${input}</body></html>`;
}
/**
 * Collects the text found inside the first `<p>` element of `html`.
 *
 * Text from elements nested inside that paragraph is included. Tag names are
 * matched case-sensitively (`lowerCaseTags` is off), so only a lowercase `p`
 * counts. When no paragraph is seen the result is an empty string.
 *
 * @param {string} html markup to scan
 * @returns {string} concatenated text of the first paragraph, or `""`
 */
export function extractFirstParagraph(html) {
  let paragraphText = "";
  let insideParagraph = false;

  const handlers = {
    onopentag: (tagName) => {
      if (tagName === "p") insideParagraph = true;
    },
    onclosetag: (tagName) => {
      if (tagName !== "p") return;

      insideParagraph = false;
      // The first paragraph is complete: pause, then reset the parser
      // (presumably so the remainder of the input is never tokenized).
      parser.pause();
      parser.reset();
    },
    ontext: (chunk) => {
      if (!insideParagraph) return;
      paragraphText += chunk;
    }
  };

  const parser = new Parser(handlers, {
    lowerCaseTags: false,
    decodeEntities: true
  });
  parser.end(html);

  return paragraphText;
}

View File

@@ -0,0 +1,186 @@
/*
This file is part of the Notesnook project (https://notesnook.com/)
Copyright (C) 2022 Streetwriters (Private) Limited
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
import { Parser, Attributes } from "htmlparser2";
/**
 * Streaming HTML rewriter built on the htmlparser2 `Parser`.
 *
 * The input is re-serialized event by event into `transformed`. For every
 * opening tag the optional `ontag` callback decides whether the tag is kept
 * as is (`undefined`), replaced (`{name, attr}`) or removed together with
 * everything inside it (`false`).
 */
export class HTMLRewriter {
  /**
   *
   * @param {{
   *  ontag?: (name: string, attr: string, pos: {start: number, end: number}) => false | {name: string, attr: string} | undefined
   * }} options
   */
  constructor(options = {}) {
    const { ontag } = options;

    /**
     * Output accumulated so far.
     * @private
     */
    this.transformed = "";

    /**
     * Name of the tag whose opening `<name attr` has been written but not yet
     * terminated with `>` or `/>`; `null` when no tag is pending.
     * @private
     */
    this.currentTag = null;

    /**
     * Nesting depth inside a removed element: `null` when nothing is being
     * removed, `0` directly inside the removed element, `n` when `n` nested
     * elements have been opened inside it.
     * @private
     */
    this.ignoreIndex = null;

    /**
     * @private
     */
    this.parser = new Parser(
      {
        onreset: () => {
          // Clear *all* per-document state so the instance can be reused even
          // if the previous input ended inside a removed or half-open tag.
          this.transformed = "";
          this.currentTag = null;
          this.ignoreIndex = null;
        },
        oncomment: (data) => {
          // Comments inside a removed subtree are removed with it.
          if (this.ignoreIndex !== null) return;

          // Terminate a pending `<tag` first, otherwise the output would
          // contain `<tag<!--`.
          this.closeTag();
          this.write("<!--");
          // NOTE(review): stock htmlparser2 passes the comment text here;
          // the guard keeps this safe if the fork in use does not.
          if (typeof data === "string") this.write(data);
        },
        oncommentend: () => {
          if (this.ignoreIndex !== null) return;
          this.write("-->");
        },
        onopentag: (name, attr) => {
          if (this.ignoreIndex !== null) {
            this.ignoreIndex++;
            return;
          }

          this.closeTag();

          if (ontag) {
            const result = ontag(name, attr, {
              start: this.parser.startIndex,
              end: this.parser.endIndex
            });

            if (result === false) {
              this.ignoreIndex = 0;
              return;
            } else if (result) {
              name = result.name;
              attr = result.attr;
            }
          }

          // The closing `>` is deferred so a tag with no content can be
          // written in the self-closing `<name/>` form.
          this.write(`<${name}`);
          if (attr) this.write(` ${attr}`);
          this.currentTag = name;
        },
        onclosetag: (name, isImplied) => {
          if (this.ignoreIndex === 0) {
            this.ignoreIndex = null;
            return;
          }
          if (this.ignoreIndex !== null) {
            this.ignoreIndex--;
            return;
          }

          if (isImplied && this.currentTag) {
            // Implied close while the opening tag is still pending: nothing
            // was written inside it, so emit the self-closing form.
            this.write("/>");
            this.currentTag = null;
            return;
          }

          // Either an explicit close, or an implied close of an element that
          // already has content (e.g. an unclosed `<p>text`); both need a
          // real closing tag rather than a stray `/>`.
          this.closeTag();
          this.write(`</${name}>`);
        },
        ontext: (data) => {
          if (this.ignoreIndex !== null) {
            return;
          }

          this.closeTag();
          this.write(data);
        }
      },
      {
        recognizeSelfClosing: true,
        xmlMode: false,
        decodeEntities: false,
        lowerCaseAttributeNames: false,
        lowerCaseTags: false,
        recognizeCDATA: false,
        parseAttributes: false
      }
    );
  }

  /**
   * Writes the `>` that terminates a pending opening tag, if there is one.
   * @private
   */
  closeTag() {
    if (this.currentTag) {
      this.write(">");
      this.currentTag = null;
    }
  }

  /**
   * Feeds `html` through the parser and returns everything written so far.
   * Output accumulates across calls until `end()` resets the instance.
   *
   * @param {string} html
   * @returns {string}
   */
  transform(html) {
    this.parser.end(html);
    return this.transformed;
  }

  /**
   * Resets the underlying parser (which in turn clears the rewriter state
   * through the `onreset` handler).
   */
  end() {
    this.parser.reset();
  }

  /**
   * Appends `html` to the output buffer.
   * @private
   */
  write(html) {
    this.transformed += html;
  }
}
/**
 * Read-only companion of the rewriter: walks the markup and reports every
 * opening tag to `ontag` without producing any output.
 */
export class HTMLParser {
  /**
   *
   * @param {{
   *  ontag?: (name: string, attr: string, pos: {start: number, end: number}) => void
   * }} options `attr` is presumably the raw, unparsed attribute text because
   * the parser is created with `parseAttributes: false` - TODO confirm
   * against the htmlparser2 fork in use.
   */
  constructor(options = {}) {
    const { ontag } = options;

    /**
     * @private
     */
    this.parser = new Parser(
      {
        onopentag: (name, attr) => {
          // `ontag` is optional; without it parsing is simply a no-op.
          if (!ontag) return;

          ontag(name, attr, {
            start: this.parser.startIndex,
            end: this.parser.endIndex
          });
        }
      },
      {
        recognizeSelfClosing: true,
        xmlMode: false,
        decodeEntities: false,
        lowerCaseAttributeNames: false,
        lowerCaseTags: false,
        recognizeCDATA: false,
        parseAttributes: false
      }
    );
  }

  /**
   * Parses `html`, invoking `ontag` for each opening tag, then resets the
   * parser so the instance can be used again.
   *
   * @param {string} html
   */
  parse(html) {
    try {
      this.parser.end(html);
    } finally {
      // Reset even when a callback throws, so the parser is never left in
      // its "ended" state.
      this.parser.reset();
    }
  }
}
export { Attributes };