core: use htmlparser2 for html rewriting

This replaces DOMParser with htmlparser2, which is much, much faster.
How much faster? 80%. This new implementation can parse at 50 MB/s,
which is insane! The old one could only do 5-10 MB/s.

We still haven't gotten rid of DOMParser, though, since HTML-to-MD
conversion still needs it. That will be done soon by using `dr-sax`.

This uses a custom implementation of htmlparser2, which is 50% faster
than the default one.
This commit is contained in:
Abdullah Atta
2022-11-10 15:16:13 +05:00
parent 349cd27650
commit 205373dca3
9 changed files with 378 additions and 108 deletions

View File

@@ -19,7 +19,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
import { Parser } from "htmlparser2";
const ALLOWED_ATTRIBUTES = ["href", "src"];
const ALLOWED_ATTRIBUTES = ["href", "src", "data-hash"];
export function isHTMLEqual(one, two) {
if (typeof one !== "string" || typeof two !== "string") return false;
@@ -29,13 +29,22 @@ export function isHTMLEqual(one, two) {
function toDiffable(html) {
let text = "";
const parser = new Parser({
ontext: (data) => (text += data.trim()),
onattribute: (name, value) => {
if (ALLOWED_ATTRIBUTES.includes(name)) text += value.trim();
const parser = new Parser(
{
ontext: (data) => (text += data.trim()),
onopentag: (_name, attr) => {
for (const key of ALLOWED_ATTRIBUTES) {
const value = attr[key];
if (!value) continue;
text += value.trim();
}
}
},
{
lowerCaseTags: false,
parseAttributes: true
}
});
parser.write(html);
parser.end();
);
parser.end(html);
return text;
}