editor: add support for pasting as markdown

2025-12-16 19:57:52 +01:00 · 2025-02-04 13:39:19 +05:00
parent 49ffcbea03
commit c16bbcaad5
5 changed files with 3324 additions and 23 deletions
--- a/packages/editor/package-lock.json
+++ b/packages/editor/package-lock.json
--- a/packages/editor/package.json
+++ b/packages/editor/package.json
@@ -30,6 +30,7 @@
  },
  "license": "GPL-3.0-or-later",
  "dependencies": {
    "@notesnook-importer/core": "^2.1.1",
    "@notesnook/common": "file:../common",
    "@notesnook/intl": "file:../intl",
    "@notesnook/theme": "file:../theme",
--- a/packages/editor/src/extensions/clipboard/clipboard-dom-parser.ts
+++ b/packages/editor/src/extensions/clipboard/clipboard-dom-parser.ts
@@ -22,7 +22,7 @@ import {
  ParseOptions
 } from "@tiptap/pm/model";
 import { encodeNonAsciiHTML } from "entities";
-import { Schema, Slice } from "prosemirror-model";
+import { Node as PMNode, Schema, Slice } from "prosemirror-model";
 import { inferLanguage } from "../code-block/index.js";
 import { hasPermission } from "../../types.js";
@@ -47,6 +47,17 @@ export class ClipboardDOMParser extends ProsemirrorDOMParser {
    }
    return super.parseSlice(dom, options);
  }
  /**
   * Parses a DOM node into a ProseMirror node. When the input is an
   * `HTMLElement` or a `Document`, the clipboard pre-processing helpers are
   * run on it first; any other kind of node goes straight to the base parser.
   *
   * @param dom DOM node to parse.
   * @param options Parse options forwarded unchanged to the base parser.
   * @returns The node produced by the base `parse` implementation.
   */
  parse(dom: Node, options?: ParseOptions): PMNode {
    // Nothing to pre-process unless we were handed an element or a document.
    if (!(dom instanceof HTMLElement || dom instanceof Document)) {
      return super.parse(dom, options);
    }

    // The helpers are applied in this exact order before parsing.
    convertGoogleDocsChecklist(dom);
    formatCodeblocks(dom);
    convertBrToSingleSpacedParagraphs(dom);
    removeImages(dom);
    removeBlockId(dom);

    return super.parse(dom, options);
  }
 }
 export function removeBlockId(dom: HTMLElement | Document) {
--- a/packages/editor/src/extensions/clipboard/clipboard-text-parser.ts
+++ b/packages/editor/src/extensions/clipboard/clipboard-text-parser.ts
@@ -21,19 +21,34 @@ import { ResolvedPos, Slice } from "@tiptap/pm/model";
 import { encodeNonAsciiHTML } from "entities";
 import { ClipboardDOMParser } from "./clipboard-dom-parser.js";
 import { EditorView } from "@tiptap/pm/view";
 import { markdowntoHTML } from "@notesnook-importer/core/dist/src/utils/to-html.js";
 /**
  * Converts pasted text into a ProseMirror `Slice`.
  *
  * When `plain` is false and `isProbablyMarkdown(text)` returns true, the text
  * is converted with `markdowntoHTML` and parsed with `ClipboardDOMParser`.
  * Otherwise the text goes through `convertTextToHTML` and is parsed with
  * `preserveWhitespace: "full"`.
  *
  * @param text Pasted text.
  * @param $context Resolved position forwarded to the parser as `context`.
  * @param plain When true, markdown detection is skipped.
  *   NOTE(review): presumably set for "paste as plain text" - confirm against
  *   the ProseMirror `clipboardTextParser` editor prop.
  * @param view Editor view whose state schema is used to build the parser.
  * @returns The parsed slice.
  */
 export function clipboardTextParser(
  text: string,
-  _$context: ResolvedPos,
+  $context: ResolvedPos,
-  _plain: boolean,
+  plain: boolean,
  view: EditorView
 ): Slice {
  // Markdown path: markdown -> HTML (with `allowDangerousHtml: false`) -> DOM -> node.
  if (!plain && isProbablyMarkdown(text)) {
    const node = ClipboardDOMParser.fromSchema(view.state.schema).parse(
      new DOMParser().parseFromString(
        markdowntoHTML(text, { allowDangerousHtml: false }),
        "text/html"
      ),
      {
        context: $context
      }
    );
    // Return the parsed node's content as a slice starting at position 0.
    return node.slice(0);
  }
  // Plain-text path: text -> HTML via convertTextToHTML -> DOM -> slice.
  const doc = new DOMParser().parseFromString(
    convertTextToHTML(text),
    "text/html"
  );
  return ClipboardDOMParser.fromSchema(view.state.schema).parseSlice(doc, {
-    preserveWhitespace: "full"
+    preserveWhitespace: "full",
    context: $context
  });
 }
@@ -58,3 +73,166 @@ function encodeLine(line: string) {
  });
  return line;
 }
 /** A regular expression paired with the score it contributes when it matches. */
 interface MarkdownPattern {
  /** Expression tested against the pasted text. */
  pattern: RegExp;
  /** Amount added to the running total for every match (negative for non-markdown signals). */
  score: number;
  /** Short label used in debug output. */
  type: string;
 }
 /**
  * Patterns that classify the text as markdown on their own.
  *
  * `isProbablyMarkdown` tests these directly (no multiline flag is added), so
  * the `^`/`$` anchors refer to the start/end of the whole text. Their `score`
  * is not used.
  */
 const DEFINITE_PATTERNS: MarkdownPattern[] = [
  { pattern: /^[\t ]*#{1,6}\s+\S/, score: 0, type: "header" },
  { pattern: /^\s*[-*+]\s+\[[ x]\]/, score: 0, type: "task" },
  { pattern: /^\|.+\|.+\|$/, score: 0, type: "table" },
  { pattern: /^[\t ]*>\s+.+/, score: 0, type: "blockquote" },
  { pattern: /!\[[^\]]+\]\([^)\s]+(?:\s+"[^"]*")?\)/, score: 0, type: "image" }
 ];
 /**
  * Patterns that raise the markdown score. Each match contributes `score`
  * points, so a construct that occurs several times counts several times.
  */
 const MARKDOWN_PATTERNS: MarkdownPattern[] = [
  // Strong indicators
  { pattern: /^[\t ]*#{1,6}\s+\S/m, score: 3, type: "header" },
  { pattern: /^`{3}.*\n[\s\S]*?\n`{3}$/m, score: 3, type: "codeblock" },
  { pattern: /^\s*[-*+]\s+\[[ x]\]/m, score: 3, type: "tasklist" },
  // Medium indicators
  { pattern: /\[[^\]]+\]\([^)\s]+(?:\s+"[^"]*")?\)/, score: 2, type: "link" },
  { pattern: /^\s*\[[^\]]+\]:\s+\S+/m, score: 2, type: "reference" },
  { pattern: /\[[^\]]+\]\[\w*\]/, score: 2, type: "referenceLink" },
  { pattern: /^[\t ]*>\s+.+/m, score: 2, type: "blockquote" },
  { pattern: /^[-*_]{3,}/m, score: 2, type: "hr" },
  { pattern: /^\|.+\|.+\|$/m, score: 2, type: "table" },
  { pattern: /^\s{0,3}[-*+]\s+\S/m, score: 2, type: "unorderedList" },
  { pattern: /^\s*\d+\.\s+\S/m, score: 2, type: "orderedList" },
  { pattern: /\$\$.+\$\$/m, score: 2, type: "math" },
  { pattern: /\{:.+\}/m, score: 2, type: "attribute" },
  { pattern: /\[:(.+?)\]/, score: 2, type: "footnote" },
  // Weak indicators
  { pattern: /(?<!\*)\*\*[^*\n]+\*\*(?!\*)/m, score: 1, type: "bold" },
  { pattern: /(?<!\*)\*[^*\n]+\*(?!\*)/m, score: 1, type: "italic" },
  { pattern: /(?<!_)__[^_\n]+__(?!_)/m, score: 1, type: "boldUnderscore" },
  { pattern: /(?<!_)_[^_\n]+_(?!_)/m, score: 1, type: "italicUnderscore" },
  { pattern: /`[^`\n]+`/m, score: 1, type: "inlineCode" },
  { pattern: /~~[^~\n]+~~/m, score: 1, type: "strikethrough" },
  { pattern: /!\[[^\]]+\]\([^)\s]+(?:\s+"[^"]*")?\)/, score: 1, type: "image" }
 ];
 /**
  * Patterns that lower the markdown score because they indicate some other
  * kind of content (markup, bare numbers, e-mail addresses, URLs, ...).
  */
 const NEGATIVE_PATTERNS: MarkdownPattern[] = [
  { pattern: /<html>/, score: -5, type: "html" },
  // The `?` has to be escaped: unescaped it makes the `<` optional, which
  // turns the pattern into "any text containing xml".
  { pattern: /<\?xml/, score: -5, type: "xml" },
  { pattern: /^\s*[0-9,.]+$/, score: -3, type: "numbers" },
  {
    pattern: /^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$/,
    score: -5,
    type: "email"
  },
  { pattern: /^https?:\/\/\S+$/, score: -3, type: "url" },
  { pattern: /^\s*[{[\]},;]\s*$/, score: -3, type: "punctuation" },
  { pattern: /^(Subject|From|To|Date):/, score: -4, type: "emailHeader" },
  { pattern: /^[A-Z]{2,}:/, score: -2, type: "capsHeader" },
  { pattern: /<script[\s\S]*?<\/script>/i, score: -4, type: "script" },
  { pattern: /<style[\s\S]*?<\/style>/i, score: -4, type: "style" }
 ];
 /**
  * Heuristically decides whether `text` looks like markdown.
  *
  * A match against any of `DEFINITE_PATTERNS` answers `true` immediately.
  * Otherwise a score is accumulated from `MARKDOWN_PATTERNS` and
  * `NEGATIVE_PATTERNS` (pattern score times number of matches), from a few
  * structural bonuses (more than one line, a blank line between paragraphs,
  * a line indented by 1-4 whitespace characters) and from an
  * indentation-consistency bonus. The text counts as markdown when the score
  * is strictly greater than the threshold: 4 for texts longer than 100
  * characters, 3 otherwise.
  *
  * @param text Text to classify.
  * @param debug When true, the matched signals and the final result are
  *   written with `console.log`.
  * @returns `true` when the text is considered markdown.
  */
 export function isProbablyMarkdown(text: string, debug = false) {
  // Definite patterns short-circuit the scoring entirely.
  for (const definite of DEFINITE_PATTERNS) {
    if (!definite.pattern.test(text)) continue;
    if (debug) console.log("Definite markdown match:", definite.type);
    return true;
  }

  const matches: string[] = [];
  const record = (entry: string) => {
    // The trace is only collected when it is going to be printed.
    if (debug) matches.push(entry);
  };
  let score = 0;

  // Positive signals
  for (const { pattern, score: weight, type } of MARKDOWN_PATTERNS) {
    const hits = getPatternMatches(text, pattern);
    if (hits <= 0) continue;
    const contribution = weight * hits;
    score += contribution;
    record(`+${contribution} ${type} (${hits} matches)`);
  }

  // Negative signals (their weights are already negative)
  for (const { pattern, score: weight, type } of NEGATIVE_PATTERNS) {
    const hits = getPatternMatches(text, pattern);
    if (hits <= 0) continue;
    const contribution = weight * hits;
    score += contribution;
    record(`${contribution} ${type} (${hits} matches)`);
  }

  // Structural bonuses, one point each
  const lines = text.split("\n");
  const structure: [boolean, string][] = [
    [lines.length > 1, "+1 multiline"],
    [/\n\n/.test(text), "+1 paragraphs"],
    [/^\s{1,4}[^\s]/m.test(text), "+1 indentation"]
  ];
  for (const [applies, label] of structure) {
    if (!applies) continue;
    score += 1;
    record(label);
  }

  // Count lines whose indent equals the previous line's indent or is exactly
  // two columns deeper. A blank previous line (indent -1) is never compared.
  let consistentFormatting = 0;
  let previousIndent = -1;
  for (const line of lines) {
    const currentIndent = line.search(/\S/);
    const step = currentIndent - previousIndent;
    if (previousIndent !== -1 && (step === 0 || step === 2)) {
      consistentFormatting += 1;
    }
    previousIndent = currentIndent;
  }
  if (consistentFormatting > lines.length / 2) {
    score += 2;
    record("+2 consistentFormatting");
  }

  const isLong = text.length > 100;
  const threshold = isLong ? 4 : 3;
  const isLikelyMarkdown = score > threshold;

  if (debug) {
    // Confidence is informational only; it never influences the verdict.
    const confidence = Math.min(100, Math.max(0, score * (isLong ? 8 : 12)));
    console.log("Markdown detection result:", {
      score,
      isLikelyMarkdown,
      confidence,
      details: {
        length: text.length,
        lines: lines.length,
        consistentFormatting,
        threshold,
        matches
      }
    });
  }

  return isLikelyMarkdown;
 }
 /**
  * Counts how many times `pattern` matches in `text`.
  *
  * The pattern is re-created with the global and multiline flags added so that
  * every occurrence is counted and `^`/`$` anchor at line boundaries. The
  * pattern's own flags (for example `i`) are kept; passing a flags string to
  * `new RegExp(regex, flags)` would otherwise replace them.
  *
  * @param text Text to search.
  * @param pattern Expression to count matches of.
  * @returns Number of matches, `0` when there are none.
  */
 function getPatternMatches(text: string, pattern: RegExp) {
  // Deduplicate: a repeated flag makes the RegExp constructor throw.
  const flags = Array.from(new Set(`${pattern.flags}gm`)).join("");
  const matcher = new RegExp(pattern.source, flags);
  return text.match(matcher)?.length ?? 0;
 }
--- a/packages/editor/src/extensions/clipboard/tests/clipboard-text-parser.test.ts
+++ b/packages/editor/src/extensions/clipboard/tests/clipboard-text-parser.test.ts
@@ -18,16 +18,240 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 import { test } from "vitest";
-import { convertTextToHTML } from "../clipboard-text-parser.js";
+import {
  convertTextToHTML,
  isProbablyMarkdown
 } from "../clipboard-text-parser.js";
-const cases = [
+const textCases = [
  // Plain-text samples mixing CRLF and LF line breaks, including consecutive
  // blank lines.
  `Hello\r\nworld`,
  `What\n\n\nNO!`,
  `Hello\r\nWorld\nwhat?\nNay!`
 ];
-for (const testCase of cases) {
+for (const testCase of textCases) {
  // Each sample's HTML conversion is compared against its stored snapshot.
  test(`convert text to html`, (t) => {
    t.expect(convertTextToHTML(testCase)).toMatchSnapshot();
  });
 }
 // Inputs for `isProbablyMarkdown`, each paired with the expected verdict
 // (`isMarkdown`). The multi-line samples are template literals, so the
 // whitespace inside them is part of the input.
 const markdownCases = [
  // Single-line markdown elements
  {
    text: "# Header",
    isMarkdown: true
  },
  {
    text: "## Second level",
    isMarkdown: true
  },
  {
    text: "#Not a header",
    isMarkdown: false
  },
  {
    text: "- [ ] Task",
    isMarkdown: true
  },
  {
    text: "- [x] Completed task",
    isMarkdown: true
  },
  {
    text: "- []Not a task",
    isMarkdown: false
  },
  // Tables
  {
    text: "|Column 1|Column 2|",
    isMarkdown: true
  },
  {
    text: "| Name | Age |",
    isMarkdown: true
  },
  {
    text: "|Not|A|Table",
    isMarkdown: false
  },
  // Blockquotes
  {
    text: "> Quoted text",
    isMarkdown: true
  },
  {
    text: ">Not a quote",
    isMarkdown: false
  },
  // Multi-line content
  {
    text: `# Header
  Some paragraph text
  - List item 1
  - List item 2`,
    isMarkdown: true
  },
  {
    text: `\`\`\`javascript
  const x = 1;
  console.log(x);
  \`\`\``,
    isMarkdown: true
  },
  // Inline formatting
  {
    text: "This is **bold** text with some other **bold** content and some *italic* text and some `code` text and ~~strikethough~~ is ~~awesome~~ just okey!",
    isMarkdown: true
  },
  // Lists
  {
    text: `- Item 1
  - Item 2`,
    isMarkdown: true
  },
  {
    text: `1. First
  2. Second`,
    isMarkdown: true
  },
  // Plain text (negative cases)
  {
    text: "Just plain text",
    isMarkdown: false
  },
  {
    text: "Hello world",
    isMarkdown: false
  },
  {
    text: "12345",
    isMarkdown: false
  },
  // HTML (negative cases)
  {
    text: "<html><body>Test</body></html>",
    isMarkdown: false
  },
  {
    text: "<div>Content</div>",
    isMarkdown: false
  },
  // Special cases (negative)
  {
    text: "test@example.com",
    isMarkdown: false
  },
  {
    text: "https://example.com",
    isMarkdown: false
  },
  // Edge cases
  {
    text: "",
    isMarkdown: false
  },
  {
    text: " ",
    isMarkdown: false
  },
  {
    text: "*",
    isMarkdown: false
  },
  {
    text: "#",
    isMarkdown: false
  },
  // Mixed content
  {
    text: `# Header
  Regular paragraph with **bold** and *italic*.
  \`\`\`
  code block
  \`\`\`
  1. List item
  2. Another item
  > Blockquote`,
    isMarkdown: true
  },
  // Ambiguous cases
  {
    text: "2 * 3 = 6",
    isMarkdown: false
  },
  {
    text: "c:\\path\\to\\file",
    isMarkdown: false
  },
  {
    text: "From: user@example.com",
    isMarkdown: false
  },
  // Formatting combinations
  {
    text: "[Link](https://example.com) with some **bold** text and other _stuff_ **Bold _and italic_**.",
    isMarkdown: true
  },
  {
    text: "![Image](image.jpg)",
    isMarkdown: true
  },
  // Reference-style links
  {
    text: `[link][1]
  [1]: https://example.com`,
    isMarkdown: true
  },
  // Whitespace variations
  {
    text: "  # Header with spaces",
    isMarkdown: true
  },
  {
    text: "\t> Tabbed quote",
    isMarkdown: true
  },
  // Common user input patterns
  {
    text: "Hello\nWorld",
    isMarkdown: false
  },
  {
    text: "Item one\nItem two\nItem three",
    isMarkdown: false
  },
  {
    text: "YES",
    isMarkdown: false
  },
  {
    text: "OK",
    isMarkdown: false
  }
 ];
 // Register one detection test per sample; the sample text is part of the
 // test name so a failure identifies its input.
 markdownCases.forEach(({ text, isMarkdown }) => {
  test(`detect as markdown ${text}`, (t) => {
    const detected = isProbablyMarkdown(text);
    t.expect(detected).toBe(isMarkdown);
  });
 });