editor: add support for pasting as markdown

2025-12-16 11:47:54 +01:00 · 2025-02-04 13:39:19 +05:00
parent 49ffcbea03
commit c16bbcaad5
5 changed files with 3324 additions and 23 deletions
--- a/packages/editor/package-lock.json
+++ b/packages/editor/package-lock.json
--- a/packages/editor/package.json
+++ b/packages/editor/package.json
@@ -30,6 +30,7 @@
  },
  "license": "GPL-3.0-or-later",
  "dependencies": {
+    "@notesnook-importer/core": "^2.1.1",
    "@notesnook/common": "file:../common",
    "@notesnook/intl": "file:../intl",
    "@notesnook/theme": "file:../theme",
--- a/packages/editor/src/extensions/clipboard/clipboard-dom-parser.ts
+++ b/packages/editor/src/extensions/clipboard/clipboard-dom-parser.ts
@@ -22,7 +22,7 @@ import {
  ParseOptions
 } from "@tiptap/pm/model";
 import { encodeNonAsciiHTML } from "entities";
-import { Schema, Slice } from "prosemirror-model";
+import { Node as PMNode, Schema, Slice } from "prosemirror-model";
 import { inferLanguage } from "../code-block/index.js";
 import { hasPermission } from "../../types.js";

@@ -47,6 +47,17 @@ export class ClipboardDOMParser extends ProsemirrorDOMParser {
    }
    return super.parseSlice(dom, options);
  }
+
+  /**
+   * Parses a DOM tree into a ProseMirror node.
+   *
+   * When `dom` is an `HTMLElement` or a `Document` it is first passed, in this
+   * order, through `convertGoogleDocsChecklist`, `formatCodeblocks`,
+   * `convertBrToSingleSpacedParagraphs`, `removeImages` and `removeBlockId`.
+   * Any other kind of node is handed to the base parser untouched.
+   *
+   * @param dom The DOM node to parse.
+   * @param options Parse options forwarded to the base implementation.
+   * @returns The node produced by the base parser.
+   */
+  parse(dom: Node, options?: ParseOptions): PMNode {
+    if (!(dom instanceof HTMLElement || dom instanceof Document)) {
+      return super.parse(dom, options);
+    }
+
+    // The order of these clean-up passes is kept exactly as is.
+    convertGoogleDocsChecklist(dom);
+    formatCodeblocks(dom);
+    convertBrToSingleSpacedParagraphs(dom);
+    removeImages(dom);
+    removeBlockId(dom);
+
+    return super.parse(dom, options);
+  }
 }

 export function removeBlockId(dom: HTMLElement | Document) {
--- a/packages/editor/src/extensions/clipboard/clipboard-text-parser.ts
+++ b/packages/editor/src/extensions/clipboard/clipboard-text-parser.ts
@@ -21,19 +21,34 @@ import { ResolvedPos, Slice } from "@tiptap/pm/model";
 import { encodeNonAsciiHTML } from "entities";
 import { ClipboardDOMParser } from "./clipboard-dom-parser.js";
 import { EditorView } from "@tiptap/pm/view";
+import { markdowntoHTML } from "@notesnook-importer/core/dist/src/utils/to-html.js";

+/**
+ * Turns pasted text into a ProseMirror slice.
+ *
+ * If `plain` is false and `isProbablyMarkdown(text)` returns true, the text is
+ * converted to HTML with `markdowntoHTML` (with `allowDangerousHtml` disabled),
+ * parsed into a node and returned as a slice covering that node's whole
+ * content. Otherwise the text goes through `convertTextToHTML` and is parsed
+ * with `preserveWhitespace: "full"`.
+ *
+ * @param text The pasted text.
+ * @param $context Resolved position passed to the parser as the `context` option.
+ * @param plain When true, Markdown detection is skipped.
+ *   NOTE(review): presumably set by ProseMirror for "paste as plain text" — confirm.
+ * @param view The editor view; its state's schema is used to build the parser.
+ * @returns The slice to insert.
+ */
 export function clipboardTextParser(
  text: string,
-  _$context: ResolvedPos,
-  _plain: boolean,
+  $context: ResolvedPos,
+  plain: boolean,
  view: EditorView
 ): Slice {
+  // Markdown path: render to HTML first, then reuse the clipboard DOM parser.
+  if (!plain && isProbablyMarkdown(text)) {
+    const node = ClipboardDOMParser.fromSchema(view.state.schema).parse(
+      new DOMParser().parseFromString(
+        markdowntoHTML(text, { allowDangerousHtml: false }),
+        "text/html"
+      ),
+      {
+        context: $context
+      }
+    );
+    // slice(0) with no end argument spans the node's content from the start.
+    return node.slice(0);
+  }
+
  const doc = new DOMParser().parseFromString(
    convertTextToHTML(text),
    "text/html"
  );
  return ClipboardDOMParser.fromSchema(view.state.schema).parseSlice(doc, {
-    preserveWhitespace: "full"
+    preserveWhitespace: "full",
+    context: $context
  });
 }

@@ -58,3 +73,166 @@ function encodeLine(line: string) {
  });
  return line;
 }
+
+/** One heuristic used by `isProbablyMarkdown` to classify pasted text. */
+interface MarkdownPattern {
+  /** Regular expression tested against the pasted text. */
+  pattern: RegExp;
+  /**
+   * Weight of a single match. For scored patterns it is multiplied by the
+   * number of matches and added to the total score (negative values lower it).
+   */
+  score: number;
+  /** Short label for the pattern; only used in debug output. */
+  type: string;
+}
+
+// Patterns that classify the text as Markdown on their own: a single match
+// makes `isProbablyMarkdown` return true without any scoring, so `score` is
+// not read for these entries.
+// They are run with `RegExp.test` as written (no `m` flag), so `^` and `$`
+// anchor to the start and end of the whole text rather than to each line.
+const DEFINITE_PATTERNS: MarkdownPattern[] = [
+  { pattern: /^[\t ]*#{1,6}\s+\S/, score: 0, type: "header" },
+  { pattern: /^\s*[-*+]\s+\[[ x]\]/, score: 0, type: "task" },
+  { pattern: /^\|.+\|.+\|$/, score: 0, type: "table" },
+  { pattern: /^[\t ]*>\s+.+/, score: 0, type: "blockquote" },
+  { pattern: /!\[[^\]]+\]\([^)\s]+(?:\s+"[^"]*")?\)/, score: 0, type: "image" }
+];
+
+// Scored indicators of Markdown. Each entry adds `score * matchCount` to the
+// total computed by `isProbablyMarkdown`. Matches are counted through
+// `getPatternMatches`, which applies the `g` and `m` flags, so `^` and `$`
+// anchor to individual lines here.
+const MARKDOWN_PATTERNS: MarkdownPattern[] = [
+  // Strong indicators
+  { pattern: /^[\t ]*#{1,6}\s+\S/m, score: 3, type: "header" },
+  { pattern: /^`{3}.*\n[\s\S]*?\n`{3}$/m, score: 3, type: "codeblock" },
+  { pattern: /^\s*[-*+]\s+\[[ x]\]/m, score: 3, type: "tasklist" },
+
+  // Medium indicators
+  { pattern: /\[[^\]]+\]\([^)\s]+(?:\s+"[^"]*")?\)/, score: 2, type: "link" },
+  { pattern: /^\s*\[[^\]]+\]:\s+\S+/m, score: 2, type: "reference" },
+  { pattern: /\[[^\]]+\]\[\w*\]/, score: 2, type: "referenceLink" },
+  { pattern: /^[\t ]*>\s+.+/m, score: 2, type: "blockquote" },
+  { pattern: /^[-*_]{3,}/m, score: 2, type: "hr" },
+  { pattern: /^\|.+\|.+\|$/m, score: 2, type: "table" },
+  { pattern: /^\s{0,3}[-*+]\s+\S/m, score: 2, type: "unorderedList" },
+  { pattern: /^\s*\d+\.\s+\S/m, score: 2, type: "orderedList" },
+  { pattern: /\$\$.+\$\$/m, score: 2, type: "math" },
+  { pattern: /\{:.+\}/m, score: 2, type: "attribute" },
+  // Footnote markers are written `[^label]`; the previous `[:label]` form
+  // never matched real footnotes.
+  { pattern: /\[\^(.+?)\]/, score: 2, type: "footnote" },
+
+  // Weak indicators
+  { pattern: /(?<!\*)\*\*[^*\n]+\*\*(?!\*)/m, score: 1, type: "bold" },
+  { pattern: /(?<!\*)\*[^*\n]+\*(?!\*)/m, score: 1, type: "italic" },
+  { pattern: /(?<!_)__[^_\n]+__(?!_)/m, score: 1, type: "boldUnderscore" },
+  { pattern: /(?<!_)_[^_\n]+_(?!_)/m, score: 1, type: "italicUnderscore" },
+  { pattern: /`[^`\n]+`/m, score: 1, type: "inlineCode" },
+  { pattern: /~~[^~\n]+~~/m, score: 1, type: "strikethrough" },
+
+  { pattern: /!\[[^\]]+\]\([^)\s]+(?:\s+"[^"]*")?\)/, score: 1, type: "image" }
+];
+
+// Indicators that the text is NOT Markdown (HTML/XML, bare numbers, e-mail
+// addresses, URLs, mail headers, ...). Each entry adds its negative
+// `score * matchCount` to the total computed by `isProbablyMarkdown`.
+// Matches are counted through `getPatternMatches`, which applies the `g` and
+// `m` flags, so `^` and `$` anchor to individual lines here.
+const NEGATIVE_PATTERNS: MarkdownPattern[] = [
+  { pattern: /<html>/, score: -5, type: "html" },
+  // The `?` must be escaped: unescaped it made `<` optional, so any text
+  // merely containing "xml" was penalised.
+  { pattern: /<\?xml/, score: -5, type: "xml" },
+  { pattern: /^\s*[0-9,.]+$/, score: -3, type: "numbers" },
+  {
+    pattern: /^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$/,
+    score: -5,
+    type: "email"
+  },
+  { pattern: /^https?:\/\/\S+$/, score: -3, type: "url" },
+  { pattern: /^\s*[{[\]},;]\s*$/, score: -3, type: "punctuation" },
+  { pattern: /^(Subject|From|To|Date):/, score: -4, type: "emailHeader" },
+  { pattern: /^[A-Z]{2,}:/, score: -2, type: "capsHeader" },
+  { pattern: /<script[\s\S]*?<\/script>/i, score: -4, type: "script" },
+  { pattern: /<style[\s\S]*?<\/style>/i, score: -4, type: "style" }
+];
+
+/**
+ * Heuristically decides whether a piece of pasted text is Markdown.
+ *
+ * A match of any entry in `DEFINITE_PATTERNS` returns `true` immediately.
+ * Otherwise a score is accumulated: every entry of `MARKDOWN_PATTERNS` and
+ * `NEGATIVE_PATTERNS` contributes `score * number of matches`, and a few
+ * structural hints (more than one line, a blank line between paragraphs,
+ * shallow indentation, consistent indentation between consecutive lines) add
+ * further points. The text counts as Markdown when the score is greater than
+ * 4 for texts longer than 100 characters, or greater than 3 otherwise.
+ *
+ * @param text The raw text to classify.
+ * @param debug When true, the matched patterns and the score breakdown are
+ *   logged with `console.log`.
+ * @returns `true` when the text should be treated as Markdown.
+ */
+export function isProbablyMarkdown(text: string, debug = false): boolean {
+  function log(...args: unknown[]) {
+    if (debug) console.log(...args);
+  }
+
+  // Check definite patterns first
+  const definiteMatch = DEFINITE_PATTERNS.find((p) => p.pattern.test(text));
+  if (definiteMatch) {
+    log("Definite markdown match:", definiteMatch.type);
+    return true;
+  }
+
+  let score = 0;
+  const matches: string[] = [];
+  const lines = text.split("\n");
+
+  // The breakdown is only collected when it is going to be logged.
+  function pushMatch(match: string) {
+    if (debug) matches.push(match);
+  }
+
+  // Check positive patterns
+  for (const pattern of MARKDOWN_PATTERNS) {
+    const matchCount = getPatternMatches(text, pattern.pattern);
+    if (matchCount > 0) {
+      const patternScore = pattern.score * matchCount;
+      score += patternScore;
+      pushMatch(`+${patternScore} ${pattern.type} (${matchCount} matches)`);
+    }
+  }
+
+  // Check negative patterns
+  for (const pattern of NEGATIVE_PATTERNS) {
+    const matchCount = getPatternMatches(text, pattern.pattern);
+    if (matchCount > 0) {
+      const patternScore = pattern.score * matchCount;
+      score += patternScore;
+      pushMatch(`${patternScore} ${pattern.type} (${matchCount} matches)`);
+    }
+  }
+
+  // Structure indicators
+  if (lines.length > 1) {
+    score += 1;
+    pushMatch("+1 multiline");
+  }
+  if (/\n\n/.test(text)) {
+    score += 1;
+    pushMatch("+1 paragraphs");
+  }
+  // Only spaces and tabs count as indentation. `\s` also matches line breaks,
+  // which made a blank line (or the "\n" of a CRLF pair) followed by
+  // unindented text look like an indented line.
+  if (/^[\t ]{1,4}\S/m.test(text)) {
+    score += 1;
+    pushMatch("+1 indentation");
+  }
+
+  // Check formatting consistency: count lines whose indent equals the
+  // previous line's indent or is exactly two columns deeper.
+  let consistentFormatting = 0;
+  let prevLineIndent = -1;
+
+  for (const line of lines) {
+    // -1 for lines without any non-whitespace character.
+    const indent = line.search(/\S/);
+    if (prevLineIndent !== -1) {
+      if (indent === prevLineIndent || indent === prevLineIndent + 2) {
+        consistentFormatting++;
+      }
+    }
+    prevLineIndent = indent;
+  }
+
+  if (consistentFormatting > lines.length / 2) {
+    score += 2;
+    pushMatch("+2 consistentFormatting");
+  }
+
+  const threshold = text.length > 100 ? 4 : 3;
+
+  if (debug) {
+    // Confidence is informational only; it does not influence the result.
+    const confidence = Math.min(
+      100,
+      Math.max(0, score * (text.length > 100 ? 8 : 12))
+    );
+    const result = {
+      score,
+      isLikelyMarkdown: score > threshold,
+      confidence,
+      details: {
+        length: text.length,
+        lines: lines.length,
+        consistentFormatting,
+        threshold,
+        matches
+      }
+    };
+    log("Markdown detection result:", result);
+  }
+  return score > threshold;
+}
+
+/**
+ * Counts how many times `pattern` occurs in `text`.
+ *
+ * The pattern is re-created with the `g` and `m` flags added so that every
+ * occurrence is counted and `^`/`$` anchor to individual lines. The pattern's
+ * own flags (for example `i`) are kept; passing only "gm" to the `RegExp`
+ * constructor would replace them.
+ *
+ * @param text The text to search.
+ * @param pattern The pattern to count.
+ * @returns The number of matches, 0 when there are none.
+ */
+function getPatternMatches(text: string, pattern: RegExp) {
+  let flags = pattern.flags;
+  if (!flags.includes("g")) flags += "g";
+  if (!flags.includes("m")) flags += "m";
+
+  const matches = text.match(new RegExp(pattern.source, flags)) || [];
+  return matches.length;
+}
--- a/packages/editor/src/extensions/clipboard/tests/clipboard-text-parser.test.ts
+++ b/packages/editor/src/extensions/clipboard/tests/clipboard-text-parser.test.ts
@@ -18,16 +18,240 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

 import { test } from "vitest";
-import { convertTextToHTML } from "../clipboard-text-parser.js";
+import {
+  convertTextToHTML,
+  isProbablyMarkdown
+} from "../clipboard-text-parser.js";

-const cases = [
+// Plain-text inputs mixing "\r\n" and "\n" line breaks, including runs of
+// consecutive line breaks, for the convertTextToHTML snapshot tests.
+const textCases = [
  `Hello\r\nworld`,
  `What\n\n\nNO!`,
  `Hello\r\nWorld\nwhat?\nNay!`
 ];

-for (const testCase of cases) {
+// One snapshot test per input; all of them share the same test name.
+for (const testCase of textCases) {
  test(`convert text to html`, (t) => {
    t.expect(convertTextToHTML(testCase)).toMatchSnapshot();
  });
 }
+
+// Fixtures for isProbablyMarkdown: each entry pairs an input `text` with the
+// expected verdict `isMarkdown`.
+// Continuation lines inside the multi-line template literals keep their
+// two-space indent, so that indent is part of the input text.
+const markdownCases = [
+  // Single-line markdown elements
+  {
+    text: "# Header",
+    isMarkdown: true
+  },
+  {
+    text: "## Second level",
+    isMarkdown: true
+  },
+  {
+    text: "#Not a header",
+    isMarkdown: false
+  },
+  {
+    text: "- [ ] Task",
+    isMarkdown: true
+  },
+  {
+    text: "- [x] Completed task",
+    isMarkdown: true
+  },
+  {
+    text: "- []Not a task",
+    isMarkdown: false
+  },
+
+  // Tables
+  {
+    text: "|Column 1|Column 2|",
+    isMarkdown: true
+  },
+  {
+    text: "| Name | Age |",
+    isMarkdown: true
+  },
+  {
+    text: "|Not|A|Table",
+    isMarkdown: false
+  },
+
+  // Blockquotes
+  {
+    text: "> Quoted text",
+    isMarkdown: true
+  },
+  {
+    text: ">Not a quote",
+    isMarkdown: false
+  },
+
+  // Multi-line content
+  {
+    text: `# Header
+  Some paragraph text
+
+  - List item 1
+  - List item 2`,
+    isMarkdown: true
+  },
+  {
+    text: `\`\`\`javascript
+  const x = 1;
+  console.log(x);
+  \`\`\``,
+    isMarkdown: true
+  },
+
+  // Inline formatting
+  {
+    text: "This is **bold** text with some other **bold** content and some *italic* text and some `code` text and ~~strikethough~~ is ~~awesome~~ just okey!",
+    isMarkdown: true
+  },
+
+  // Lists
+  {
+    text: `- Item 1
+  - Item 2`,
+    isMarkdown: true
+  },
+  {
+    text: `1. First
+  2. Second`,
+    isMarkdown: true
+  },
+
+  // Plain text (negative cases)
+  {
+    text: "Just plain text",
+    isMarkdown: false
+  },
+  {
+    text: "Hello world",
+    isMarkdown: false
+  },
+  {
+    text: "12345",
+    isMarkdown: false
+  },
+
+  // HTML (negative cases)
+  {
+    text: "<html><body>Test</body></html>",
+    isMarkdown: false
+  },
+  {
+    text: "<div>Content</div>",
+    isMarkdown: false
+  },
+
+  // Special cases (negative)
+  {
+    text: "test@example.com",
+    isMarkdown: false
+  },
+  {
+    text: "https://example.com",
+    isMarkdown: false
+  },
+
+  // Edge cases
+  {
+    text: "",
+    isMarkdown: false
+  },
+  {
+    text: " ",
+    isMarkdown: false
+  },
+  {
+    text: "*",
+    isMarkdown: false
+  },
+  {
+    text: "#",
+    isMarkdown: false
+  },
+
+  // Mixed content
+  {
+    text: `# Header
+  Regular paragraph with **bold** and *italic*.
+
+  \`\`\`
+  code block
+  \`\`\`
+
+  1. List item
+  2. Another item
+
+  > Blockquote`,
+    isMarkdown: true
+  },
+
+  // Ambiguous cases
+  {
+    text: "2 * 3 = 6",
+    isMarkdown: false
+  },
+  {
+    text: "c:\\path\\to\\file",
+    isMarkdown: false
+  },
+  {
+    text: "From: user@example.com",
+    isMarkdown: false
+  },
+
+  // Formatting combinations
+
+  {
+    text: "[Link](https://example.com) with some **bold** text and other _stuff_ **Bold _and italic_**.",
+    isMarkdown: true
+  },
+  {
+    text: "![Image](image.jpg)",
+    isMarkdown: true
+  },
+
+  // Reference-style links
+  {
+    text: `[link][1]
+  [1]: https://example.com`,
+    isMarkdown: true
+  },
+
+  // Whitespace variations
+  {
+    text: "  # Header with spaces",
+    isMarkdown: true
+  },
+  {
+    text: "\t> Tabbed quote",
+    isMarkdown: true
+  },
+
+  // Common user input patterns
+  {
+    text: "Hello\nWorld",
+    isMarkdown: false
+  },
+  {
+    text: "Item one\nItem two\nItem three",
+    isMarkdown: false
+  },
+  {
+    text: "YES",
+    isMarkdown: false
+  },
+  {
+    text: "OK",
+    isMarkdown: false
+  }
+];
+
+// Registers one test per fixture; the input text is part of the test name.
+markdownCases.forEach(({ text, isMarkdown }) => {
+  test(`detect as markdown ${text}`, (t) => {
+    const detected = isProbablyMarkdown(text);
+    t.expect(detected).toBe(isMarkdown);
+  });
+});