From ebc5e8715a78b4d963672ff7350bc18e4d93d3e4 Mon Sep 17 00:00:00 2001
From: Richard Tibbles
Subject: [PATCH] Strip <img> tags from pasted HTML
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Pasted <img> tags survived into the editor doc and the saved markdown.
Remote image references then blocked channel publish (#5895).
Adds a single sync transformer (transformPastedHTML) wired into both
native paste (via editorProps.transformPastedHTML) and the toolbar
Paste button. Existing Word/Office sanitization is preserved and now
applies to both paths — previously only the toolbar ran it.
Fixes #5895
---
.../TipTapEditor/composables/useEditor.js | 2 +
.../composables/useToolbarActions.js | 4 +-
.../TipTapEditor/utils/markdown.js | 61 --------
.../TipTapEditor/utils/pasteTransform.js | 83 +++++++++++
.../__tests__/pasteTransform.spec.js | 136 ++++++++++++++++++
5 files changed, 223 insertions(+), 63 deletions(-)
create mode 100644 contentcuration/contentcuration/frontend/shared/views/TipTapEditor/TipTapEditor/utils/pasteTransform.js
create mode 100644 contentcuration/contentcuration/frontend/shared/views/TipTapEditor/__tests__/pasteTransform.spec.js
diff --git a/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/TipTapEditor/composables/useEditor.js b/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/TipTapEditor/composables/useEditor.js
index e76c6c8a0e..cd4a75d69f 100644
--- a/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/TipTapEditor/composables/useEditor.js
+++ b/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/TipTapEditor/composables/useEditor.js
@@ -10,6 +10,7 @@ import { CodeBlockSyntaxHighlight } from '../extensions/CodeBlockSyntaxHighlight
import { CustomLink } from '../extensions/Link';
import { Math } from '../extensions/Math';
import { createCustomMarkdownSerializer } from '../utils/markdownSerializer';
+import { transformPastedHTML } from '../utils/pasteTransform';
export function useEditor() {
const editor = ref(null);
@@ -42,6 +43,7 @@ export function useEditor() {
class: 'prose prose-sm sm:prose lg:prose-lg xl:prose-2xl focus:outline-none',
dir: 'auto',
},
+ transformPastedHTML: html => transformPastedHTML(html),
},
onCreate: () => {
isReady.value = true;
diff --git a/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/TipTapEditor/composables/useToolbarActions.js b/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/TipTapEditor/composables/useToolbarActions.js
index 9b77b3deaa..27215c97bd 100644
--- a/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/TipTapEditor/composables/useToolbarActions.js
+++ b/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/TipTapEditor/composables/useToolbarActions.js
@@ -1,6 +1,6 @@
import { computed, inject } from 'vue';
import { getTipTapEditorStrings } from '../TipTapEditorStrings';
-import { sanitizePastedHTML } from '../utils/markdown';
+import { transformPastedHTML } from '../utils/pasteTransform';
export function useToolbarActions(emit) {
const editor = inject('editor', null);
@@ -165,7 +165,7 @@ export function useToolbarActions(emit) {
if (item.types.includes('text/html')) {
const htmlBlob = await item.getType('text/html');
const html = await htmlBlob.text();
- const cleaned = sanitizePastedHTML(html);
+ const cleaned = transformPastedHTML(html);
editor.value.chain().focus().insertContent(cleaned).run();
return;
diff --git a/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/TipTapEditor/utils/markdown.js b/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/TipTapEditor/utils/markdown.js
index efe1adc994..a364942e39 100644
--- a/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/TipTapEditor/utils/markdown.js
+++ b/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/TipTapEditor/utils/markdown.js
@@ -58,67 +58,6 @@ export const paramsToMathMd = ({ latex }) => {
return `$$${latex || ''}$$`;
};
-export function sanitizePastedHTML(html) {
- if (!html) return '';
- // This code ine 55 to 66 is geneted with the help of LLM with the prompt
- // "Create a function that sanitizes HTML pasted from Microsoft
- // Word by removing Word-specific tags, styles, and classes while preserving other formatting."
- let cleaned = html;
- cleaned = cleaned.replace(//gis, '');
- cleaned = cleaned.replace(/<\/?(w|m|o|v):[^>]*>/gis, '');
- const parser = new DOMParser();
- const doc = parser.parseFromString(cleaned, 'text/html');
- doc.querySelectorAll('*').forEach(el => {
- if (el.hasAttribute('style')) {
- const style = el.getAttribute('style') || '';
- const filtered = style
- .split(';')
- .map(s => s.trim())
- .filter(s => s && !s.toLowerCase().startsWith('mso-'))
- .join('; ');
- if (filtered) {
- el.setAttribute('style', filtered);
- } else {
- el.removeAttribute('style');
- }
- }
- if (el.hasAttribute('class')) {
- const cls = el
- .getAttribute('class')
- .split(/\s+/)
- .filter(c => c && !/^Mso/i.test(c))
- .join(' ');
- if (cls) {
- el.setAttribute('class', cls);
- } else {
- el.removeAttribute('class');
- }
- }
- });
- const strikeElements = doc.querySelectorAll('s, strike, del');
- strikeElements.forEach(el => {
- const nestedLists = el.querySelectorAll('ul, ol');
- if (nestedLists.length > 0) {
- nestedLists.forEach(list => {
- el.parentNode.insertBefore(list, el.nextSibling);
- });
- }
- });
- const lists = doc.querySelectorAll('ul, ol');
- lists.forEach(list => {
- const items = list.querySelectorAll(':scope > li');
- items.forEach(item => {
- const nestedLists = Array.from(item.children).filter(
- child => child.tagName === 'UL' || child.tagName === 'OL',
- );
- nestedLists.forEach(nestedList => {
- item.appendChild(nestedList);
- });
- });
- });
- return doc.body.innerHTML;
-}
-
/**
* Pre-processes a raw Markdown string to convert custom syntax into HTML tags
* that Tiptap's extensions can understand. This is our custom "loader".
diff --git a/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/TipTapEditor/utils/pasteTransform.js b/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/TipTapEditor/utils/pasteTransform.js
new file mode 100644
index 0000000000..8f805aa0e9
--- /dev/null
+++ b/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/TipTapEditor/utils/pasteTransform.js
@@ -0,0 +1,83 @@
+function stripMsoConditionalComments(html) {
+ return html.replace(//gis, '');
+}
+
+function stripOfficeNamespacedTags(html) { // Drops w:/m:/o:/v: open and close tags; inner text stays.
+ return html.replace(/<\/?(w|m|o|v):[^>]*>/gis, '');
+}
+
+function filterMsoStyleDeclarations(doc) { // Removes `mso-*` declarations from every inline style.
+ doc.querySelectorAll('[style]').forEach(el => {
+ const filtered = el
+ .getAttribute('style')
+ .split(';')
+ .map(s => s.trim())
+ .filter(s => s && !s.toLowerCase().startsWith('mso-')) // case-insensitive prefix match
+ .join('; ');
+ if (filtered) {
+ el.setAttribute('style', filtered);
+ } else {
+ el.removeAttribute('style'); // nothing left: drop the attribute instead of leaving it empty
+ }
+ });
+}
+
+function filterMsoClasses(doc) { // Removes class names beginning with "Mso" (any letter case).
+ doc.querySelectorAll('[class]').forEach(el => {
+ const cls = el
+ .getAttribute('class')
+ .split(/\s+/)
+ .filter(c => c && !/^Mso/i.test(c))
+ .join(' ');
+ if (cls) {
+ el.setAttribute('class', cls);
+ } else {
+ el.removeAttribute('class'); // no classes left: remove the attribute entirely
+ }
+ });
+}
+
+function hoistListsOutOfStrike(doc) {
+ doc.querySelectorAll('s, strike, del').forEach(el => {
+ el.querySelectorAll('ul, ol').forEach(list => {
+ el.parentNode.insertBefore(list, el.nextSibling);
+ });
+ });
+}
+
+function reparentNestedListsInLi(doc) { // Moves each list nested directly in an <li> to that <li>'s end.
+ doc.querySelectorAll('ul, ol').forEach(list => {
+ list.querySelectorAll(':scope > li').forEach(item => {
+ Array.from(item.children) // snapshot first: appendChild below reorders the children
+ .filter(child => child.tagName === 'UL' || child.tagName === 'OL')
+ .forEach(nestedList => item.appendChild(nestedList)); // re-appending moves it to the end
+ });
+ });
+}
+
+function stripImages(doc) { // Removes every <img> element, whatever its src.
+ doc.querySelectorAll('img').forEach(el => el.remove());
+}
+
+const STRING_TRANSFORMS = [stripMsoConditionalComments, stripOfficeNamespacedTags];
+
+const DOM_TRANSFORMS = [ // run in this order on the parsed document, after STRING_TRANSFORMS
+ filterMsoStyleDeclarations,
+ filterMsoClasses,
+ hoistListsOutOfStrike,
+ reparentNestedListsInLi,
+ stripImages,
+];
+
+export function transformPastedHTML(html) { // Cleans pasted HTML and returns the resulting body markup.
+ if (!html) return ''; // '', null and undefined all yield ''
+ let cleaned = html;
+ for (const transform of STRING_TRANSFORMS) {
+ cleaned = transform(cleaned); // string in, string out
+ }
+ const doc = new DOMParser().parseFromString(cleaned, 'text/html');
+ for (const transform of DOM_TRANSFORMS) {
+ transform(doc); // mutates `doc` in place; return value unused
+ }
+ return doc.body.innerHTML;
+}
diff --git a/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/__tests__/pasteTransform.spec.js b/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/__tests__/pasteTransform.spec.js
new file mode 100644
index 0000000000..3f44d938cf
--- /dev/null
+++ b/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/__tests__/pasteTransform.spec.js
@@ -0,0 +1,136 @@
+import { transformPastedHTML } from '../TipTapEditor/utils/pasteTransform';
+
+describe('transformPastedHTML', () => {
+ describe('empty inputs', () => {
+ it('returns empty string for empty input', () => {
+ expect(transformPastedHTML('')).toBe('');
+ });
+
+ it('returns empty string for null', () => {
+ expect(transformPastedHTML(null)).toBe('');
+ });
+
+ it('returns empty string for undefined', () => {
+ expect(transformPastedHTML(undefined)).toBe('');
+ });
+ });
+
+ describe('image stripping', () => {
+ it('strips a single remote img', () => {
+ const input = '
after
'],
+ ['blob', ''],
+ ['file', '
'],
+ ['relative', '
'],
+ ])('strips img with %s scheme', (_scheme, imgTag) => {
+ expect(transformPastedHTML(`',
+ '
',
+ ].join('');
+ const output = transformPastedHTML(input);
+ expect(output).not.toContain(' item
top
bold italic link
before after
'; + expect(transformPastedHTML(input)).toBe('before after
'); + }); + + it('removes Office-namespaced tags (w:, m:, o:, v:)', () => { + const input = + 'before
x
'; + const output = transformPastedHTML(input); + expect(output).not.toMatch(/mso-/); + expect(output).toContain('color: red'); + expect(output).toContain('font-size: 12pt'); + }); + + it('removes the style attribute entirely when all declarations were mso-*', () => { + const input = 'x
'; + expect(transformPastedHTML(input)).toBe('x
'); + }); + + it('strips Mso* classes (case-insensitive) while keeping other classes', () => { + const input = 'x
'; + const output = transformPastedHTML(input); + expect(output).toContain('class="kept-class"'); + expect(output).not.toMatch(/Mso/i); + }); + + it('removes the class attribute entirely when all classes were Mso*', () => { + const input = 'x
'; + expect(transformPastedHTML(input)).toBe('x
'); + }); + + it('hoists nested lists out of strike/s/del wrappers', () => { + const input = 'plain text
'], + ['before after
x
'], + ['x
'], + ['y
'], + ])('is idempotent: f(f(x)) === f(x) for %s', input => { + const once = transformPastedHTML(input); + const twice = transformPastedHTML(once); + expect(twice).toBe(once); + }); + }); +});