diff --git a/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/TipTapEditor/composables/useEditor.js b/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/TipTapEditor/composables/useEditor.js index e76c6c8a0e..cd4a75d69f 100644 --- a/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/TipTapEditor/composables/useEditor.js +++ b/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/TipTapEditor/composables/useEditor.js @@ -10,6 +10,7 @@ import { CodeBlockSyntaxHighlight } from '../extensions/CodeBlockSyntaxHighlight import { CustomLink } from '../extensions/Link'; import { Math } from '../extensions/Math'; import { createCustomMarkdownSerializer } from '../utils/markdownSerializer'; +import { transformPastedHTML } from '../utils/pasteTransform'; export function useEditor() { const editor = ref(null); @@ -42,6 +43,7 @@ export function useEditor() { class: 'prose prose-sm sm:prose lg:prose-lg xl:prose-2xl focus:outline-none', dir: 'auto', }, + transformPastedHTML: html => transformPastedHTML(html), }, onCreate: () => { isReady.value = true; diff --git a/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/TipTapEditor/composables/useToolbarActions.js b/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/TipTapEditor/composables/useToolbarActions.js index 9b77b3deaa..27215c97bd 100644 --- a/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/TipTapEditor/composables/useToolbarActions.js +++ b/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/TipTapEditor/composables/useToolbarActions.js @@ -1,6 +1,6 @@ import { computed, inject } from 'vue'; import { getTipTapEditorStrings } from '../TipTapEditorStrings'; -import { sanitizePastedHTML } from '../utils/markdown'; +import { transformPastedHTML } from '../utils/pasteTransform'; export function useToolbarActions(emit) { const editor = inject('editor', null); @@ -165,7 +165,7 @@ export function useToolbarActions(emit) { if (item.types.includes('text/html')) { const htmlBlob = await item.getType('text/html'); const html = await htmlBlob.text(); - const cleaned = sanitizePastedHTML(html); + const cleaned = transformPastedHTML(html); editor.value.chain().focus().insertContent(cleaned).run(); return; diff --git a/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/TipTapEditor/utils/markdown.js b/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/TipTapEditor/utils/markdown.js index efe1adc994..a364942e39 100644 --- a/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/TipTapEditor/utils/markdown.js +++ b/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/TipTapEditor/utils/markdown.js @@ -58,67 +58,6 @@ export const paramsToMathMd = ({ latex }) => { return `$$${latex || ''}$$`; }; -export function sanitizePastedHTML(html) { - if (!html) return ''; - // This code ine 55 to 66 is geneted with the help of LLM with the prompt - // "Create a function that sanitizes HTML pasted from Microsoft - // Word by removing Word-specific tags, styles, and classes while preserving other formatting." - let cleaned = html; - cleaned = cleaned.replace(//gis, ''); - cleaned = cleaned.replace(/<\/?(w|m|o|v):[^>]*>/gis, ''); - const parser = new DOMParser(); - const doc = parser.parseFromString(cleaned, 'text/html'); - doc.querySelectorAll('*').forEach(el => { - if (el.hasAttribute('style')) { - const style = el.getAttribute('style') || ''; - const filtered = style - .split(';') - .map(s => s.trim()) - .filter(s => s && !s.toLowerCase().startsWith('mso-')) - .join('; '); - if (filtered) { - el.setAttribute('style', filtered); - } else { - el.removeAttribute('style'); - } - } - if (el.hasAttribute('class')) { - const cls = el - .getAttribute('class') - .split(/\s+/) - .filter(c => c && !/^Mso/i.test(c)) - .join(' '); - if (cls) { - el.setAttribute('class', cls); - } else { - el.removeAttribute('class'); - } - } - }); - const strikeElements = doc.querySelectorAll('s, strike, del'); - strikeElements.forEach(el => { - const nestedLists = el.querySelectorAll('ul, ol'); - if (nestedLists.length > 0) { - nestedLists.forEach(list => { - el.parentNode.insertBefore(list, el.nextSibling); - }); - } - }); - const lists = doc.querySelectorAll('ul, ol'); - lists.forEach(list => { - const items = list.querySelectorAll(':scope > li'); - items.forEach(item => { - const nestedLists = Array.from(item.children).filter( - child => child.tagName === 'UL' || child.tagName === 'OL', - ); - nestedLists.forEach(nestedList => { - item.appendChild(nestedList); - }); - }); - }); - return doc.body.innerHTML; -} - /** * Pre-processes a raw Markdown string to convert custom syntax into HTML tags * that Tiptap's extensions can understand. This is our custom "loader". diff --git a/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/TipTapEditor/utils/pasteTransform.js b/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/TipTapEditor/utils/pasteTransform.js new file mode 100644 index 0000000000..8f805aa0e9 --- /dev/null +++ b/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/TipTapEditor/utils/pasteTransform.js @@ -0,0 +1,83 @@ +function stripMsoConditionalComments(html) { + return html.replace(//gis, ''); +} + +function stripOfficeNamespacedTags(html) { + return html.replace(/<\/?(w|m|o|v):[^>]*>/gis, ''); +} + +function filterMsoStyleDeclarations(doc) { + doc.querySelectorAll('[style]').forEach(el => { + const filtered = el + .getAttribute('style') + .split(';') + .map(s => s.trim()) + .filter(s => s && !s.toLowerCase().startsWith('mso-')) + .join('; '); + if (filtered) { + el.setAttribute('style', filtered); + } else { + el.removeAttribute('style'); + } + }); +} + +function filterMsoClasses(doc) { + doc.querySelectorAll('[class]').forEach(el => { + const cls = el + .getAttribute('class') + .split(/\s+/) + .filter(c => c && !/^Mso/i.test(c)) + .join(' '); + if (cls) { + el.setAttribute('class', cls); + } else { + el.removeAttribute('class'); + } + }); +} + +function hoistListsOutOfStrike(doc) { + doc.querySelectorAll('s, strike, del').forEach(el => { + el.querySelectorAll('ul, ol').forEach(list => { + el.parentNode.insertBefore(list, el.nextSibling); + }); + }); +} + +function reparentNestedListsInLi(doc) { + doc.querySelectorAll('ul, ol').forEach(list => { + list.querySelectorAll(':scope > li').forEach(item => { + Array.from(item.children) + .filter(child => child.tagName === 'UL' || child.tagName === 'OL') + .forEach(nestedList => item.appendChild(nestedList)); + }); + }); +} + +function stripImages(doc) { + doc.querySelectorAll('img').forEach(el => el.remove()); +} + +const STRING_TRANSFORMS = [stripMsoConditionalComments, stripOfficeNamespacedTags]; + +const DOM_TRANSFORMS = [ + filterMsoStyleDeclarations, + filterMsoClasses, + hoistListsOutOfStrike, + reparentNestedListsInLi, + stripImages, +]; + +export function transformPastedHTML(html) { + if (!html) return ''; + let cleaned = html; + for (const transform of STRING_TRANSFORMS) { + cleaned = transform(cleaned); + } + const doc = new DOMParser().parseFromString(cleaned, 'text/html'); + for (const transform of DOM_TRANSFORMS) { + transform(doc); + } + return doc.body.innerHTML; +} diff --git a/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/__tests__/pasteTransform.spec.js b/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/__tests__/pasteTransform.spec.js new file mode 100644 index 0000000000..3f44d938cf --- /dev/null +++ b/contentcuration/contentcuration/frontend/shared/views/TipTapEditor/__tests__/pasteTransform.spec.js @@ -0,0 +1,136 @@ +import { transformPastedHTML } from '../TipTapEditor/utils/pasteTransform'; + +describe('transformPastedHTML', () => { + describe('empty inputs', () => { + it('returns empty string for empty input', () => { + expect(transformPastedHTML('')).toBe(''); + }); + + it('returns empty string for null', () => { + expect(transformPastedHTML(null)).toBe(''); + }); + + it('returns empty string for undefined', () => { + expect(transformPastedHTML(undefined)).toBe(''); + }); + }); + + describe('image stripping', () => { + it('strips a single remote img', () => { + const input = '
before
after
before after
'); + }); + + it('strips a data: URI img', () => { + const input = '
'],
+ ['blob', '
'],
+ ['relative', '
'],
+ ])('strips img with %s scheme', (_scheme, imgTag) => {
+ expect(transformPastedHTML(`${imgTag}
`)).toBe(''); + }); + + it('strips multiple imgs in different parents', () => { + const input = [ + 'top
bold italic link
before after
'; + expect(transformPastedHTML(input)).toBe('before after
'); + }); + + it('removes Office-namespaced tags (w:, m:, o:, v:)', () => { + const input = + 'before
x
'; + const output = transformPastedHTML(input); + expect(output).not.toMatch(/mso-/); + expect(output).toContain('color: red'); + expect(output).toContain('font-size: 12pt'); + }); + + it('removes the style attribute entirely when all declarations were mso-*', () => { + const input = 'x
'; + expect(transformPastedHTML(input)).toBe('x
'); + }); + + it('strips Mso* classes (case-insensitive) while keeping other classes', () => { + const input = 'x
'; + const output = transformPastedHTML(input); + expect(output).toContain('class="kept-class"'); + expect(output).not.toMatch(/Mso/i); + }); + + it('removes the class attribute entirely when all classes were Mso*', () => { + const input = 'x
'; + expect(transformPastedHTML(input)).toBe('x
'); + }); + + it('hoists nested lists out of strike/s/del wrappers', () => { + const input = 'plain text
'], + ['before after
x
'], + ['x
'], + ['y
'], + ])('is idempotent: f(f(x)) === f(x) for %s', input => { + const once = transformPastedHTML(input); + const twice = transformPastedHTML(once); + expect(twice).toBe(once); + }); + }); +});