Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import { CodeBlockSyntaxHighlight } from '../extensions/CodeBlockSyntaxHighlight
import { CustomLink } from '../extensions/Link';
import { Math } from '../extensions/Math';
import { createCustomMarkdownSerializer } from '../utils/markdownSerializer';
import { transformPastedHTML } from '../utils/pasteTransform';

export function useEditor() {
const editor = ref(null);
Expand Down Expand Up @@ -42,6 +43,7 @@ export function useEditor() {
class: 'prose prose-sm sm:prose lg:prose-lg xl:prose-2xl focus:outline-none',
dir: 'auto',
},
transformPastedHTML: html => transformPastedHTML(html),
},
onCreate: () => {
isReady.value = true;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { computed, inject } from 'vue';
import { getTipTapEditorStrings } from '../TipTapEditorStrings';
import { sanitizePastedHTML } from '../utils/markdown';
import { transformPastedHTML } from '../utils/pasteTransform';

export function useToolbarActions(emit) {
const editor = inject('editor', null);
Expand Down Expand Up @@ -165,7 +165,7 @@ export function useToolbarActions(emit) {
if (item.types.includes('text/html')) {
const htmlBlob = await item.getType('text/html');
const html = await htmlBlob.text();
const cleaned = sanitizePastedHTML(html);
const cleaned = transformPastedHTML(html);

editor.value.chain().focus().insertContent(cleaned).run();
return;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,67 +58,6 @@ export const paramsToMathMd = ({ latex }) => {
return `$$${latex || ''}$$`;
};

/**
 * Sanitizes HTML pasted from Microsoft Word.
 *
 * Removes Word-specific conditional comments, Office-namespaced tags, `mso-*`
 * style declarations and `Mso*` classes, then normalises list placement:
 * lists wrapped in strikethrough elements are moved out of the wrapper, and
 * lists that are direct children of an `<li>` are moved to the end of it.
 *
 * @param {string} html - Raw HTML string taken from the clipboard.
 * @returns {string} The cleaned markup (`innerHTML` of the parsed body), or
 *   an empty string when `html` is falsy.
 */
export function sanitizePastedHTML(html) {
  if (!html) return '';

  // The Word clean-up in this function was generated with the help of an LLM using the prompt
  // "Create a function that sanitizes HTML pasted from Microsoft
  // Word by removing Word-specific tags, styles, and classes while preserving other formatting."
  let cleaned = html;
  // Conditional comments such as <!--[if gte mso 9]>...<![endif]-->.
  cleaned = cleaned.replace(/<!--\[if.*?endif\]-->/gis, '');
  // Opening and closing tags in the w:, m:, o: and v: namespaces.
  cleaned = cleaned.replace(/<\/?(w|m|o|v):[^>]*>/gis, '');

  const parser = new DOMParser();
  const doc = parser.parseFromString(cleaned, 'text/html');

  doc.querySelectorAll('*').forEach(el => {
    if (el.hasAttribute('style')) {
      const style = el.getAttribute('style') || '';
      const filtered = style
        .split(';')
        .map(s => s.trim())
        .filter(s => s && !s.toLowerCase().startsWith('mso-'))
        .join('; ');
      if (filtered) {
        el.setAttribute('style', filtered);
      } else {
        // Nothing but mso-* declarations: drop the attribute entirely.
        el.removeAttribute('style');
      }
    }
    if (el.hasAttribute('class')) {
      const cls = el
        .getAttribute('class')
        .split(/\s+/)
        .filter(c => c && !/^Mso/i.test(c))
        .join(' ');
      if (cls) {
        el.setAttribute('class', cls);
      } else {
        el.removeAttribute('class');
      }
    }
  });

  // Move lists out of strikethrough wrappers, placing them right after the wrapper.
  doc.querySelectorAll('s, strike, del').forEach(el => {
    const parent = el.parentNode;
    // Each hoisted list is inserted after the previous one so that the
    // original document order is preserved (inserting every list directly
    // after the wrapper would reverse them).
    let anchor = el;
    el.querySelectorAll('ul, ol').forEach(list => {
      // A nested list has already left the wrapper together with its
      // ancestor list; leave it where it is so the nesting is not flattened.
      if (!el.contains(list)) return;
      parent.insertBefore(list, anchor.nextSibling);
      anchor = list;
    });
  });

  // Within every list item, move directly nested lists to the end of the item.
  doc.querySelectorAll('ul, ol').forEach(list => {
    list.querySelectorAll(':scope > li').forEach(item => {
      // Snapshot the children first: appendChild mutates the live collection.
      const nestedLists = Array.from(item.children).filter(
        child => child.tagName === 'UL' || child.tagName === 'OL',
      );
      nestedLists.forEach(nestedList => {
        item.appendChild(nestedList);
      });
    });
  });

  return doc.body.innerHTML;
}

/**
* Pre-processes a raw Markdown string to convert custom syntax into HTML tags
* that Tiptap's extensions can understand. This is our custom "loader".
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
/**
 * Removes conditional comments of the form `<!--[if ...]> ... <![endif]-->`
 * from an HTML string.
 *
 * @param {string} html - Markup to clean.
 * @returns {string} The markup with every conditional comment removed.
 */
function stripMsoConditionalComments(html) {
  // Lazy match with the `s` flag so a comment spanning several lines is
  // removed as one unit, stopping at the first closing `endif]-->`.
  const conditionalComment = /<!--\[if.*?endif\]-->/gis;
  const withoutComments = html.replace(conditionalComment, '');
  return withoutComments;
}

/**
 * Removes opening and closing tags in the `w:`, `m:`, `o:` and `v:`
 * namespaces from an HTML string. Only the tags themselves are removed; any
 * text between them is left in place.
 *
 * @param {string} html - Markup to clean.
 * @returns {string} The markup without the namespaced tags.
 */
function stripOfficeNamespacedTags(html) {
  const namespacedTag = /<\/?(w|m|o|v):[^>]*>/gis;
  const withoutTags = html.replace(namespacedTag, '');
  return withoutTags;
}

/**
 * Drops `mso-*` declarations from every inline `style` attribute in `doc`.
 *
 * The remaining declarations are trimmed and re-joined with `'; '`. When no
 * declaration survives, the `style` attribute is removed altogether.
 * Mutates `doc` in place.
 *
 * @param {Document} doc - Parsed document whose elements are cleaned.
 * @returns {void}
 */
function filterMsoStyleDeclarations(doc) {
  const isKept = declaration =>
    declaration !== '' && !declaration.toLowerCase().startsWith('mso-');

  for (const el of doc.querySelectorAll('[style]')) {
    const kept = [];
    for (const rawDeclaration of el.getAttribute('style').split(';')) {
      const declaration = rawDeclaration.trim();
      if (isKept(declaration)) {
        kept.push(declaration);
      }
    }

    if (kept.length === 0) {
      el.removeAttribute('style');
    } else {
      el.setAttribute('style', kept.join('; '));
    }
  }
}

/**
 * Removes class names starting with `Mso` (case-insensitive) from every
 * element in `doc` that has a `class` attribute.
 *
 * Surviving class names are re-joined with single spaces. When none survive,
 * the `class` attribute is removed altogether. Mutates `doc` in place.
 *
 * @param {Document} doc - Parsed document whose elements are cleaned.
 * @returns {void}
 */
function filterMsoClasses(doc) {
  const msoPrefix = /^Mso/i;

  for (const el of doc.querySelectorAll('[class]')) {
    const kept = [];
    for (const className of el.getAttribute('class').split(/\s+/)) {
      // Splitting on whitespace yields empty strings for leading/trailing spaces.
      if (className !== '' && !msoPrefix.test(className)) {
        kept.push(className);
      }
    }

    if (kept.length === 0) {
      el.removeAttribute('class');
    } else {
      el.setAttribute('class', kept.join(' '));
    }
  }
}

/**
 * Moves lists found inside strikethrough wrappers (`<s>`, `<strike>`,
 * `<del>`) out of the wrapper, placing them immediately after it.
 *
 * Hoisted lists keep their original document order, and a list nested inside
 * another hoisted list stays inside its parent list rather than being
 * flattened into a sibling. Mutates `doc` in place.
 *
 * @param {Document} doc - Parsed document to normalise.
 * @returns {void}
 */
function hoistListsOutOfStrike(doc) {
  doc.querySelectorAll('s, strike, del').forEach(el => {
    const parent = el.parentNode;
    // Insert each list after the previously hoisted one. Always inserting at
    // `el.nextSibling` would place later lists in front of earlier ones.
    let anchor = el;
    el.querySelectorAll('ul, ol').forEach(list => {
      // A nested list has already left the wrapper together with its
      // ancestor list; moving it again would pull it out of its <li>.
      if (!el.contains(list)) return;
      parent.insertBefore(list, anchor.nextSibling);
      anchor = list;
    });
  });
}

/**
 * For every `<li>` that is a direct child of a list, moves its directly
 * nested `<ul>`/`<ol>` children to the end of that `<li>`, keeping their
 * relative order. Mutates `doc` in place.
 *
 * @param {Document} doc - Parsed document to normalise.
 * @returns {void}
 */
function reparentNestedListsInLi(doc) {
  const isList = node => node.tagName === 'UL' || node.tagName === 'OL';

  for (const list of doc.querySelectorAll('ul, ol')) {
    for (const item of list.querySelectorAll(':scope > li')) {
      // Snapshot before moving anything: appendChild reorders the live children.
      const nestedLists = Array.from(item.children).filter(isList);
      for (const nestedList of nestedLists) {
        item.appendChild(nestedList);
      }
    }
  }
}

/**
 * Removes every `<img>` element from `doc`, whatever its `src`.
 * Mutates `doc` in place.
 *
 * @param {Document} doc - Parsed document to clean.
 * @returns {void}
 */
function stripImages(doc) {
  const images = doc.querySelectorAll('img');
  for (const image of images) {
    image.remove();
  }
}

// Passes applied to the raw HTML string, in order, before it is parsed.
const STRING_TRANSFORMS = [
  stripMsoConditionalComments,
  stripOfficeNamespacedTags,
];

// Passes applied to the parsed document, in order. Each one mutates the
// document in place.
const DOM_TRANSFORMS = [
  filterMsoStyleDeclarations,
  filterMsoClasses,
  hoistListsOutOfStrike,
  reparentNestedListsInLi,
  stripImages,
];

/**
 * Cleans pasted HTML by running it through the string-level transforms,
 * parsing the result, and then running the DOM-level transforms.
 *
 * @param {string} html - Raw pasted HTML.
 * @returns {string} `innerHTML` of the transformed document body, or an
 *   empty string when `html` is falsy.
 */
export function transformPastedHTML(html) {
  if (!html) {
    return '';
  }

  const markup = STRING_TRANSFORMS.reduce((current, apply) => apply(current), html);
  const doc = new DOMParser().parseFromString(markup, 'text/html');

  DOM_TRANSFORMS.forEach(apply => {
    apply(doc);
  });

  return doc.body.innerHTML;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
import { transformPastedHTML } from '../TipTapEditor/utils/pasteTransform';

describe('transformPastedHTML', () => {
// Falsy input never reaches the parser: the transform returns '' directly.
describe('empty inputs', () => {
  it.each([
    ['empty input', ''],
    ['null', null],
    ['undefined', undefined],
  ])('returns empty string for %s', (_label, value) => {
    expect(transformPastedHTML(value)).toBe('');
  });
});

// Every <img> is removed from pasted markup regardless of its source, while
// the surrounding text and inline formatting are left untouched.
describe('image stripping', () => {
  it('strips a single remote img', () => {
    const input = '<p>before <img src="https://example.com/x.png"> after</p>';
    // Two spaces remain: the text on each side of the removed <img> keeps its own space.
    expect(transformPastedHTML(input)).toBe('<p>before  after</p>');
  });

  it('strips a data: URI img', () => {
    const input = '<p><img src="data:image/png;base64,iVBORw0KGgo="></p>';
    expect(transformPastedHTML(input)).toBe('<p></p>');
  });

  it('strips img with no src', () => {
    const input = '<p><img></p>';
    expect(transformPastedHTML(input)).toBe('<p></p>');
  });

  it.each([
    ['http', '<img src="http://x.test/a.png">'],
    ['blob', '<img src="blob:https://x.test/abc">'],
    ['file', '<img src="file:///tmp/a.png">'],
    ['relative', '<img src="../a.png">'],
  ])('strips img with %s scheme', (_scheme, imgTag) => {
    expect(transformPastedHTML(`<p>${imgTag}</p>`)).toBe('<p></p>');
  });

  it('strips multiple imgs in different parents', () => {
    const input = [
      '<p>top <img src="a"></p>',
      '<img src="b">',
      '<ul><li><img src="c"> item</li></ul>',
    ].join('');
    const output = transformPastedHTML(input);
    expect(output).not.toContain('<img');
    expect(output).toContain('<p>top </p>');
    expect(output).toContain('<ul><li> item</li></ul>');
  });

  it('preserves surrounding marks when stripping mixed imgs', () => {
    const input =
      '<p><strong>bold</strong> <img src="a"> <em>italic</em> <a href="https://x">link</a></p>';
    const output = transformPastedHTML(input);
    expect(output).not.toContain('<img');
    expect(output).toContain('<strong>bold</strong>');
    expect(output).toContain('<em>italic</em>');
    expect(output).toContain('<a href="https://x">link</a>');
  });
});

// Word-specific markup (conditional comments, namespaced tags, mso-* styles,
// Mso* classes) is removed, and list placement is normalised.
describe('Word/Office cleanup', () => {
  it('removes MSO conditional comments', () => {
    const input = '<p>before <!--[if gte mso 9]><xml>junk</xml><![endif]--> after</p>';
    // Two spaces remain: the text on each side of the removed comment keeps its own space.
    expect(transformPastedHTML(input)).toBe('<p>before  after</p>');
  });

  it('removes Office-namespaced tags (w:, m:, o:, v:)', () => {
    const input =
      '<p>before<w:hint val="x"></w:hint><o:p></o:p><m:r></m:r><v:rect></v:rect>after</p>';
    const output = transformPastedHTML(input);
    expect(output).not.toMatch(/<\/?[wmov]:/);
    expect(output).toContain('before');
    expect(output).toContain('after');
  });

  it('strips mso-* style declarations while keeping other styles', () => {
    const input =
      '<p style="mso-list:l0 level1; color: red; mso-bidi-font-size: 11pt; font-size: 12pt">x</p>';
    const output = transformPastedHTML(input);
    expect(output).not.toMatch(/mso-/);
    expect(output).toContain('color: red');
    expect(output).toContain('font-size: 12pt');
  });

  it('removes the style attribute entirely when all declarations were mso-*', () => {
    const input = '<p style="mso-list:l0 level1;mso-bidi-font-size: 11pt">x</p>';
    expect(transformPastedHTML(input)).toBe('<p>x</p>');
  });

  it('strips Mso* classes (case-insensitive) while keeping other classes', () => {
    const input = '<p class="MsoNormal kept-class MSOPlain">x</p>';
    const output = transformPastedHTML(input);
    expect(output).toContain('class="kept-class"');
    expect(output).not.toMatch(/Mso/i);
  });

  it('removes the class attribute entirely when all classes were Mso*', () => {
    const input = '<p class="MsoNormal MsoListParagraph">x</p>';
    expect(transformPastedHTML(input)).toBe('<p>x</p>');
  });

  it('hoists nested lists out of strike/s/del wrappers', () => {
    const input = '<s><ul><li>a</li></ul></s>';
    const output = transformPastedHTML(input);
    expect(output).toContain('<ul><li>a</li></ul>');
    // The wrapper must be closed before the list starts, i.e. the list now follows it.
    expect(output.indexOf('</s>')).toBeLessThan(output.indexOf('<ul>'));
  });

  it('re-parents nested lists inside <li> to the end of the <li>', () => {
    const input = '<ul><li>text<ul><li>nested</li></ul>more text</li></ul>';
    const output = transformPastedHTML(input);
    expect(output).toMatch(/<li>textmore text<ul><li>nested<\/li><\/ul><\/li>/);
  });
});

// Running the transform on its own output must not change it any further.
describe('idempotency', () => {
  const samples = [
    ['<p>plain text</p>'],
    ['<p>before <img src="x"> after</p>'],
    ['<p style="mso-bidi-font-size:11pt;color:red">x</p>'],
    ['<p class="MsoNormal kept">x</p>'],
    ['<s><ul><li>a</li></ul></s>'],
    ['<ul><li>text<ul><li>n</li></ul>more</li></ul>'],
    ['<!--[if gte mso 9]>x<![endif]--><p>y</p>'],
  ];

  it.each(samples)('is idempotent: f(f(x)) === f(x) for %s', input => {
    const firstPass = transformPastedHTML(input);
    const secondPass = transformPastedHTML(firstPass);
    expect(secondPass).toBe(firstPass);
  });
});
});
Loading