From e0fb809457cc410ca4aba52aca9c73cc4508645c Mon Sep 17 00:00:00 2001 From: bonggo-pras Date: Tue, 12 May 2026 16:15:10 +0700 Subject: [PATCH 1/8] perf: optimize body replacement and header/footer processing in DocxTemplate --- docxtpl/template.py | 97 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 74 insertions(+), 23 deletions(-) diff --git a/docxtpl/template.py b/docxtpl/template.py index a757037..e0b2036 100644 --- a/docxtpl/template.py +++ b/docxtpl/template.py @@ -18,9 +18,12 @@ from docx.oxml import OxmlElement from docx.oxml.ns import qn from docx.opc.constants import RELATIONSHIP_TYPE as REL_TYPE +import logging from jinja2 import Environment, Template, meta from jinja2.exceptions import TemplateError +logger = logging.getLogger(__name__) + def _create_optimized_env(**kwargs): """Create an optimized Jinja2 environment for better performance. @@ -130,6 +133,10 @@ class DocxTemplate(object): _RE_RUN_PROPS = re.compile(r".*?") _RE_PARA_PROPS = re.compile(r".*?") + # Precompiled pattern for fast detection of any Jinja syntax in a string. + # Used in render() to skip header/footer processing when no tags are present. + _JINJA_PATTERN = re.compile(r'\{\{|\{%|\{#') + def __init__(self, template_file: Union[IO[bytes], str, PathLike]) -> None: self.template_file = template_file self.reset_replacements() @@ -467,21 +474,54 @@ def build_xml(self, context, jinja_env=None): return xml def map_tree(self, tree): - """Replace body content with rendered tree. - - Instead of replacing the entire element with replace() (which - triggers expensive reconciliation), we now mutate the body's children - directly. This is much cheaper for large trees. + """Replace the body element with the rendered tree. + + Uses root.remove() + root.insert(index) instead of root.replace() to + avoid lxml's O(n) recursive cleanup on large XML trees. The body + index is located first so document element order (body before sectPr) + is preserved. 
+ + SAFETY: If the body is not a direct child of root (malformed template) + or if remove/insert raises for any reason, we fall back to copying + children so rendering is never broken by this optimisation. """ - body = self.docx._element.body - - # Remove all existing children from body - for child in list(body): - body.remove(child) - - # Append all children from the new tree - for child in list(tree): - body.append(child) + root = self.docx._element + old_body = root.body + + # Locate the body's position among root's direct children. + body_index = None + for i, child in enumerate(root): + if child is old_body: + body_index = i + break + + if body_index is None: + # Malformed template – body is not a direct child; fall back. + logger.warning( + "map_tree: body is not a direct child of root (malformed template?). " + "Falling back to child-copy implementation." + ) + for child in list(old_body): + old_body.remove(child) + for child in list(tree): + old_body.append(child) + return + + try: + root.remove(old_body) + root.insert(body_index, tree) + except Exception: + logger.warning( + "map_tree: optimized remove/insert failed; falling back to child-copy.", + exc_info=True, + ) + # Re-attach old_body if it was already removed before the failure. 
+ if old_body.getparent() is None: + root.insert(body_index, old_body) + for child in list(old_body): + old_body.remove(child) + for child in list(tree): + old_body.append(child) def get_headers_footers(self, uri): for relKey, val in self.docx._part.rels.items(): @@ -546,15 +586,26 @@ def render( # Replace body xml tree self.map_tree(tree) - # Headers - headers = self.build_headers_footers_xml(context, self.HEADER_URI, jinja_env) - for relKey, xml in headers: - self.map_headers_footers_xml(relKey, xml) - - # Footers - footers = self.build_headers_footers_xml(context, self.FOOTER_URI, jinja_env) - for relKey, xml in footers: - self.map_headers_footers_xml(relKey, xml) + # Headers & Footers – skip entirely when no Jinja tags are present to + # avoid unnecessary XML parsing, patch_xml, and part replacement. + for uri in (self.HEADER_URI, self.FOOTER_URI): + try: + has_jinja = any( + self._JINJA_PATTERN.search(self.get_part_xml(part)) + for _relKey, part in self.get_headers_footers(uri) + ) + if has_jinja: + for relKey, xml in self.build_headers_footers_xml(context, uri, jinja_env): + self.map_headers_footers_xml(relKey, xml) + except Exception: + logger.warning( + "render: header/footer Jinja-tag check failed for %s; " + "falling back to full processing.", + uri, + exc_info=True, + ) + for relKey, xml in self.build_headers_footers_xml(context, uri, jinja_env): + self.map_headers_footers_xml(relKey, xml) self.render_properties(context, jinja_env) From c82d2a449bdb37c3c22792b5aa2e21c4037aa616 Mon Sep 17 00:00:00 2001 From: Jack Byrne <46843566+JackByrne@users.noreply.github.com> Date: Mon, 18 May 2026 11:21:13 +0100 Subject: [PATCH 2/8] Remove logging warnings in template.py Delete the module-level logger and several logger.warning calls in docxtpl/template.py. Added while debugging and should be removed. 
--- docxtpl/template.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/docxtpl/template.py b/docxtpl/template.py index e0b2036..e4ba92c 100644 --- a/docxtpl/template.py +++ b/docxtpl/template.py @@ -22,8 +22,6 @@ from jinja2 import Environment, Template, meta from jinja2.exceptions import TemplateError -logger = logging.getLogger(__name__) - def _create_optimized_env(**kwargs): """Create an optimized Jinja2 environment for better performance. @@ -497,10 +495,6 @@ def map_tree(self, tree): if body_index is None: # Malformed template – body is not a direct child; fall back. - logger.warning( - "map_tree: body is not a direct child of root (malformed template?). " - "Falling back to child-copy implementation." - ) for child in list(old_body): old_body.remove(child) for child in list(tree): @@ -511,10 +505,6 @@ def map_tree(self, tree): root.remove(old_body) root.insert(body_index, tree) except Exception: - logger.warning( - "map_tree: optimized remove/insert failed; falling back to child-copy.", - exc_info=True, - ) # Re-attach old_body if it was already removed before the failure. 
if old_body.getparent() is None: root.insert(body_index, old_body) @@ -598,12 +588,6 @@ def render( for relKey, xml in self.build_headers_footers_xml(context, uri, jinja_env): self.map_headers_footers_xml(relKey, xml) except Exception: - logger.warning( - "render: header/footer Jinja-tag check failed for %s; " - "falling back to full processing.", - uri, - exc_info=True, - ) for relKey, xml in self.build_headers_footers_xml(context, uri, jinja_env): self.map_headers_footers_xml(relKey, xml) From efd473b7119034988f7f7168fc23128c25867bed Mon Sep 17 00:00:00 2001 From: Jack Byrne <46843566+JackByrne@users.noreply.github.com> Date: Mon, 18 May 2026 11:34:51 +0100 Subject: [PATCH 3/8] Clarify body-swap docstring and comments Improve documentation in map_tree to explain the optimization: the code swaps the entire <w:body> element via root.remove() + root.insert() to avoid O(n) per-child lxml operations; the swap itself is effectively O(1) on the document root. Clarify that the body's index is preserved so element order (body before sectPr) remains intact, and spell out the fallback behavior (child-by-child copy) if the body isn't a direct child or if remove/insert fails. Add additional safety and explanatory comments. --- docxtpl/template.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/docxtpl/template.py b/docxtpl/template.py index e4ba92c..a44b18c 100644 --- a/docxtpl/template.py +++ b/docxtpl/template.py @@ -474,19 +474,24 @@ def build_xml(self, context, jinja_env=None): def map_tree(self, tree): """Replace the body element with the rendered tree. - Uses root.remove() + root.insert(index) instead of root.replace() to - avoid lxml's O(n) recursive cleanup on large XML trees. The body - index is located first so document element order (body before sectPr) - is preserved.
+ Instead of iterating over all body children to remove/re-append them + one-by-one (O(n) lxml operations, each with internal bookkeeping), + we swap the entire <w:body> element in the document root using + root.remove() + root.insert(). This is O(1) since the root element + (<w:document>) has only a handful of direct children. + + The body's index is located first so document element order is + preserved (e.g. body before sectPr). SAFETY: If the body is not a direct child of root (malformed template) - or if remove/insert raises for any reason, we fall back to copying - children so rendering is never broken by this optimisation. + or if remove/insert raises for any reason, we fall back to the slower + child-by-child copy so rendering is never broken. """ root = self.docx._element old_body = root.body - # Locate the body's position among root's direct children. + # Find where <w:body> sits among root's direct children so we can + # re-insert the new tree at the same position. body_index = None for i, child in enumerate(root): if child is old_body: @@ -494,7 +499,8 @@ def map_tree(self, tree): break if body_index is None: - # Malformed template – body is not a direct child; fall back. + # Malformed template – body is not a direct child of root. + # Fall back to child-by-child replacement on the existing body. for child in list(old_body): old_body.remove(child) for child in list(tree): @@ -502,10 +508,15 @@ def map_tree(self, tree): return try: + # Detach the old body and insert the new tree (which is itself a + # <w:body> element returned by fix_tables/parse_xml) at the same + # position. This avoids O(n) per-child remove/append calls. root.remove(old_body) root.insert(body_index, tree) except Exception: - # Re-attach old_body if it was already removed before the failure. + # If something went wrong, restore the document to a usable state + # by re-attaching the old body (if it was already detached) and + # falling back to child-by-child copy.
if old_body.getparent() is None: root.insert(body_index, old_body) for child in list(old_body): From 84c14206946f98c4195023b7507280394ffd07ac Mon Sep 17 00:00:00 2001 From: Jack Byrne <46843566+JackByrne@users.noreply.github.com> Date: Mon, 18 May 2026 11:46:14 +0100 Subject: [PATCH 4/8] Improve header/footer Jinja detection and fallback Enhance header/footer processing by detecting Jinja tags split across Word XML runs: check both intact tags (_JINJA_PATTERN) and open-tag fragments (_RE_JINJA_OPEN) when scanning part XML. Use a generator to iterate part XML strings once, and keep the existing exception fallback to unconditionally render headers/footers if the fast-path check fails (e.g. malformed XML). Also add clarifying comments about properties and footnotes skipping behaviour and make minor comment style fixes. --- docxtpl/template.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/docxtpl/template.py b/docxtpl/template.py index a44b18c..1a0f5b0 100644 --- a/docxtpl/template.py +++ b/docxtpl/template.py @@ -578,38 +578,49 @@ def render( # Body xml_src = self.build_xml(context, jinja_env) - # fix tables if needed + # Fix tables if needed tree = self.fix_tables(xml_src) - # fix docPr ID's + # Fix docPr ID's self.fix_docpr_ids(tree) # Replace body xml tree self.map_tree(tree) - # Headers & Footers – skip entirely when no Jinja tags are present to - # avoid unnecessary XML parsing, patch_xml, and part replacement. + # Headers & Footers - skip when no Jinja tags are present. + # Uses both _JINJA_PATTERN (intact tags) and _RE_JINJA_OPEN (tags + # split across XML runs by Word). Falls back to full render on error. 
for uri in (self.HEADER_URI, self.FOOTER_URI): try: has_jinja = any( - self._JINJA_PATTERN.search(self.get_part_xml(part)) - for _relKey, part in self.get_headers_footers(uri) + self._JINJA_PATTERN.search(xml) + or self._RE_JINJA_OPEN.search(xml) + for xml in ( + self.get_part_xml(part) + for _relKey, part in self.get_headers_footers(uri) + ) ) if has_jinja: for relKey, xml in self.build_headers_footers_xml(context, uri, jinja_env): self.map_headers_footers_xml(relKey, xml) except Exception: + # Fallback: if the fast-path check raises (e.g. malformed XML + # in a part), process all headers/footers unconditionally. for relKey, xml in self.build_headers_footers_xml(context, uri, jinja_env): self.map_headers_footers_xml(relKey, xml) + # Properties: no skip-check needed - these are a handful of short + # strings (author, title, etc.) where from_string() is near-zero cost. self.render_properties(context, jinja_env) + # Footnotes: no skip-check needed - at most one part exists in typical + # documents, and many have none, so the loop body rarely executes. self.render_footnotes(context, jinja_env) # set rendered flag self.is_rendered = True - # using of TC tag in for cycle can cause that count of columns does not + # Using of TC tag in for cycle can cause that count of columns does not # correspond to real count of columns in row. def fix_tables(self, xml): # Use parse_xml with safe fallback for malformed XML From e5106f3a2caf1ab216e07f0341c57406be026f12 Mon Sep 17 00:00:00 2001 From: Jack Byrne <46843566+JackByrne@users.noreply.github.com> Date: Mon, 18 May 2026 12:08:28 +0100 Subject: [PATCH 5/8] Optimize resolve_listing with early exit Add a fast-path to DocxTemplate.resolve_listing that returns the input XML unchanged when no Listing special characters are present. The check looks for tab, newline, bell and form-feed ("\t", "\n", "\a", "\f") and avoids running the heavier resolution logic in the common case, improving performance without changing behavior. 
--- docxtpl/template.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docxtpl/template.py b/docxtpl/template.py index 1a0f5b0..8dfd6b3 100644 --- a/docxtpl/template.py +++ b/docxtpl/template.py @@ -419,6 +419,10 @@ def render_footnotes( part._blob = xml.encode("utf-8") def resolve_listing(self, xml): + # Early exit: if no Listing special characters are present (common case), + # there's nothing to resolve, skip the work below. + if "\t" not in xml and "\n" not in xml and "\a" not in xml and "\f" not in xml: + return xml def resolve_text(run_properties, paragraph_properties, m): xml = m.group(0).replace( From a5c3286d711362184d036934abff7bf47c315e39 Mon Sep 17 00:00:00 2001 From: Jack Byrne <46843566+JackByrne@users.noreply.github.com> Date: Mon, 18 May 2026 12:22:36 +0100 Subject: [PATCH 6/8] Precompile tag-stripping regexes in DocxTemplate Introduce pre-compiled regex patterns (_RE_TAG_STRIP and _RE_COMMENT_STRIP) to strip surrounding tags from template tags like {%y ...%}, {{y ...}} and comments {#y ...#}. Replace repeated re.sub loops with iteration over these patterns to avoid recompiling the same regexes on every call, reduce code duplication, and improve performance/maintainability. --- docxtpl/template.py | 48 +++++++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/docxtpl/template.py b/docxtpl/template.py index 8dfd6b3..66f963c 100644 --- a/docxtpl/template.py +++ b/docxtpl/template.py @@ -131,6 +131,26 @@ class DocxTemplate(object): _RE_RUN_PROPS = re.compile(r".*?") _RE_PARA_PROPS = re.compile(r".*?") + # Pre-compiled patterns for tag-stripping in patch_xml(). + # Strips surrounding tags from {%y ...%} / {{y ...}} template tags. + _RE_TAG_STRIP = tuple( + re.compile( + r"](?:(?!]).)*({%%|{{)%s ([^}%%]*(?:%%}|}})).*?" + % (y, y, y, y), + re.DOTALL, + ) + for y in ("tr", "tc", "p", "r") + ) + # Same for {#y ...#} comment tags (not 'r' - comments in runs are uncommon). 
+ _RE_COMMENT_STRIP = tuple( + re.compile( + r"](?:(?!]).)*({#)%s ([^}#]*(?:#})).*?" + % (y, y, y, y), + re.DOTALL, + ) + for y in ("tr", "tc", "p") + ) + # Precompiled pattern for fast detection of any Jinja syntax in a string. # Used in render() to skip header/footer processing when no tags are present. _JINJA_PATTERN = re.compile(r'\{\{|\{%|\{#') @@ -229,25 +249,15 @@ def cellbg(m): # -%} will merge with next paragraph text src_xml = self._RE_MERGE_NEXT.sub("%}", src_xml) - for y in ["tr", "tc", "p", "r"]: - # replace into xml code the row/paragraph/run containing - # {%y xxx %} or {{y xxx}} template tag - # by {% xxx %} or {{ xx }} without any surrounding tags : - # This is mandatory to have jinja2 generating correct xml code - pat = ( - r"](?:(?!]).)*({%%|{{)%(y)s ([^}%%]*(?:%%}|}})).*?" - % {"y": y} - ) - src_xml = re.sub(pat, r"\1 \2", src_xml, flags=re.DOTALL) - - for y in ["tr", "tc", "p"]: - # same thing, but for {#y xxx #} (but not where y == 'r', since that - # makes less sense to use comments in that context - pat = ( - r"](?:(?!]).)*({#)%(y)s ([^}#]*(?:#})).*?" - % {"y": y} - ) - src_xml = re.sub(pat, r"\1 \2", src_xml, flags=re.DOTALL) + # Strip surrounding tags from {%y ...%} / {{y ...}} template tags. + # This is mandatory for jinja2 to generate correct xml code. + # Patterns are pre-compiled as class attributes to avoid recompilation. + for pat in self._RE_TAG_STRIP: + src_xml = pat.sub(r"\1 \2", src_xml) + + # Same for {#y ...#} comment tags (not 'r' — comments in runs are uncommon). 
+ for pat in self._RE_COMMENT_STRIP: + src_xml = pat.sub(r"\1 \2", src_xml) # add vMerge # use {% vm %} to make this table cell and its copies From c042ae27b8e0ce093dfd662f9e33838c25bada58 Mon Sep 17 00:00:00 2001 From: Jack Byrne <46843566+JackByrne@users.noreply.github.com> Date: Mon, 18 May 2026 13:42:09 +0100 Subject: [PATCH 7/8] Remove unused imports from template.py Clean up docxtpl/template.py by removing unused imports: functools, logging, and Template from jinja2. Keeps Environment and meta from jinja2 and does not change runtime behavior; this reduces linter warnings and unnecessary dependencies. --- docxtpl/template.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docxtpl/template.py b/docxtpl/template.py index 66f963c..59cb62c 100644 --- a/docxtpl/template.py +++ b/docxtpl/template.py @@ -8,7 +8,6 @@ from os import PathLike from typing import TYPE_CHECKING, Any, Optional, IO, Union, Dict, Set -import functools import io from lxml import etree from docx import Document @@ -18,8 +17,7 @@ from docx.oxml import OxmlElement from docx.oxml.ns import qn from docx.opc.constants import RELATIONSHIP_TYPE as REL_TYPE -import logging -from jinja2 import Environment, Template, meta +from jinja2 import Environment, meta from jinja2.exceptions import TemplateError From ac57d571c32a3613ffe9f9192c8235cf78d818d4 Mon Sep 17 00:00:00 2001 From: Jack Byrne <46843566+JackByrne@users.noreply.github.com> Date: Mon, 18 May 2026 13:48:58 +0100 Subject: [PATCH 8/8] Clarify header/footer fallback comment Update comment in docxtpl/template.py to clarify the fallback behavior when processing headers and footers. The comment now explains the fallback guards against unexpected part structure (e.g. blob is None or missing attributes) rather than implying it handles malformed XML; malformed XML would still fail in build_headers_footers_xml. No functional change. 
--- docxtpl/template.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docxtpl/template.py b/docxtpl/template.py index 59cb62c..abcff49 100644 --- a/docxtpl/template.py +++ b/docxtpl/template.py @@ -601,7 +601,7 @@ def render( # Headers & Footers - skip when no Jinja tags are present. # Uses both _JINJA_PATTERN (intact tags) and _RE_JINJA_OPEN (tags - # split across XML runs by Word). Falls back to full render on error. + # split across XML runs by Word). for uri in (self.HEADER_URI, self.FOOTER_URI): try: has_jinja = any( @@ -616,8 +616,9 @@ def render( for relKey, xml in self.build_headers_footers_xml(context, uri, jinja_env): self.map_headers_footers_xml(relKey, xml) except Exception: - # Fallback: if the fast-path check raises (e.g. malformed XML - # in a part), process all headers/footers unconditionally. + # Fallback: guards against unexpected part structure (e.g. blob + # is None, missing attributes). Not malformed XML - that would + # fail in build_headers_footers_xml regardless. for relKey, xml in self.build_headers_footers_xml(context, uri, jinja_env): self.map_headers_footers_xml(relKey, xml)