diff options
| author | Kyle Javier [kj_sh604] | 2026-02-28 15:02:16 -0500 |
|---|---|---|
| committer | GitHub | 2026-02-28 15:02:16 -0500 |
| commit | 8f9756189c777074b88de39c2de1e2f7153352c2 (patch) | |
| tree | 9810f206bc9b97f1429bc7f207caf80f6ead7986 | |
| parent | fafc3e29832779b5ccbea8fd21dc9fd5af67de38 (diff) | |
| parent | 47a9736a1dfa8bfd4c5e5edd111e6ad28536066f (diff) | |
[merge] pull request #2 from kj-sh604/feat/use-dash-st-rewrite
# feat: use the `-st` implementation from the `feat/same-template-concat` branch as the main `kjandoc` binary
this pull request updates the project to improve the quality and fidelity of merged `.pptx` files, and simplifies dependencies.
the most significant changes are a rewrite of the merging approach to preserve editability and formatting, and the removal of several python dependencies that are no longer needed (as seen in the `feat/same-template-concat` branch that I still have up).
## enhancements to merging functionality:
* the merging process now operates directly on the ooxml/zip structure of `.pptx` files, preserving full editability and achieving near-complete fidelity to the original formatting. slide masters, layouts, themes, notes, and embedded media are all copied, and duplicate media files are deduplicated.
* a final libreoffice normalization step is used to clean up structural issues.
## dependency updates:
* removed unnecessary dependencies from `src/requirements.txt`, including `pillow`, `python-pptx`, `typing_extensions`, and `xlsxwriter`, leaving only `lxml` as a required python package.
| -rw-r--r-- | .gitignore | 1 | ||||
| -rw-r--r-- | README.md | 23 | ||||
| -rwxr-xr-x | src/kjandoc | 842 | ||||
| -rw-r--r-- | src/requirements.txt | 4 |
4 files changed, 719 insertions, 151 deletions
| @@ -1 +1,2 @@ | |||
| 1 | .venv | 1 | .venv |
| 2 | __pycache__/ | ||
| @@ -10,31 +10,32 @@ https://github.com/user-attachments/assets/c7fe58c1-ff76-41bf-977b-870247a6a3e2 | |||
| 10 | 10 | ||
| 11 | ## what it does | 11 | ## what it does |
| 12 | - merges multiple .pptx files into one | 12 | - merges multiple .pptx files into one |
| 13 | - preserves visual formatting by rendering slides and rebuilding a new deck | 13 | - preserves full editability, with 99% fidelity to the original formatting (some minor quirks may occur) |
| 14 | - copies slide masters, layouts, themes, notes, and embedded media | ||
| 14 | - `pandoc`-style usage: `kjandoc input1.pptx input2.pptx -o combined.pptx` | 15 | - `pandoc`-style usage: `kjandoc input1.pptx input2.pptx -o combined.pptx` |
| 15 | 16 | ||
| 16 | ## why this exists | 17 | ## why this exists |
| 17 | `pandoc` is great, but it can't concatenate `.pptx` files. | 18 | `pandoc` is great, but it can't concatenate `.pptx` files. |
| 18 | 19 | ||
| 19 | this uses a headless libreoffice + pdf -> png rendering to get a merge with most formatting preserved. | 20 | this works directly at the OOXML/ZIP level: it reads each `.pptx` as a ZIP archive, rewires all internal XML relationships, and writes a new, nearly fully Microsoft-compliant `.pptx`. |
| 20 | 21 | ||
| 21 | the tradeoff is the output slides are images (not editable shapes). | 22 | a final LibreOffice normalization pass cleans up any lingering structural quirks to prevent PowerPoint repair prompts (not guaranteed though). |
| 22 | 23 | ||
| 23 | ## usage | 24 | ## usage |
| 24 | ```bash | 25 | ```bash |
| 25 | # pandoc-style usage | 26 | # pandoc-style usage |
| 26 | ./kjandoc input1.pptx input2.pptx -o combined.pptx | 27 | ./kjandoc input1.pptx input2.pptx -o combined.pptx |
| 27 | 28 | ||
| 28 | # tweak quality | 29 | # merge more than two |
| 29 | ./kjandoc input1.pptx input2.pptx -o combined.pptx --dpi 150 | 30 | ./kjandoc a.pptx b.pptx c.pptx -o combined.pptx |
| 30 | ``` | 31 | ``` |
| 31 | 32 | ||
| 32 | ## deps | 33 | ## deps |
| 33 | - python3 | 34 | - python3 |
| 34 | - libreoffice | 35 | - libreoffice (for the normalization pass) |
| 35 | - poppler (pdftoppm) | 36 | - python deps in requirements.txt (`lxml`) |
| 36 | - python deps in requirements.txt | ||
| 37 | 37 | ||
| 38 | ## notes | 38 | ## notes |
| 39 | - output size is larger (images) | 39 | - output slides are fully editable |
| 40 | - visuals stay intact for the most part | 40 | - masters and layouts from all source files are carried over |
| 41 | - duplicate media files are deduplicated automatically | ||
diff --git a/src/kjandoc b/src/kjandoc index 233b59e..56cc147 100755 --- a/src/kjandoc +++ b/src/kjandoc | |||
| @@ -1,184 +1,754 @@ | |||
| 1 | #!/usr/bin/env python3 | 1 | #!/usr/bin/env python3 |
| 2 | 2 | ||
| 3 | # merge pptx files using libreoffice slide rendering + python-pptx reconstruction. | 3 | import argparse |
| 4 | # creative approach: render each slide as a high-res image via libreoffice, | 4 | import hashlib |
| 5 | # then stitch them into one pptx. this preserves 100% of the visual formatting | ||
| 6 | # since we're working with exact raster snapshots of each slide. | ||
| 7 | |||
| 8 | import os | 5 | import os |
| 9 | import sys | 6 | import re |
| 10 | import glob | ||
| 11 | import shutil | 7 | import shutil |
| 12 | import subprocess | 8 | import subprocess |
| 9 | import sys | ||
| 13 | import tempfile | 10 | import tempfile |
| 11 | import zipfile | ||
| 12 | from copy import deepcopy | ||
| 14 | from pathlib import Path | 13 | from pathlib import Path |
| 15 | from PIL import Image | ||
| 16 | from pptx import Presentation | ||
| 17 | from pptx.util import Emu | ||
| 18 | 14 | ||
| 15 | from lxml import etree | ||
| 19 | 16 | ||
| 20 | DPI = 300 # high-res export | ||
| 21 | 17 | ||
| 18 | # OOXML | ||
| 22 | 19 | ||
| 23 | def pptx_to_images(pptx_path, output_dir): | 20 | _PKG_RELS = 'http://schemas.openxmlformats.org/package/2006/relationships' |
| 24 | # use libreoffice to convert pptx -> pdf, then pdf -> images via pdftoppm. | 21 | _PKG_CT = 'http://schemas.openxmlformats.org/package/2006/content-types' |
| 25 | # fallback: libreoffice direct png export if pdftoppm unavailable. | 22 | _NS_P = 'http://schemas.openxmlformats.org/presentationml/2006/main' |
| 26 | pptx_path = os.path.abspath(pptx_path) | 23 | _NS_R = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships' |
| 27 | pdf_path = os.path.join(output_dir, Path(pptx_path).stem + '.pdf') | ||
| 28 | 24 | ||
| 29 | # step 1: pptx -> pdf via libreoffice (preserves all formatting) | 25 | # relationship type URIs |
| 30 | subprocess.run([ | 26 | _RT_SLIDE = _NS_R + '/slide' |
| 31 | 'libreoffice', '--headless', '--convert-to', 'pdf', | 27 | _RT_LAYOUT = _NS_R + '/slideLayout' |
| 32 | '--outdir', output_dir, pptx_path | 28 | _RT_MASTER = _NS_R + '/slideMaster' |
| 33 | ], check=True, capture_output=True) | 29 | _RT_THEME = _NS_R + '/theme' |
| 30 | _RT_NOTES = _NS_R + '/notesSlide' | ||
| 31 | _RT_NOTES_MASTER = _NS_R + '/notesMaster' | ||
| 32 | _RT_IMAGE = _NS_R + '/image' | ||
| 34 | 33 | ||
| 35 | if not os.path.exists(pdf_path): | 34 | # relationship types that target binary media files |
| 36 | raise RuntimeError(f"libreoffice failed to produce {pdf_path}") | 35 | _MEDIA_TYPES = { |
| 36 | _RT_IMAGE, | ||
| 37 | _NS_R + '/audio', | ||
| 38 | _NS_R + '/video', | ||
| 39 | 'http://schemas.microsoft.com/office/2007/relationships/media', | ||
| 40 | _NS_R + '/oleObject', | ||
| 41 | } | ||
| 37 | 42 | ||
| 38 | # step 2: pdf -> png images | 43 | # content type strings for OOXML parts |
| 39 | # try pdftoppm first (from poppler-utils, higher quality) | 44 | _CT_SLIDE = 'application/vnd.openxmlformats-officedocument.presentationml.slide+xml' |
| 40 | image_prefix = os.path.join(output_dir, 'slide') | 45 | _CT_LAYOUT = 'application/vnd.openxmlformats-officedocument.presentationml.slideLayout+xml' |
| 46 | _CT_MASTER = 'application/vnd.openxmlformats-officedocument.presentationml.slideMaster+xml' | ||
| 47 | _CT_THEME = 'application/vnd.openxmlformats-officedocument.theme+xml' | ||
| 48 | _CT_NOTES = 'application/vnd.openxmlformats-officedocument.presentationml.notesSlide+xml' | ||
| 41 | 49 | ||
| 42 | if shutil.which('pdftoppm'): | 50 | # common media MIME types by extension |
| 43 | subprocess.run([ | 51 | _MIME = { |
| 44 | 'pdftoppm', '-png', '-r', str(DPI), | 52 | 'png': 'image/png', 'jpg': 'image/jpeg', 'jpeg': 'image/jpeg', |
| 45 | pdf_path, image_prefix | 53 | 'gif': 'image/gif', 'svg': 'image/svg+xml', 'emf': 'image/x-emf', |
| 46 | ], check=True, capture_output=True) | 54 | 'wmf': 'image/x-wmf', 'tiff': 'image/tiff', 'tif': 'image/tiff', |
| 55 | 'bmp': 'image/bmp', 'wdp': 'image/vnd.ms-photo', | ||
| 56 | 'mp3': 'audio/mpeg', 'wav': 'audio/wav', | ||
| 57 | 'mp4': 'video/mp4', 'm4v': 'video/mp4', | ||
| 58 | } | ||
| 59 | |||
| 60 | |||
| 61 | # utility functions | ||
| 62 | |||
| 63 | def _resolve(base, target): | ||
| 64 | """Resolve a relative relationship target against a base part path. | ||
| 65 | |||
| 66 | >>> _resolve('ppt/slides/slide1.xml', '../slideLayouts/slideLayout1.xml') | ||
| 67 | 'ppt/slideLayouts/slideLayout1.xml' | ||
| 68 | """ | ||
| 69 | if target.startswith('/'): | ||
| 70 | return target.lstrip('/') | ||
| 71 | return os.path.normpath(os.path.join(os.path.dirname(base), target)).replace('\\', '/') | ||
| 72 | |||
| 73 | |||
| 74 | def _relpath(from_part, to_part): | ||
| 75 | """Relative path from one part's directory to another part. | ||
| 76 | |||
| 77 | >>> _relpath('ppt/slides/slide1.xml', 'ppt/slideLayouts/slideLayout2.xml') | ||
| 78 | '../slideLayouts/slideLayout2.xml' | ||
| 79 | """ | ||
| 80 | return os.path.relpath(to_part, os.path.dirname(from_part)).replace('\\', '/') | ||
| 81 | |||
| 82 | |||
| 83 | def _max_num(names, pattern): | ||
| 84 | """Find highest number captured by group(1) of pattern across names.""" | ||
| 85 | mx, rx = 0, re.compile(pattern) | ||
| 86 | for name in names: | ||
| 87 | m = rx.search(name) | ||
| 88 | if m: | ||
| 89 | mx = max(mx, int(m.group(1))) | ||
| 90 | return mx | ||
| 91 | |||
| 92 | |||
| 93 | def _next_rid(rels_element): | ||
| 94 | """Next available rId number from a Relationships XML element.""" | ||
| 95 | mx = 0 | ||
| 96 | for rel in rels_element: | ||
| 97 | m = re.match(r'rId(\d+)', rel.get('Id', '')) | ||
| 98 | if m: | ||
| 99 | mx = max(mx, int(m.group(1))) | ||
| 100 | return mx + 1 | ||
| 101 | |||
| 102 | |||
| 103 | def _sha256(data): | ||
| 104 | return hashlib.sha256(data).hexdigest() | ||
| 105 | |||
| 106 | |||
| 107 | # in-memory PPTX package | ||
| 108 | class _Pkg: | ||
| 109 | """In-memory representation of a PPTX file (an OOXML ZIP package).""" | ||
| 110 | |||
| 111 | def __init__(self, path=None): | ||
| 112 | self.parts = {} # partname -> bytes | ||
| 113 | if path: | ||
| 114 | with zipfile.ZipFile(path) as zf: | ||
| 115 | for name in zf.namelist(): | ||
| 116 | self.parts[name] = zf.read(name) | ||
| 117 | |||
| 118 | def save(self, path): | ||
| 119 | with zipfile.ZipFile(path, 'w', zipfile.ZIP_DEFLATED) as zf: | ||
| 120 | for name in sorted(self.parts): | ||
| 121 | zf.writestr(name, self.parts[name]) | ||
| 122 | |||
| 123 | def xml(self, partname): | ||
| 124 | """Parse a part's bytes as XML, return lxml Element.""" | ||
| 125 | return etree.fromstring(self.parts[partname]) | ||
| 126 | |||
| 127 | def set_xml(self, partname, element): | ||
| 128 | """Serialize an lxml Element back into the package.""" | ||
| 129 | self.parts[partname] = etree.tostring( | ||
| 130 | element, xml_declaration=True, encoding='UTF-8', standalone=True | ||
| 131 | ) | ||
| 132 | |||
| 133 | def rels_path(self, partname): | ||
| 134 | """Return the .rels path for a given part.""" | ||
| 135 | d, b = os.path.dirname(partname), os.path.basename(partname) | ||
| 136 | return f'{d}/_rels/{b}.rels' | ||
| 137 | |||
| 138 | def get_rels(self, partname): | ||
| 139 | """Parse the .rels XML for a part, or return None if absent.""" | ||
| 140 | rp = self.rels_path(partname) | ||
| 141 | if rp in self.parts: | ||
| 142 | return self.xml(rp) | ||
| 143 | return None | ||
| 144 | |||
| 145 | def set_rels(self, partname, element): | ||
| 146 | """Write the .rels XML for a part.""" | ||
| 147 | rp = self.rels_path(partname) | ||
| 148 | self.set_xml(rp, element) | ||
| 149 | |||
| 150 | def find_rel(self, partname, rel_type): | ||
| 151 | """Find first internal relationship of a given type. | ||
| 152 | |||
| 153 | Returns (rId, resolved_target_partname) or None. | ||
| 154 | """ | ||
| 155 | root = self.get_rels(partname) | ||
| 156 | if root is None: | ||
| 157 | return None | ||
| 158 | for rel in root: | ||
| 159 | if rel.get('Type') == rel_type and rel.get('TargetMode') != 'External': | ||
| 160 | return rel.get('Id'), _resolve(partname, rel.get('Target')) | ||
| 161 | return None | ||
| 162 | |||
| 163 | |||
| 164 | # content type helpers | ||
| 165 | |||
| 166 | def _add_override(pkg, partname, content_type): | ||
| 167 | """Add an Override entry to [Content_Types].xml if not already present.""" | ||
| 168 | root = pkg.xml('[Content_Types].xml') | ||
| 169 | abs_name = '/' + partname.lstrip('/') | ||
| 170 | for o in root.findall(f'{{{_PKG_CT}}}Override'): | ||
| 171 | if o.get('PartName') == abs_name: | ||
| 172 | return | ||
| 173 | etree.SubElement(root, f'{{{_PKG_CT}}}Override', | ||
| 174 | PartName=abs_name, ContentType=content_type) | ||
| 175 | pkg.set_xml('[Content_Types].xml', root) | ||
| 176 | |||
| 177 | |||
| 178 | def _add_default_ext(pkg, ext, content_type): | ||
| 179 | """Add a Default entry for a file extension to [Content_Types].xml.""" | ||
| 180 | root = pkg.xml('[Content_Types].xml') | ||
| 181 | for d in root.findall(f'{{{_PKG_CT}}}Default'): | ||
| 182 | if d.get('Extension', '').lower() == ext.lower(): | ||
| 183 | return | ||
| 184 | etree.SubElement(root, f'{{{_PKG_CT}}}Default', | ||
| 185 | Extension=ext, ContentType=content_type) | ||
| 186 | pkg.set_xml('[Content_Types].xml', root) | ||
| 187 | |||
| 188 | |||
| 189 | def _copy_content_type(tgt, src, src_part, tgt_part): | ||
| 190 | """Copy a content type override from source package to target.""" | ||
| 191 | src_root = src.xml('[Content_Types].xml') | ||
| 192 | abs_src = '/' + src_part.lstrip('/') | ||
| 193 | for o in src_root.findall(f'{{{_PKG_CT}}}Override'): | ||
| 194 | if o.get('PartName') == abs_src: | ||
| 195 | _add_override(tgt, tgt_part, o.get('ContentType')) | ||
| 196 | return | ||
| 197 | |||
| 198 | |||
| 199 | # merge context | ||
| 200 | class _MergeCtx: | ||
| 201 | """Tracks source→target part name mappings during merge of one source file. | ||
| 202 | |||
| 203 | Prevents copying the same source part twice and lets us reuse the target | ||
| 204 | name when multiple slides share the same layout/master/theme/media. | ||
| 205 | """ | ||
| 206 | def __init__(self): | ||
| 207 | self.layouts = {} # src partname -> tgt partname | ||
| 208 | self.masters = {} # src partname -> tgt partname | ||
| 209 | self.themes = {} # src partname -> tgt partname | ||
| 210 | self.media = {} # src partname -> tgt partname | ||
| 211 | self.notes = {} # src partname -> tgt partname | ||
| 212 | self.generic = {} # src partname -> tgt partname | ||
| 213 | |||
| 214 | |||
| 215 | # part copy functions | ||
| 216 | def _copy_media(ctx, tgt, src, src_part): | ||
| 217 | """Copy a media file (image/audio/video) to target, deduplicating by hash.""" | ||
| 218 | if src_part in ctx.media: | ||
| 219 | return ctx.media[src_part] | ||
| 220 | if src_part not in src.parts: | ||
| 221 | return src_part | ||
| 222 | |||
| 223 | data = src.parts[src_part] | ||
| 224 | h = _sha256(data) | ||
| 225 | |||
| 226 | # deduplicate: reuse existing media in target with same content | ||
| 227 | for tgt_name, tgt_data in tgt.parts.items(): | ||
| 228 | if tgt_name.startswith('ppt/media/') and _sha256(tgt_data) == h: | ||
| 229 | ctx.media[src_part] = tgt_name | ||
| 230 | return tgt_name | ||
| 231 | |||
| 232 | ext = os.path.splitext(src_part)[1] | ||
| 233 | n = _max_num(tgt.parts, r'ppt/media/\w+?(\d+)') + 1 | ||
| 234 | tgt_part = f'ppt/media/media{n}{ext}' | ||
| 235 | tgt.parts[tgt_part] = data | ||
| 236 | |||
| 237 | # ensure file extension has a registered MIME type | ||
| 238 | ext_lower = ext.lstrip('.').lower() | ||
| 239 | if ext_lower in _MIME: | ||
| 240 | _add_default_ext(tgt, ext_lower, _MIME[ext_lower]) | ||
| 241 | |||
| 242 | ctx.media[src_part] = tgt_part | ||
| 243 | return tgt_part | ||
| 244 | |||
| 245 | |||
| 246 | def _copy_generic(ctx, tgt, src, src_part): | ||
| 247 | """Copy a generic part (tags, charts, embeddings, etc.) to target.""" | ||
| 248 | if src_part in ctx.generic: | ||
| 249 | return ctx.generic[src_part] | ||
| 250 | if src_part not in src.parts: | ||
| 251 | return src_part | ||
| 252 | |||
| 253 | # allocate a new name with incremented number | ||
| 254 | m = re.match(r'(.+?)(\d+)(\.\w+)$', src_part) | ||
| 255 | if m: | ||
| 256 | prefix, _, ext = m.groups() | ||
| 257 | n = _max_num(tgt.parts, re.escape(prefix) + r'(\d+)' + re.escape(ext)) + 1 | ||
| 258 | tgt_part = f'{prefix}{n}{ext}' | ||
| 259 | elif src_part in tgt.parts: | ||
| 260 | base, ext = os.path.splitext(src_part) | ||
| 261 | n = 1 | ||
| 262 | while f'{base}_{n}{ext}' in tgt.parts: | ||
| 263 | n += 1 | ||
| 264 | tgt_part = f'{base}_{n}{ext}' | ||
| 47 | else: | 265 | else: |
| 48 | # fallback: use libreoffice to export as images directly | 266 | tgt_part = src_part |
| 49 | # this works but pdftoppm gives better quality | ||
| 50 | subprocess.run([ | ||
| 51 | 'libreoffice', '--headless', '--convert-to', 'png', | ||
| 52 | '--outdir', output_dir, pdf_path | ||
| 53 | ], check=True, capture_output=True) | ||
| 54 | 267 | ||
| 55 | # collect and sort image files | 268 | tgt.parts[tgt_part] = src.parts[src_part] |
| 56 | images = sorted(glob.glob(os.path.join(output_dir, 'slide-*.png'))) | ||
| 57 | if not images: | ||
| 58 | images = sorted(glob.glob(os.path.join(output_dir, '*.png'))) | ||
| 59 | 269 | ||
| 60 | if not images: | 270 | # register early to prevent infinite recursion from circular rels |
| 61 | raise RuntimeError("no slide images produced") | 271 | ctx.generic[src_part] = tgt_part |
| 62 | 272 | ||
| 63 | return images | 273 | _copy_content_type(tgt, src, src_part, tgt_part) |
| 274 | _copy_simple_rels(ctx, tgt, src, src_part, tgt_part) | ||
| 64 | 275 | ||
| 276 | return tgt_part | ||
| 65 | 277 | ||
| 66 | def images_to_pptx(image_groups, output_path, slide_width_emu=9144000, slide_height_emu=6858000): | ||
| 67 | # build a pptx from slide images, one image per slide filling the entire area. | ||
| 68 | prs = Presentation() | ||
| 69 | prs.slide_width = slide_width_emu | ||
| 70 | prs.slide_height = slide_height_emu | ||
| 71 | 278 | ||
| 72 | # use blank layout (index 6 is typically blank) | 279 | def _copy_simple_rels(ctx, tgt, src, src_part, tgt_part): |
| 73 | blank_layout = None | 280 | """Copy relationships for a part, handling media and generic sub-parts.""" |
| 74 | for layout in prs.slide_layouts: | 281 | src_rels = src.get_rels(src_part) |
| 75 | if layout.name == 'Blank': | 282 | if src_rels is None: |
| 76 | blank_layout = layout | 283 | return |
| 77 | break | ||
| 78 | if blank_layout is None: | ||
| 79 | blank_layout = prs.slide_layouts[6] if len(prs.slide_layouts) > 6 else prs.slide_layouts[0] | ||
| 80 | 284 | ||
| 81 | total = 0 | 285 | new_rels = etree.Element(f'{{{_PKG_RELS}}}Relationships') |
| 82 | for label, images in image_groups: | 286 | for rel in src_rels: |
| 83 | for img_path in images: | 287 | rtype = rel.get('Type') |
| 84 | slide = prs.slides.add_slide(blank_layout) | 288 | |
| 85 | 289 | if rel.get('TargetMode') == 'External': | |
| 86 | # remove any placeholder shapes from blank layout | 290 | new_rels.append(deepcopy(rel)) |
| 87 | for ph in list(slide.placeholders): | 291 | continue |
| 88 | sp = ph._element | 292 | |
| 89 | sp.getparent().remove(sp) | 293 | src_abs = _resolve(src_part, rel.get('Target')) |
| 90 | 294 | r = deepcopy(rel) | |
| 91 | # add image covering the full slide | 295 | |
| 92 | slide.shapes.add_picture( | 296 | if rtype in _MEDIA_TYPES and src_abs in src.parts: |
| 93 | img_path, | 297 | r.set('Target', _relpath(tgt_part, _copy_media(ctx, tgt, src, src_abs))) |
| 94 | left=0, | 298 | elif src_abs in src.parts: |
| 95 | top=0, | 299 | r.set('Target', _relpath(tgt_part, _copy_generic(ctx, tgt, src, src_abs))) |
| 96 | width=slide_width_emu, | 300 | |
| 97 | height=slide_height_emu | 301 | new_rels.append(r) |
| 302 | |||
| 303 | tgt.set_rels(tgt_part, new_rels) | ||
| 304 | |||
| 305 | |||
| 306 | def _copy_theme(ctx, tgt, src, src_part): | ||
| 307 | """Copy a theme part (and any media it references) to target.""" | ||
| 308 | if src_part in ctx.themes: | ||
| 309 | return ctx.themes[src_part] | ||
| 310 | if src_part not in src.parts: | ||
| 311 | return src_part | ||
| 312 | |||
| 313 | n = _max_num(tgt.parts, r'ppt/theme/theme(\d+)\.xml') + 1 | ||
| 314 | tgt_part = f'ppt/theme/theme{n}.xml' | ||
| 315 | tgt.parts[tgt_part] = src.parts[src_part] | ||
| 316 | |||
| 317 | ctx.themes[src_part] = tgt_part | ||
| 318 | |||
| 319 | _copy_simple_rels(ctx, tgt, src, src_part, tgt_part) | ||
| 320 | _add_override(tgt, tgt_part, _CT_THEME) | ||
| 321 | |||
| 322 | return tgt_part | ||
| 323 | |||
| 324 | |||
| 325 | def _copy_layout_for_master(ctx, tgt, src, src_part, tgt_master): | ||
| 326 | """Copy a layout as part of copying its parent master (no master recursion).""" | ||
| 327 | if src_part in ctx.layouts: | ||
| 328 | return ctx.layouts[src_part] | ||
| 329 | if src_part not in src.parts: | ||
| 330 | return src_part | ||
| 331 | |||
| 332 | n = _max_num(tgt.parts, r'ppt/slideLayouts/slideLayout(\d+)\.xml') + 1 | ||
| 333 | tgt_part = f'ppt/slideLayouts/slideLayout{n}.xml' | ||
| 334 | tgt.parts[tgt_part] = src.parts[src_part] | ||
| 335 | ctx.layouts[src_part] = tgt_part | ||
| 336 | |||
| 337 | src_rels = src.get_rels(src_part) | ||
| 338 | if src_rels is not None: | ||
| 339 | new_rels = etree.Element(f'{{{_PKG_RELS}}}Relationships') | ||
| 340 | for rel in src_rels: | ||
| 341 | rtype = rel.get('Type') | ||
| 342 | if rel.get('TargetMode') == 'External': | ||
| 343 | new_rels.append(deepcopy(rel)) | ||
| 344 | continue | ||
| 345 | src_abs = _resolve(src_part, rel.get('Target')) | ||
| 346 | r = deepcopy(rel) | ||
| 347 | if rtype == _RT_MASTER and tgt_master: | ||
| 348 | r.set('Target', _relpath(tgt_part, tgt_master)) | ||
| 349 | elif rtype in _MEDIA_TYPES and src_abs in src.parts: | ||
| 350 | r.set('Target', _relpath(tgt_part, _copy_media(ctx, tgt, src, src_abs))) | ||
| 351 | elif src_abs in src.parts: | ||
| 352 | r.set('Target', _relpath(tgt_part, _copy_generic(ctx, tgt, src, src_abs))) | ||
| 353 | new_rels.append(r) | ||
| 354 | tgt.set_rels(tgt_part, new_rels) | ||
| 355 | |||
| 356 | _add_override(tgt, tgt_part, _CT_LAYOUT) | ||
| 357 | return tgt_part | ||
| 358 | |||
| 359 | |||
| 360 | def _max_layout_id(pkg): | ||
| 361 | """Find the maximum sldLayoutId 'id' across all masters in the package.""" | ||
| 362 | max_id = 2147483648 | ||
| 363 | for name in pkg.parts: | ||
| 364 | if (name.startswith('ppt/slideMasters/') and name.endswith('.xml') | ||
| 365 | and '/_rels/' not in name): | ||
| 366 | try: | ||
| 367 | root = pkg.xml(name) | ||
| 368 | except Exception: | ||
| 369 | continue | ||
| 370 | for entry in root.iter(f'{{{_NS_P}}}sldLayoutId'): | ||
| 371 | try: | ||
| 372 | max_id = max(max_id, int(entry.get('id', 0))) | ||
| 373 | except (ValueError, TypeError): | ||
| 374 | pass | ||
| 375 | return max_id | ||
| 376 | |||
| 377 | |||
| 378 | def _copy_master(ctx, tgt, src, src_part, pres_xml, pres_rels): | ||
| 379 | """Copy a slide master, all its layouts, theme, and media to target. | ||
| 380 | |||
| 381 | Copies every layout the master references so the master's | ||
| 382 | sldLayoutIdLst stays consistent. Layout IDs are reassigned | ||
| 383 | to be globally unique. | ||
| 384 | """ | ||
| 385 | if src_part in ctx.masters: | ||
| 386 | return ctx.masters[src_part] | ||
| 387 | if src_part not in src.parts: | ||
| 388 | return src_part | ||
| 389 | |||
| 390 | # copy the master's theme first | ||
| 391 | theme_info = src.find_rel(src_part, _RT_THEME) | ||
| 392 | tgt_theme = _copy_theme(ctx, tgt, src, theme_info[1]) if theme_info else None | ||
| 393 | |||
| 394 | # allocate new master name | ||
| 395 | n = _max_num(tgt.parts, r'ppt/slideMasters/slideMaster(\d+)\.xml') + 1 | ||
| 396 | tgt_part = f'ppt/slideMasters/slideMaster{n}.xml' | ||
| 397 | tgt.parts[tgt_part] = src.parts[src_part] | ||
| 398 | |||
| 399 | # register early to prevent re-entry | ||
| 400 | ctx.masters[src_part] = tgt_part | ||
| 401 | |||
| 402 | # rebuild master rels: copy ALL layouts, remap theme + media | ||
| 403 | # preserving original rIds so the master XML body stays consistent | ||
| 404 | src_rels = src.get_rels(src_part) | ||
| 405 | if src_rels is not None: | ||
| 406 | new_rels = etree.Element(f'{{{_PKG_RELS}}}Relationships') | ||
| 407 | for rel in src_rels: | ||
| 408 | rtype = rel.get('Type') | ||
| 409 | |||
| 410 | if rel.get('TargetMode') == 'External': | ||
| 411 | new_rels.append(deepcopy(rel)) | ||
| 412 | continue | ||
| 413 | |||
| 414 | src_abs = _resolve(src_part, rel.get('Target')) | ||
| 415 | r = deepcopy(rel) | ||
| 416 | |||
| 417 | if rtype == _RT_LAYOUT: | ||
| 418 | tgt_layout = _copy_layout_for_master(ctx, tgt, src, src_abs, tgt_part) | ||
| 419 | r.set('Target', _relpath(tgt_part, tgt_layout)) | ||
| 420 | elif rtype == _RT_THEME and tgt_theme: | ||
| 421 | r.set('Target', _relpath(tgt_part, tgt_theme)) | ||
| 422 | elif rtype in _MEDIA_TYPES and src_abs in src.parts: | ||
| 423 | r.set('Target', _relpath(tgt_part, _copy_media(ctx, tgt, src, src_abs))) | ||
| 424 | elif src_abs in src.parts: | ||
| 425 | r.set('Target', _relpath(tgt_part, _copy_generic(ctx, tgt, src, src_abs))) | ||
| 426 | |||
| 427 | new_rels.append(r) | ||
| 428 | |||
| 429 | tgt.set_rels(tgt_part, new_rels) | ||
| 430 | |||
| 431 | # reassign sldLayoutId 'id' values to be globally unique | ||
| 432 | max_lid = _max_layout_id(tgt) | ||
| 433 | master_xml = tgt.xml(tgt_part) | ||
| 434 | for entry in master_xml.iter(f'{{{_NS_P}}}sldLayoutId'): | ||
| 435 | max_lid += 1 | ||
| 436 | entry.set('id', str(max_lid)) | ||
| 437 | tgt.set_xml(tgt_part, master_xml) | ||
| 438 | |||
| 439 | _add_override(tgt, tgt_part, _CT_MASTER) | ||
| 440 | |||
| 441 | # add master reference to presentation.xml | ||
| 442 | master_list = pres_xml.find(f'{{{_NS_P}}}sldMasterIdLst') | ||
| 443 | if master_list is None: | ||
| 444 | sld_list = pres_xml.find(f'{{{_NS_P}}}sldIdLst') | ||
| 445 | idx = list(pres_xml).index(sld_list) if sld_list is not None else 0 | ||
| 446 | master_list = etree.Element(f'{{{_NS_P}}}sldMasterIdLst') | ||
| 447 | pres_xml.insert(idx, master_list) | ||
| 448 | |||
| 449 | max_mid = 2147483647 | ||
| 450 | for elem in master_list: | ||
| 451 | max_mid = max(max_mid, int(elem.get('id', 0))) | ||
| 452 | |||
| 453 | rid = f'rId{_next_rid(pres_rels)}' | ||
| 454 | etree.SubElement(pres_rels, f'{{{_PKG_RELS}}}Relationship', | ||
| 455 | Id=rid, Type=_RT_MASTER, | ||
| 456 | Target=f'slideMasters/slideMaster{n}.xml') | ||
| 457 | |||
| 458 | me = etree.SubElement(master_list, f'{{{_NS_P}}}sldMasterId') | ||
| 459 | me.set('id', str(max_mid + 1)) | ||
| 460 | me.set(f'{{{_NS_R}}}id', rid) | ||
| 461 | |||
| 462 | return tgt_part | ||
| 463 | |||
| 464 | |||
| 465 | def _copy_layout(ctx, tgt, src, src_part, pres_xml, pres_rels): | ||
| 466 | """Copy a slide layout and its master+theme chain to target. | ||
| 467 | |||
| 468 | _copy_master copies ALL of a master's layouts, so after calling it | ||
| 469 | the requested layout will normally already be in ctx.layouts. | ||
| 470 | """ | ||
| 471 | if src_part in ctx.layouts: | ||
| 472 | return ctx.layouts[src_part] | ||
| 473 | if src_part not in src.parts: | ||
| 474 | return src_part | ||
| 475 | |||
| 476 | # copying the master will also copy all its layouts (including this one) | ||
| 477 | master_info = src.find_rel(src_part, _RT_MASTER) | ||
| 478 | if master_info: | ||
| 479 | _copy_master(ctx, tgt, src, master_info[1], pres_xml, pres_rels) | ||
| 480 | |||
| 481 | # master should have copied this layout already | ||
| 482 | if src_part in ctx.layouts: | ||
| 483 | return ctx.layouts[src_part] | ||
| 484 | |||
| 485 | # fallback: layout without a master (unusual) — copy directly | ||
| 486 | tgt_master = ctx.masters.get(master_info[1]) if master_info else None | ||
| 487 | return _copy_layout_for_master(ctx, tgt, src, src_part, tgt_master) | ||
| 488 | |||
| 489 | |||
| 490 | def _copy_notes(ctx, tgt, src, src_part, tgt_slide_part): | ||
| 491 | """Copy a notes slide to target, updating its slide back-reference.""" | ||
| 492 | if src_part in ctx.notes: | ||
| 493 | return ctx.notes[src_part] | ||
| 494 | if src_part not in src.parts: | ||
| 495 | return src_part | ||
| 496 | |||
| 497 | n = _max_num(tgt.parts, r'ppt/notesSlides/notesSlide(\d+)\.xml') + 1 | ||
| 498 | tgt_part = f'ppt/notesSlides/notesSlide{n}.xml' | ||
| 499 | tgt.parts[tgt_part] = src.parts[src_part] | ||
| 500 | |||
| 501 | ctx.notes[src_part] = tgt_part | ||
| 502 | |||
| 503 | # rebuild notes rels: update slide back-ref, remap media, handle notesMaster | ||
| 504 | src_rels = src.get_rels(src_part) | ||
| 505 | if src_rels is not None: | ||
| 506 | new_rels = etree.Element(f'{{{_PKG_RELS}}}Relationships') | ||
| 507 | for rel in src_rels: | ||
| 508 | rtype = rel.get('Type') | ||
| 509 | |||
| 510 | if rel.get('TargetMode') == 'External': | ||
| 511 | new_rels.append(deepcopy(rel)) | ||
| 512 | continue | ||
| 513 | |||
| 514 | src_abs = _resolve(src_part, rel.get('Target')) | ||
| 515 | r = deepcopy(rel) | ||
| 516 | |||
| 517 | if rtype == _RT_SLIDE: | ||
| 518 | # update to point to the new slide | ||
| 519 | r.set('Target', _relpath(tgt_part, tgt_slide_part)) | ||
| 520 | elif rtype == _RT_NOTES_MASTER: | ||
| 521 | # point to target's notesMaster (keep path if it exists in target) | ||
| 522 | if src_abs in tgt.parts: | ||
| 523 | r.set('Target', _relpath(tgt_part, src_abs)) | ||
| 524 | else: | ||
| 525 | # find any notesMaster in target | ||
| 526 | nm = [k for k in tgt.parts | ||
| 527 | if k.startswith('ppt/notesMasters/') | ||
| 528 | and k.endswith('.xml') | ||
| 529 | and '/_rels/' not in k] | ||
| 530 | if nm: | ||
| 531 | r.set('Target', _relpath(tgt_part, nm[0])) | ||
| 532 | else: | ||
| 533 | continue # drop ref if no notesMaster available | ||
| 534 | elif rtype in _MEDIA_TYPES and src_abs in src.parts: | ||
| 535 | r.set('Target', _relpath(tgt_part, _copy_media(ctx, tgt, src, src_abs))) | ||
| 536 | elif src_abs in src.parts: | ||
| 537 | r.set('Target', _relpath(tgt_part, _copy_generic(ctx, tgt, src, src_abs))) | ||
| 538 | |||
| 539 | new_rels.append(r) | ||
| 540 | |||
| 541 | tgt.set_rels(tgt_part, new_rels) | ||
| 542 | |||
| 543 | _add_override(tgt, tgt_part, _CT_NOTES) | ||
| 544 | |||
| 545 | return tgt_part | ||
| 546 | |||
| 547 | |||
| 548 | # main merge logic | ||
| 549 | |||
def _merge_source_into(tgt, src, label):
    """Append every slide of the ``src`` package to the ``tgt`` package.

    Slides are taken in the order of the source's ``sldIdLst``. Each slide is
    stored under the next free ``ppt/slides/slideN.xml`` name, its
    relationships are rebuilt against the target (layout, notes, media and
    any other internal part go through the ``_copy_*`` helpers; external
    relationships are kept as they are), and it is registered in the target's
    ``presentation.xml`` and presentation rels with a fresh slide id and rId.

    ``label`` only appears in diagnostics written to stderr.

    Returns the number of slides copied; 0 when the source has no
    presentation rels or no slides.
    """
    ctx = _MergeCtx()
    pres_part = 'ppt/presentation.xml'
    sld_lst_tag = f'{{{_NS_P}}}sldIdLst'
    sld_id_tag = f'{{{_NS_P}}}sldId'
    rels_tag = f'{{{_PKG_RELS}}}Relationships'

    # --- source side: which parts are slides, in presentation order ---
    src_pres = src.xml(pres_part)
    src_pres_rels = src.get_rels(pres_part)
    if src_pres_rels is None:
        print(f' [!] no presentation rels in {label}', file=sys.stderr)
        return 0

    # rId -> (relationship type, resolved part name); external rels are skipped
    internal_rels = {
        rel.get('Id'): (rel.get('Type'), _resolve(pres_part, rel.get('Target')))
        for rel in src_pres_rels
        if rel.get('TargetMode') != 'External'
    }

    src_sld_list = src_pres.find(sld_lst_tag)
    if src_sld_list is None:
        print(f' [!] no slides in {label}', file=sys.stderr)
        return 0

    slide_parts = []
    for sld_id in src_sld_list.findall(sld_id_tag):
        entry = internal_rels.get(sld_id.get(f'{{{_NS_R}}}id'))
        if entry is not None and entry[0] == _RT_SLIDE:
            slide_parts.append(entry[1])

    if not slide_parts:
        print(f' [!] no slides in {label}', file=sys.stderr)
        return 0

    # --- target side: presentation.xml, its rels and its slide id list ---
    tgt_pres = tgt.xml(pres_part)
    tgt_pres_rels = tgt.get_rels(pres_part)
    if tgt_pres_rels is None:
        tgt_pres_rels = etree.Element(rels_tag)

    tgt_sld_list = tgt_pres.find(sld_lst_tag)
    if tgt_sld_list is None:
        tgt_sld_list = etree.SubElement(tgt_pres, sld_lst_tag)

    # highest slide id already used in the target, floored at 256
    last_sld_id = max(
        [256] + [int(e.get('id', 0)) for e in tgt_sld_list.findall(sld_id_tag)]
    )

    # --- copy the slides one by one ---
    copied_count = 0
    for part in slide_parts:
        if part not in src.parts:
            print(f' [!] slide part missing: {part}', file=sys.stderr)
            continue

        # next free slide number; recomputed each time because the previous
        # iteration has just added a slide part to the target
        slide_no = _max_num(tgt.parts, r'ppt/slides/slide(\d+)\.xml') + 1
        tgt_slide = f'ppt/slides/slide{slide_no}.xml'
        tgt.parts[tgt_slide] = src.parts[part]

        old_rels = src.get_rels(part)
        if old_rels is not None:
            rebuilt = etree.Element(rels_tag)
            for rel in old_rels:
                clone = deepcopy(rel)

                # external targets are carried over untouched
                if rel.get('TargetMode') == 'External':
                    rebuilt.append(clone)
                    continue

                kind = rel.get('Type')
                src_abs = _resolve(part, rel.get('Target'))

                if kind == _RT_LAYOUT:
                    dest = _copy_layout(ctx, tgt, src, src_abs, tgt_pres, tgt_pres_rels)
                elif kind == _RT_NOTES:
                    dest = _copy_notes(ctx, tgt, src, src_abs, tgt_slide)
                elif src_abs not in src.parts:
                    # nothing to copy: keep the relationship exactly as it was
                    rebuilt.append(clone)
                    continue
                elif kind in _MEDIA_TYPES:
                    dest = _copy_media(ctx, tgt, src, src_abs)
                else:
                    dest = _copy_generic(ctx, tgt, src, src_abs)

                clone.set('Target', _relpath(tgt_slide, dest))
                rebuilt.append(clone)

            tgt.set_rels(tgt_slide, rebuilt)

        # register the new slide in presentation.xml and its rels
        last_sld_id += 1
        new_rid = f'rId{_next_rid(tgt_pres_rels)}'
        etree.SubElement(
            tgt_pres_rels, f'{{{_PKG_RELS}}}Relationship',
            Id=new_rid, Type=_RT_SLIDE,
            Target=f'slides/slide{slide_no}.xml',
        )

        sld_entry = etree.SubElement(tgt_sld_list, sld_id_tag)
        sld_entry.set('id', str(last_sld_id))
        sld_entry.set(f'{{{_NS_R}}}id', new_rid)

        _add_override(tgt, tgt_slide, _CT_SLIDE)
        copied_count += 1

    # write the updated presentation.xml and rels back into the package
    tgt.set_xml(pres_part, tgt_pres)
    tgt.set_rels(pres_part, tgt_pres_rels)

    return copied_count
| 661 | |||
| 662 | |||
def _lo_normalize(pptx_path):
    """Re-save a PPTX through LibreOffice to fix any OOXML spec issues.

    LibreOffice reads the file and writes it back out as PPTX; the result
    replaces ``pptx_path`` in place. The intent is to avoid the intermittent
    "repair" prompts in Microsoft PowerPoint caused by subtle structural
    issues (element ordering, stale refs, etc.).

    The converter is looked up on PATH as ``libreoffice`` first and then as
    ``soffice``, so installs that only expose the latter are still used.
    When neither is found a warning is printed to stderr and the file is
    left untouched.

    Args:
        pptx_path: path of the presentation to rewrite in place.

    Raises:
        subprocess.CalledProcessError: if the converter exits non-zero.
    """
    converter = shutil.which('libreoffice') or shutil.which('soffice')
    if not converter:
        print(' [!] libreoffice not found, skipping normalization', file=sys.stderr)
        return

    # convert into a scratch directory so a failed run never clobbers the input
    tmpdir = tempfile.mkdtemp(prefix='kjandoc_norm_')
    try:
        abs_path = os.path.abspath(pptx_path)
        subprocess.run([
            converter, '--headless', '--convert-to', 'pptx',
            '--outdir', tmpdir, abs_path
        ], check=True, capture_output=True)

        # expected output: same file name as the input, inside tmpdir
        normalized = os.path.join(tmpdir, os.path.basename(abs_path))
        if os.path.exists(normalized):
            shutil.move(normalized, abs_path)
        else:
            # the name can differ (e.g. extension casing, or an input without
            # a .pptx suffix): take the first .pptx found in the scratch dir
            for f in os.listdir(tmpdir):
                if f.lower().endswith('.pptx'):
                    shutil.move(os.path.join(tmpdir, f), abs_path)
                    break
    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)
| 694 | |||
| 695 | |||
def merge_presentations(input_files, output_path):
    """Merge several PPTX files into one, using the first file as the base.

    The first input is loaded as the target package; the slides of every
    following input are merged into it in order. The result is saved to
    ``output_path`` and then passed through ``_lo_normalize``.

    Args:
        input_files: sequence of PPTX paths; the first one is the base deck.
        output_path: where the merged presentation is written.

    Returns:
        ``output_path``.

    Raises:
        ValueError: if ``input_files`` is empty.
    """
    if not input_files:
        raise ValueError('no input files')

    print(f'[*] base: {Path(input_files[0]).name}')
    tgt = _Pkg(input_files[0])

    # count the base deck's slides now, from the package already in memory.
    # re-reading input_files[0] after saving would count the merged result
    # whenever output_path is the same file as the first input.
    base_sld_list = tgt.xml('ppt/presentation.xml').find(f'{{{_NS_P}}}sldIdLst')
    total = len(base_sld_list) if base_sld_list is not None else 0

    for pptx_path in input_files[1:]:
        label = Path(pptx_path).name
        print(f'[*] merging: {label}')
        src = _Pkg(pptx_path)
        count = _merge_source_into(tgt, src, label)
        print(f' -> {count} slides merged')
        total += count

    tgt.save(output_path)

    # normalize through libreoffice to ensure full OOXML compliance
    print('[*] normalizing...')
    _lo_normalize(output_path)

    print(f'[+] merged {total} slides from {len(input_files)} presentations -> {output_path}')
    return output_path
| 727 | |||
| 728 | |||
| 729 | # CLI | ||
| 730 | |||
if __name__ == '__main__':
    # command-line entry point: parse arguments, vet the inputs, run the merge
    parser = argparse.ArgumentParser(
        description='merge PPTX presentations preserving full editability',
        usage='%(prog)s input1.pptx [input2.pptx ...] -o output.pptx',
    )
    parser.add_argument(
        'inputs',
        metavar='INPUT',
        nargs='+',
        help='input PPTX files to merge',
    )
    parser.add_argument(
        '-o', '--output',
        default='merged.pptx',
        help='output filename (default: merged.pptx)',
    )
    args = parser.parse_args()

    # stop at the first input that is missing or lacks a .pptx suffix
    for candidate in args.inputs:
        problem = None
        if not os.path.exists(candidate):
            problem = f'[!] error: file not found: {candidate}'
        elif not candidate.lower().endswith('.pptx'):
            problem = f'[!] error: not a pptx file: {candidate}'

        if problem is not None:
            print(problem, file=sys.stderr)
            sys.exit(1)

    merge_presentations(args.inputs, args.output)
diff --git a/src/requirements.txt b/src/requirements.txt index b9c4f5c..8221c37 100644 --- a/src/requirements.txt +++ b/src/requirements.txt | |||
| @@ -1,5 +1 @@ | |||
| 1 | lxml==6.0.2 | lxml==6.0.2 | |
| 2 | pillow==12.1.1 | ||
| 3 | python-pptx==1.0.2 | ||
| 4 | typing_extensions==4.15.0 | ||
| 5 | xlsxwriter==3.2.9 \ No newline at end of file | ||
