summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorkj_sh6042026-02-28 14:52:18 -0500
committerkj_sh6042026-02-28 14:52:18 -0500
commitf701bbcb3259cef245d7f45696406543048aa741 (patch)
tree6a7db0d4615362af7809bd537439a83163da5ab3
parentfafc3e29832779b5ccbea8fd21dc9fd5af67de38 (diff)
refactor: use `feat/same-template-concat` rewrite as main "binary"
OOXML is vibe-coded but this was heavily manually reviewed
-rwxr-xr-xsrc/kjandoc842
1 files changed, 706 insertions, 136 deletions
diff --git a/src/kjandoc b/src/kjandoc
index 233b59e..56cc147 100755
--- a/src/kjandoc
+++ b/src/kjandoc
@@ -1,184 +1,754 @@
1#!/usr/bin/env python3 1#!/usr/bin/env python3
2 2
3# merge pptx files using libreoffice slide rendering + python-pptx reconstruction. 3import argparse
4# creative approach: render each slide as a high-res image via libreoffice, 4import hashlib
5# then stitch them into one pptx. this preserves 100% of the visual formatting
6# since we're working with exact raster snapshots of each slide.
7
8import os 5import os
9import sys 6import re
10import glob
11import shutil 7import shutil
12import subprocess 8import subprocess
9import sys
13import tempfile 10import tempfile
11import zipfile
12from copy import deepcopy
14from pathlib import Path 13from pathlib import Path
15from PIL import Image
16from pptx import Presentation
17from pptx.util import Emu
18 14
15from lxml import etree
19 16
20DPI = 300 # high-res export
21 17
18# OOXML
22 19
23def pptx_to_images(pptx_path, output_dir): 20_PKG_RELS = 'http://schemas.openxmlformats.org/package/2006/relationships'
24 # use libreoffice to convert pptx -> pdf, then pdf -> images via pdftoppm. 21_PKG_CT = 'http://schemas.openxmlformats.org/package/2006/content-types'
25 # fallback: libreoffice direct png export if pdftoppm unavailable. 22_NS_P = 'http://schemas.openxmlformats.org/presentationml/2006/main'
26 pptx_path = os.path.abspath(pptx_path) 23_NS_R = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
27 pdf_path = os.path.join(output_dir, Path(pptx_path).stem + '.pdf')
28 24
29 # step 1: pptx -> pdf via libreoffice (preserves all formatting) 25# relationship type URIs
30 subprocess.run([ 26_RT_SLIDE = _NS_R + '/slide'
31 'libreoffice', '--headless', '--convert-to', 'pdf', 27_RT_LAYOUT = _NS_R + '/slideLayout'
32 '--outdir', output_dir, pptx_path 28_RT_MASTER = _NS_R + '/slideMaster'
33 ], check=True, capture_output=True) 29_RT_THEME = _NS_R + '/theme'
30_RT_NOTES = _NS_R + '/notesSlide'
31_RT_NOTES_MASTER = _NS_R + '/notesMaster'
32_RT_IMAGE = _NS_R + '/image'
34 33
35 if not os.path.exists(pdf_path): 34# relationship types that target binary media files
36 raise RuntimeError(f"libreoffice failed to produce {pdf_path}") 35_MEDIA_TYPES = {
36 _RT_IMAGE,
37 _NS_R + '/audio',
38 _NS_R + '/video',
39 'http://schemas.microsoft.com/office/2007/relationships/media',
40 _NS_R + '/oleObject',
41}
37 42
38 # step 2: pdf -> png images 43# content type strings for OOXML parts
39 # try pdftoppm first (from poppler-utils, higher quality) 44_CT_SLIDE = 'application/vnd.openxmlformats-officedocument.presentationml.slide+xml'
40 image_prefix = os.path.join(output_dir, 'slide') 45_CT_LAYOUT = 'application/vnd.openxmlformats-officedocument.presentationml.slideLayout+xml'
46_CT_MASTER = 'application/vnd.openxmlformats-officedocument.presentationml.slideMaster+xml'
47_CT_THEME = 'application/vnd.openxmlformats-officedocument.theme+xml'
48_CT_NOTES = 'application/vnd.openxmlformats-officedocument.presentationml.notesSlide+xml'
41 49
42 if shutil.which('pdftoppm'): 50# common media MIME types by extension
43 subprocess.run([ 51_MIME = {
44 'pdftoppm', '-png', '-r', str(DPI), 52 'png': 'image/png', 'jpg': 'image/jpeg', 'jpeg': 'image/jpeg',
45 pdf_path, image_prefix 53 'gif': 'image/gif', 'svg': 'image/svg+xml', 'emf': 'image/x-emf',
46 ], check=True, capture_output=True) 54 'wmf': 'image/x-wmf', 'tiff': 'image/tiff', 'tif': 'image/tiff',
55 'bmp': 'image/bmp', 'wdp': 'image/vnd.ms-photo',
56 'mp3': 'audio/mpeg', 'wav': 'audio/wav',
57 'mp4': 'video/mp4', 'm4v': 'video/mp4',
58}
59
60
61# utility functions
62
63def _resolve(base, target):
64 """Resolve a relative relationship target against a base part path.
65
66 >>> _resolve('ppt/slides/slide1.xml', '../slideLayouts/slideLayout1.xml')
67 'ppt/slideLayouts/slideLayout1.xml'
68 """
69 if target.startswith('/'):
70 return target.lstrip('/')
71 return os.path.normpath(os.path.join(os.path.dirname(base), target)).replace('\\', '/')
72
73
74def _relpath(from_part, to_part):
75 """Relative path from one part's directory to another part.
76
77 >>> _relpath('ppt/slides/slide1.xml', 'ppt/slideLayouts/slideLayout2.xml')
78 '../slideLayouts/slideLayout2.xml'
79 """
80 return os.path.relpath(to_part, os.path.dirname(from_part)).replace('\\', '/')
81
82
83def _max_num(names, pattern):
84 """Find highest number captured by group(1) of pattern across names."""
85 mx, rx = 0, re.compile(pattern)
86 for name in names:
87 m = rx.search(name)
88 if m:
89 mx = max(mx, int(m.group(1)))
90 return mx
91
92
93def _next_rid(rels_element):
94 """Next available rId number from a Relationships XML element."""
95 mx = 0
96 for rel in rels_element:
97 m = re.match(r'rId(\d+)', rel.get('Id', ''))
98 if m:
99 mx = max(mx, int(m.group(1)))
100 return mx + 1
101
102
103def _sha256(data):
104 return hashlib.sha256(data).hexdigest()
105
106
107# in-memory PPTX package
108class _Pkg:
109 """In-memory representation of a PPTX file (an OOXML ZIP package)."""
110
111 def __init__(self, path=None):
112 self.parts = {} # partname -> bytes
113 if path:
114 with zipfile.ZipFile(path) as zf:
115 for name in zf.namelist():
116 self.parts[name] = zf.read(name)
117
118 def save(self, path):
119 with zipfile.ZipFile(path, 'w', zipfile.ZIP_DEFLATED) as zf:
120 for name in sorted(self.parts):
121 zf.writestr(name, self.parts[name])
122
123 def xml(self, partname):
124 """Parse a part's bytes as XML, return lxml Element."""
125 return etree.fromstring(self.parts[partname])
126
127 def set_xml(self, partname, element):
128 """Serialize an lxml Element back into the package."""
129 self.parts[partname] = etree.tostring(
130 element, xml_declaration=True, encoding='UTF-8', standalone=True
131 )
132
133 def rels_path(self, partname):
134 """Return the .rels path for a given part."""
135 d, b = os.path.dirname(partname), os.path.basename(partname)
136 return f'{d}/_rels/{b}.rels'
137
138 def get_rels(self, partname):
139 """Parse the .rels XML for a part, or return None if absent."""
140 rp = self.rels_path(partname)
141 if rp in self.parts:
142 return self.xml(rp)
143 return None
144
145 def set_rels(self, partname, element):
146 """Write the .rels XML for a part."""
147 rp = self.rels_path(partname)
148 self.set_xml(rp, element)
149
150 def find_rel(self, partname, rel_type):
151 """Find first internal relationship of a given type.
152
153 Returns (rId, resolved_target_partname) or None.
154 """
155 root = self.get_rels(partname)
156 if root is None:
157 return None
158 for rel in root:
159 if rel.get('Type') == rel_type and rel.get('TargetMode') != 'External':
160 return rel.get('Id'), _resolve(partname, rel.get('Target'))
161 return None
162
163
164# content type helpers
165
166def _add_override(pkg, partname, content_type):
167 """Add an Override entry to [Content_Types].xml if not already present."""
168 root = pkg.xml('[Content_Types].xml')
169 abs_name = '/' + partname.lstrip('/')
170 for o in root.findall(f'{{{_PKG_CT}}}Override'):
171 if o.get('PartName') == abs_name:
172 return
173 etree.SubElement(root, f'{{{_PKG_CT}}}Override',
174 PartName=abs_name, ContentType=content_type)
175 pkg.set_xml('[Content_Types].xml', root)
176
177
178def _add_default_ext(pkg, ext, content_type):
179 """Add a Default entry for a file extension to [Content_Types].xml."""
180 root = pkg.xml('[Content_Types].xml')
181 for d in root.findall(f'{{{_PKG_CT}}}Default'):
182 if d.get('Extension', '').lower() == ext.lower():
183 return
184 etree.SubElement(root, f'{{{_PKG_CT}}}Default',
185 Extension=ext, ContentType=content_type)
186 pkg.set_xml('[Content_Types].xml', root)
187
188
189def _copy_content_type(tgt, src, src_part, tgt_part):
190 """Copy a content type override from source package to target."""
191 src_root = src.xml('[Content_Types].xml')
192 abs_src = '/' + src_part.lstrip('/')
193 for o in src_root.findall(f'{{{_PKG_CT}}}Override'):
194 if o.get('PartName') == abs_src:
195 _add_override(tgt, tgt_part, o.get('ContentType'))
196 return
197
198
199# merge context
200class _MergeCtx:
201 """Tracks source→target part name mappings during merge of one source file.
202
203 Prevents copying the same source part twice and lets us reuse the target
204 name when multiple slides share the same layout/master/theme/media.
205 """
206 def __init__(self):
207 self.layouts = {} # src partname -> tgt partname
208 self.masters = {} # src partname -> tgt partname
209 self.themes = {} # src partname -> tgt partname
210 self.media = {} # src partname -> tgt partname
211 self.notes = {} # src partname -> tgt partname
212 self.generic = {} # src partname -> tgt partname
213
214
215# part copy functions
216def _copy_media(ctx, tgt, src, src_part):
217 """Copy a media file (image/audio/video) to target, deduplicating by hash."""
218 if src_part in ctx.media:
219 return ctx.media[src_part]
220 if src_part not in src.parts:
221 return src_part
222
223 data = src.parts[src_part]
224 h = _sha256(data)
225
226 # deduplicate: reuse existing media in target with same content
227 for tgt_name, tgt_data in tgt.parts.items():
228 if tgt_name.startswith('ppt/media/') and _sha256(tgt_data) == h:
229 ctx.media[src_part] = tgt_name
230 return tgt_name
231
232 ext = os.path.splitext(src_part)[1]
233 n = _max_num(tgt.parts, r'ppt/media/\w+?(\d+)') + 1
234 tgt_part = f'ppt/media/media{n}{ext}'
235 tgt.parts[tgt_part] = data
236
237 # ensure file extension has a registered MIME type
238 ext_lower = ext.lstrip('.').lower()
239 if ext_lower in _MIME:
240 _add_default_ext(tgt, ext_lower, _MIME[ext_lower])
241
242 ctx.media[src_part] = tgt_part
243 return tgt_part
244
245
246def _copy_generic(ctx, tgt, src, src_part):
247 """Copy a generic part (tags, charts, embeddings, etc.) to target."""
248 if src_part in ctx.generic:
249 return ctx.generic[src_part]
250 if src_part not in src.parts:
251 return src_part
252
253 # allocate a new name with incremented number
254 m = re.match(r'(.+?)(\d+)(\.\w+)$', src_part)
255 if m:
256 prefix, _, ext = m.groups()
257 n = _max_num(tgt.parts, re.escape(prefix) + r'(\d+)' + re.escape(ext)) + 1
258 tgt_part = f'{prefix}{n}{ext}'
259 elif src_part in tgt.parts:
260 base, ext = os.path.splitext(src_part)
261 n = 1
262 while f'{base}_{n}{ext}' in tgt.parts:
263 n += 1
264 tgt_part = f'{base}_{n}{ext}'
47 else: 265 else:
48 # fallback: use libreoffice to export as images directly 266 tgt_part = src_part
49 # this works but pdftoppm gives better quality
50 subprocess.run([
51 'libreoffice', '--headless', '--convert-to', 'png',
52 '--outdir', output_dir, pdf_path
53 ], check=True, capture_output=True)
54 267
55 # collect and sort image files 268 tgt.parts[tgt_part] = src.parts[src_part]
56 images = sorted(glob.glob(os.path.join(output_dir, 'slide-*.png')))
57 if not images:
58 images = sorted(glob.glob(os.path.join(output_dir, '*.png')))
59 269
60 if not images: 270 # register early to prevent infinite recursion from circular rels
61 raise RuntimeError("no slide images produced") 271 ctx.generic[src_part] = tgt_part
62 272
63 return images 273 _copy_content_type(tgt, src, src_part, tgt_part)
274 _copy_simple_rels(ctx, tgt, src, src_part, tgt_part)
64 275
276 return tgt_part
65 277
66def images_to_pptx(image_groups, output_path, slide_width_emu=9144000, slide_height_emu=6858000):
67 # build a pptx from slide images, one image per slide filling the entire area.
68 prs = Presentation()
69 prs.slide_width = slide_width_emu
70 prs.slide_height = slide_height_emu
71 278
72 # use blank layout (index 6 is typically blank) 279def _copy_simple_rels(ctx, tgt, src, src_part, tgt_part):
73 blank_layout = None 280 """Copy relationships for a part, handling media and generic sub-parts."""
74 for layout in prs.slide_layouts: 281 src_rels = src.get_rels(src_part)
75 if layout.name == 'Blank': 282 if src_rels is None:
76 blank_layout = layout 283 return
77 break
78 if blank_layout is None:
79 blank_layout = prs.slide_layouts[6] if len(prs.slide_layouts) > 6 else prs.slide_layouts[0]
80 284
81 total = 0 285 new_rels = etree.Element(f'{{{_PKG_RELS}}}Relationships')
82 for label, images in image_groups: 286 for rel in src_rels:
83 for img_path in images: 287 rtype = rel.get('Type')
84 slide = prs.slides.add_slide(blank_layout) 288
85 289 if rel.get('TargetMode') == 'External':
86 # remove any placeholder shapes from blank layout 290 new_rels.append(deepcopy(rel))
87 for ph in list(slide.placeholders): 291 continue
88 sp = ph._element 292
89 sp.getparent().remove(sp) 293 src_abs = _resolve(src_part, rel.get('Target'))
90 294 r = deepcopy(rel)
91 # add image covering the full slide 295
92 slide.shapes.add_picture( 296 if rtype in _MEDIA_TYPES and src_abs in src.parts:
93 img_path, 297 r.set('Target', _relpath(tgt_part, _copy_media(ctx, tgt, src, src_abs)))
94 left=0, 298 elif src_abs in src.parts:
95 top=0, 299 r.set('Target', _relpath(tgt_part, _copy_generic(ctx, tgt, src, src_abs)))
96 width=slide_width_emu, 300
97 height=slide_height_emu 301 new_rels.append(r)
302
303 tgt.set_rels(tgt_part, new_rels)
304
305
306def _copy_theme(ctx, tgt, src, src_part):
307 """Copy a theme part (and any media it references) to target."""
308 if src_part in ctx.themes:
309 return ctx.themes[src_part]
310 if src_part not in src.parts:
311 return src_part
312
313 n = _max_num(tgt.parts, r'ppt/theme/theme(\d+)\.xml') + 1
314 tgt_part = f'ppt/theme/theme{n}.xml'
315 tgt.parts[tgt_part] = src.parts[src_part]
316
317 ctx.themes[src_part] = tgt_part
318
319 _copy_simple_rels(ctx, tgt, src, src_part, tgt_part)
320 _add_override(tgt, tgt_part, _CT_THEME)
321
322 return tgt_part
323
324
325def _copy_layout_for_master(ctx, tgt, src, src_part, tgt_master):
326 """Copy a layout as part of copying its parent master (no master recursion)."""
327 if src_part in ctx.layouts:
328 return ctx.layouts[src_part]
329 if src_part not in src.parts:
330 return src_part
331
332 n = _max_num(tgt.parts, r'ppt/slideLayouts/slideLayout(\d+)\.xml') + 1
333 tgt_part = f'ppt/slideLayouts/slideLayout{n}.xml'
334 tgt.parts[tgt_part] = src.parts[src_part]
335 ctx.layouts[src_part] = tgt_part
336
337 src_rels = src.get_rels(src_part)
338 if src_rels is not None:
339 new_rels = etree.Element(f'{{{_PKG_RELS}}}Relationships')
340 for rel in src_rels:
341 rtype = rel.get('Type')
342 if rel.get('TargetMode') == 'External':
343 new_rels.append(deepcopy(rel))
344 continue
345 src_abs = _resolve(src_part, rel.get('Target'))
346 r = deepcopy(rel)
347 if rtype == _RT_MASTER and tgt_master:
348 r.set('Target', _relpath(tgt_part, tgt_master))
349 elif rtype in _MEDIA_TYPES and src_abs in src.parts:
350 r.set('Target', _relpath(tgt_part, _copy_media(ctx, tgt, src, src_abs)))
351 elif src_abs in src.parts:
352 r.set('Target', _relpath(tgt_part, _copy_generic(ctx, tgt, src, src_abs)))
353 new_rels.append(r)
354 tgt.set_rels(tgt_part, new_rels)
355
356 _add_override(tgt, tgt_part, _CT_LAYOUT)
357 return tgt_part
358
359
360def _max_layout_id(pkg):
361 """Find the maximum sldLayoutId 'id' across all masters in the package."""
362 max_id = 2147483648
363 for name in pkg.parts:
364 if (name.startswith('ppt/slideMasters/') and name.endswith('.xml')
365 and '/_rels/' not in name):
366 try:
367 root = pkg.xml(name)
368 except Exception:
369 continue
370 for entry in root.iter(f'{{{_NS_P}}}sldLayoutId'):
371 try:
372 max_id = max(max_id, int(entry.get('id', 0)))
373 except (ValueError, TypeError):
374 pass
375 return max_id
376
377
378def _copy_master(ctx, tgt, src, src_part, pres_xml, pres_rels):
379 """Copy a slide master, all its layouts, theme, and media to target.
380
381 Copies every layout the master references so the master's
382 sldLayoutIdLst stays consistent. Layout IDs are reassigned
383 to be globally unique.
384 """
385 if src_part in ctx.masters:
386 return ctx.masters[src_part]
387 if src_part not in src.parts:
388 return src_part
389
390 # copy the master's theme first
391 theme_info = src.find_rel(src_part, _RT_THEME)
392 tgt_theme = _copy_theme(ctx, tgt, src, theme_info[1]) if theme_info else None
393
394 # allocate new master name
395 n = _max_num(tgt.parts, r'ppt/slideMasters/slideMaster(\d+)\.xml') + 1
396 tgt_part = f'ppt/slideMasters/slideMaster{n}.xml'
397 tgt.parts[tgt_part] = src.parts[src_part]
398
399 # register early to prevent re-entry
400 ctx.masters[src_part] = tgt_part
401
402 # rebuild master rels: copy ALL layouts, remap theme + media
403 # preserving original rIds so the master XML body stays consistent
404 src_rels = src.get_rels(src_part)
405 if src_rels is not None:
406 new_rels = etree.Element(f'{{{_PKG_RELS}}}Relationships')
407 for rel in src_rels:
408 rtype = rel.get('Type')
409
410 if rel.get('TargetMode') == 'External':
411 new_rels.append(deepcopy(rel))
412 continue
413
414 src_abs = _resolve(src_part, rel.get('Target'))
415 r = deepcopy(rel)
416
417 if rtype == _RT_LAYOUT:
418 tgt_layout = _copy_layout_for_master(ctx, tgt, src, src_abs, tgt_part)
419 r.set('Target', _relpath(tgt_part, tgt_layout))
420 elif rtype == _RT_THEME and tgt_theme:
421 r.set('Target', _relpath(tgt_part, tgt_theme))
422 elif rtype in _MEDIA_TYPES and src_abs in src.parts:
423 r.set('Target', _relpath(tgt_part, _copy_media(ctx, tgt, src, src_abs)))
424 elif src_abs in src.parts:
425 r.set('Target', _relpath(tgt_part, _copy_generic(ctx, tgt, src, src_abs)))
426
427 new_rels.append(r)
428
429 tgt.set_rels(tgt_part, new_rels)
430
431 # reassign sldLayoutId 'id' values to be globally unique
432 max_lid = _max_layout_id(tgt)
433 master_xml = tgt.xml(tgt_part)
434 for entry in master_xml.iter(f'{{{_NS_P}}}sldLayoutId'):
435 max_lid += 1
436 entry.set('id', str(max_lid))
437 tgt.set_xml(tgt_part, master_xml)
438
439 _add_override(tgt, tgt_part, _CT_MASTER)
440
441 # add master reference to presentation.xml
442 master_list = pres_xml.find(f'{{{_NS_P}}}sldMasterIdLst')
443 if master_list is None:
444 sld_list = pres_xml.find(f'{{{_NS_P}}}sldIdLst')
445 idx = list(pres_xml).index(sld_list) if sld_list is not None else 0
446 master_list = etree.Element(f'{{{_NS_P}}}sldMasterIdLst')
447 pres_xml.insert(idx, master_list)
448
449 max_mid = 2147483647
450 for elem in master_list:
451 max_mid = max(max_mid, int(elem.get('id', 0)))
452
453 rid = f'rId{_next_rid(pres_rels)}'
454 etree.SubElement(pres_rels, f'{{{_PKG_RELS}}}Relationship',
455 Id=rid, Type=_RT_MASTER,
456 Target=f'slideMasters/slideMaster{n}.xml')
457
458 me = etree.SubElement(master_list, f'{{{_NS_P}}}sldMasterId')
459 me.set('id', str(max_mid + 1))
460 me.set(f'{{{_NS_R}}}id', rid)
461
462 return tgt_part
463
464
465def _copy_layout(ctx, tgt, src, src_part, pres_xml, pres_rels):
466 """Copy a slide layout and its master+theme chain to target.
467
468 _copy_master copies ALL of a master's layouts, so after calling it
469 the requested layout will normally already be in ctx.layouts.
470 """
471 if src_part in ctx.layouts:
472 return ctx.layouts[src_part]
473 if src_part not in src.parts:
474 return src_part
475
476 # copying the master will also copy all its layouts (including this one)
477 master_info = src.find_rel(src_part, _RT_MASTER)
478 if master_info:
479 _copy_master(ctx, tgt, src, master_info[1], pres_xml, pres_rels)
480
481 # master should have copied this layout already
482 if src_part in ctx.layouts:
483 return ctx.layouts[src_part]
484
485 # fallback: layout without a master (unusual) — copy directly
486 tgt_master = ctx.masters.get(master_info[1]) if master_info else None
487 return _copy_layout_for_master(ctx, tgt, src, src_part, tgt_master)
488
489
490def _copy_notes(ctx, tgt, src, src_part, tgt_slide_part):
491 """Copy a notes slide to target, updating its slide back-reference."""
492 if src_part in ctx.notes:
493 return ctx.notes[src_part]
494 if src_part not in src.parts:
495 return src_part
496
497 n = _max_num(tgt.parts, r'ppt/notesSlides/notesSlide(\d+)\.xml') + 1
498 tgt_part = f'ppt/notesSlides/notesSlide{n}.xml'
499 tgt.parts[tgt_part] = src.parts[src_part]
500
501 ctx.notes[src_part] = tgt_part
502
503 # rebuild notes rels: update slide back-ref, remap media, handle notesMaster
504 src_rels = src.get_rels(src_part)
505 if src_rels is not None:
506 new_rels = etree.Element(f'{{{_PKG_RELS}}}Relationships')
507 for rel in src_rels:
508 rtype = rel.get('Type')
509
510 if rel.get('TargetMode') == 'External':
511 new_rels.append(deepcopy(rel))
512 continue
513
514 src_abs = _resolve(src_part, rel.get('Target'))
515 r = deepcopy(rel)
516
517 if rtype == _RT_SLIDE:
518 # update to point to the new slide
519 r.set('Target', _relpath(tgt_part, tgt_slide_part))
520 elif rtype == _RT_NOTES_MASTER:
521 # point to target's notesMaster (keep path if it exists in target)
522 if src_abs in tgt.parts:
523 r.set('Target', _relpath(tgt_part, src_abs))
524 else:
525 # find any notesMaster in target
526 nm = [k for k in tgt.parts
527 if k.startswith('ppt/notesMasters/')
528 and k.endswith('.xml')
529 and '/_rels/' not in k]
530 if nm:
531 r.set('Target', _relpath(tgt_part, nm[0]))
532 else:
533 continue # drop ref if no notesMaster available
534 elif rtype in _MEDIA_TYPES and src_abs in src.parts:
535 r.set('Target', _relpath(tgt_part, _copy_media(ctx, tgt, src, src_abs)))
536 elif src_abs in src.parts:
537 r.set('Target', _relpath(tgt_part, _copy_generic(ctx, tgt, src, src_abs)))
538
539 new_rels.append(r)
540
541 tgt.set_rels(tgt_part, new_rels)
542
543 _add_override(tgt, tgt_part, _CT_NOTES)
544
545 return tgt_part
546
547
548# main merge logic
549
550def _merge_source_into(tgt, src, label):
551 """Merge all slides from a source package into the target package."""
552 ctx = _MergeCtx()
553
554 # parse source presentation to find its slides
555 src_pres = src.xml('ppt/presentation.xml')
556 src_pres_rels = src.get_rels('ppt/presentation.xml')
557 if src_pres_rels is None:
558 print(f' [!] no presentation rels in {label}', file=sys.stderr)
559 return 0
560
561 # build rId -> (type, resolved_target) map for source
562 src_rel_map = {}
563 for rel in src_pres_rels:
564 if rel.get('TargetMode') != 'External':
565 src_rel_map[rel.get('Id')] = (
566 rel.get('Type'),
567 _resolve('ppt/presentation.xml', rel.get('Target'))
98 ) 568 )
99 total += 1
100 print(f" [{label}] {len(images)} slides added")
101 569
102 prs.save(output_path) 570 # enumerate source slides
103 return total 571 sld_list = src_pres.find(f'{{{_NS_P}}}sldIdLst')
572 if sld_list is None:
573 print(f' [!] no slides in {label}', file=sys.stderr)
574 return 0
575
576 src_slides = []
577 for sld_id_elem in sld_list.findall(f'{{{_NS_P}}}sldId'):
578 rid = sld_id_elem.get(f'{{{_NS_R}}}id')
579 if rid in src_rel_map and src_rel_map[rid][0] == _RT_SLIDE:
580 src_slides.append((sld_id_elem.get('id'), rid, src_rel_map[rid][1]))
581
582 if not src_slides:
583 print(f' [!] no slides in {label}', file=sys.stderr)
584 return 0
585
586 # load target presentation state
587 tgt_pres = tgt.xml('ppt/presentation.xml')
588 tgt_pres_rels = tgt.get_rels('ppt/presentation.xml')
589 if tgt_pres_rels is None:
590 tgt_pres_rels = etree.Element(f'{{{_PKG_RELS}}}Relationships')
591
592 tgt_sld_list = tgt_pres.find(f'{{{_NS_P}}}sldIdLst')
593 if tgt_sld_list is None:
594 tgt_sld_list = etree.SubElement(tgt_pres, f'{{{_NS_P}}}sldIdLst')
104 595
596 # find max slide ID in target (must be >= 256 per OOXML spec)
597 max_sld_id = 256
598 for elem in tgt_sld_list.findall(f'{{{_NS_P}}}sldId'):
599 max_sld_id = max(max_sld_id, int(elem.get('id', 0)))
105 600
106def merge_presentations(*pptx_files, output='merged.pptx'): 601 # copy each slide
107 # merge pptx files with zero formatting loss via image rendering. 602 count = 0
108 if not pptx_files: 603 for _, _, src_slide_part in src_slides:
109 raise ValueError("no files provided") 604 if src_slide_part not in src.parts:
605 print(f' [!] slide part missing: {src_slide_part}', file=sys.stderr)
606 continue
110 607
111 image_groups = [] 608 # allocate new slide name in target
609 sn = _max_num(tgt.parts, r'ppt/slides/slide(\d+)\.xml') + 1
610 tgt_slide = f'ppt/slides/slide{sn}.xml'
611 tgt.parts[tgt_slide] = src.parts[src_slide_part]
112 612
113 for pptx_file in pptx_files: 613 # rebuild slide rels: remap layout, notes, media, and other parts
114 label = Path(pptx_file).stem 614 src_slide_rels = src.get_rels(src_slide_part)
115 print(f"[*] rendering: {label}") 615 if src_slide_rels is not None:
616 new_slide_rels = etree.Element(f'{{{_PKG_RELS}}}Relationships')
617 for rel in src_slide_rels:
618 rtype = rel.get('Type')
116 619
117 tmpdir = tempfile.mkdtemp(prefix=f'pptx_merge_{label}_') 620 if rel.get('TargetMode') == 'External':
118 try: 621 new_slide_rels.append(deepcopy(rel))
119 images = pptx_to_images(pptx_file, tmpdir) 622 continue
120 image_groups.append((label, images))
121 print(f" -> {len(images)} slides rendered at {DPI} DPI")
122 except Exception as e:
123 print(f" [!] error rendering {label}: {e}", file=sys.stderr)
124 raise
125 623
126 # get slide dimensions from first presentation 624 src_abs = _resolve(src_slide_part, rel.get('Target'))
127 first_prs = Presentation(pptx_files[0]) 625 r = deepcopy(rel)
128 sw = first_prs.slide_width
129 sh = first_prs.slide_height
130 626
131 print(f"[*] building merged presentation...") 627 if rtype == _RT_LAYOUT:
132 total = images_to_pptx(image_groups, output, sw, sh) 628 tgt_layout = _copy_layout(ctx, tgt, src, src_abs, tgt_pres, tgt_pres_rels)
133 print(f"[+] merged {total} slides from {len(pptx_files)} presentations -> {output}") 629 r.set('Target', _relpath(tgt_slide, tgt_layout))
630 elif rtype == _RT_NOTES:
631 tgt_notes = _copy_notes(ctx, tgt, src, src_abs, tgt_slide)
632 r.set('Target', _relpath(tgt_slide, tgt_notes))
633 elif rtype in _MEDIA_TYPES and src_abs in src.parts:
634 r.set('Target', _relpath(tgt_slide, _copy_media(ctx, tgt, src, src_abs)))
635 elif src_abs in src.parts:
636 r.set('Target', _relpath(tgt_slide, _copy_generic(ctx, tgt, src, src_abs)))
134 637
135 # cleanup temp dirs 638 new_slide_rels.append(r)
136 for label, images in image_groups:
137 if images:
138 tmpdir = os.path.dirname(images[0])
139 shutil.rmtree(tmpdir, ignore_errors=True)
140 639
141 return output 640 tgt.set_rels(tgt_slide, new_slide_rels)
142 641
642 # register slide in presentation.xml
643 max_sld_id += 1
644 rid = f'rId{_next_rid(tgt_pres_rels)}'
645 etree.SubElement(tgt_pres_rels, f'{{{_PKG_RELS}}}Relationship',
646 Id=rid, Type=_RT_SLIDE,
647 Target=f'slides/slide{sn}.xml')
143 648
144if __name__ == "__main__": 649 se = etree.SubElement(tgt_sld_list, f'{{{_NS_P}}}sldId')
145 import argparse 650 se.set('id', str(max_sld_id))
146 651 se.set(f'{{{_NS_R}}}id', rid)
652
653 _add_override(tgt, tgt_slide, _CT_SLIDE)
654 count += 1
655
656 # persist updated presentation.xml and rels
657 tgt.set_xml('ppt/presentation.xml', tgt_pres)
658 tgt.set_rels('ppt/presentation.xml', tgt_pres_rels)
659
660 return count
661
662
663def _lo_normalize(pptx_path):
664 """Re-save a PPTX through LibreOffice to fix any OOXML spec issues.
665
666 LibreOffice reads the file and writes a clean, fully-compliant PPTX.
667 This eliminates intermittent "repair" prompts in Microsoft PowerPoint
668 caused by subtle structural issues (element ordering, stale refs, etc.).
669 """
670 if not shutil.which('libreoffice'):
671 print(' [!] libreoffice not found, skipping normalization', file=sys.stderr)
672 return
673
674 tmpdir = tempfile.mkdtemp(prefix='kjandoc_norm_')
675 try:
676 abs_path = os.path.abspath(pptx_path)
677 subprocess.run([
678 'libreoffice', '--headless', '--convert-to', 'pptx',
679 '--outdir', tmpdir, abs_path
680 ], check=True, capture_output=True)
681
682 # libreoffice writes to tmpdir with the same stem
683 normalized = os.path.join(tmpdir, os.path.basename(abs_path))
684 if os.path.exists(normalized):
685 shutil.move(normalized, abs_path)
686 else:
687 # sometimes libreoffice changes the extension casing
688 for f in os.listdir(tmpdir):
689 if f.lower().endswith('.pptx'):
690 shutil.move(os.path.join(tmpdir, f), abs_path)
691 break
692 finally:
693 shutil.rmtree(tmpdir, ignore_errors=True)
694
695
696def merge_presentations(input_files, output_path):
697 if not input_files:
698 raise ValueError('no input files')
699
700 print(f'[*] base: {Path(input_files[0]).name}')
701 tgt = _Pkg(input_files[0])
702
703 total = 0
704 for pptx_path in input_files[1:]:
705 label = Path(pptx_path).name
706 print(f'[*] merging: {label}')
707 src = _Pkg(pptx_path)
708 count = _merge_source_into(tgt, src, label)
709 print(f' -> {count} slides merged')
710 total += count
711
712 tgt.save(output_path)
713
714 # normalize through libreoffice to ensure full OOXML compliance
715 print(f'[*] normalizing...')
716 _lo_normalize(output_path)
717
718 # count slides in first file for the total
719 base = _Pkg(input_files[0])
720 base_pres = base.xml('ppt/presentation.xml')
721 base_sld_list = base_pres.find(f'{{{_NS_P}}}sldIdLst')
722 base_count = len(base_sld_list) if base_sld_list is not None else 0
723 total += base_count
724
725 print(f'[+] merged {total} slides from {len(input_files)} presentations -> {output_path}')
726 return output_path
727
728
729# CLI
730
731if __name__ == '__main__':
147 parser = argparse.ArgumentParser( 732 parser = argparse.ArgumentParser(
148 description='merge pptx files with perfect formatting preservation via rendering', 733 description='merge PPTX presentations preserving full editability',
149 usage='%(prog)s input1.pptx [input2.pptx ...] -o output.pptx' 734 usage='%(prog)s input1.pptx [input2.pptx ...] -o output.pptx',
150 ) 735 )
151 parser.add_argument( 736 parser.add_argument(
152 'inputs', 737 'inputs', metavar='INPUT', nargs='+',
153 metavar='INPUT', 738 help='input PPTX files to merge',
154 nargs='+',
155 help='input pptx files to merge'
156 ) 739 )
157 parser.add_argument( 740 parser.add_argument(
158 '-o', '--output', 741 '-o', '--output', default='merged.pptx',
159 default='merged.pptx', 742 help='output filename (default: merged.pptx)',
160 help='output filename (default: merged.pptx)'
161 ) 743 )
162 parser.add_argument(
163 '--dpi',
164 type=int,
165 default=DPI,
166 help=f'rendering DPI for slide images (default: {DPI})'
167 )
168
169 args = parser.parse_args() 744 args = parser.parse_args()
170 745
171 # validate input files exist
172 for f in args.inputs: 746 for f in args.inputs:
173 if not os.path.exists(f): 747 if not os.path.exists(f):
174 print(f"[!] error: file not found: {f}", file=sys.stderr) 748 print(f'[!] error: file not found: {f}', file=sys.stderr)
175 sys.exit(1) 749 sys.exit(1)
176 if not f.lower().endswith('.pptx'): 750 if not f.lower().endswith('.pptx'):
177 print(f"[!] error: not a pptx file: {f}", file=sys.stderr) 751 print(f'[!] error: not a pptx file: {f}', file=sys.stderr)
178 sys.exit(1) 752 sys.exit(1)
179 753
180 # update global DPI if specified 754 merge_presentations(args.inputs, args.output) \ No newline at end of file
181 if args.dpi != DPI:
182 globals()['DPI'] = args.dpi
183
184 merge_presentations(*args.inputs, output=args.output) \ No newline at end of file