From 3ddb091d1e3e65f1a2b99aabaa6165ffce883c9c Mon Sep 17 00:00:00 2001
From: Frederik Beimgraben
Date: Mon, 1 Sep 2025 14:56:56 +0200
Subject: [PATCH] =?UTF-8?q?LaTeX=20dynamic=20building=20=E2=80=93=201?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review notes (ignored by git am):
- Line structure was reconstructed from a whitespace-collapsed copy of this
  patch; indentation and blank lines of context lines are inferred and should
  be checked against the target files before applying.
- Added lines were edited by hand, so the post-image blob ids on the "index"
  lines no longer match.

 backend/Dockerfile         |   1 +
 backend/src/pdf_filler.py  | 132 +++++++++++++++++++++++++++++++--
 backend/test_flattening.py | 151 +++++++++++++++++++++++++++++++++++++
 3 files changed, 280 insertions(+), 4 deletions(-)
 create mode 100644 backend/test_flattening.py

diff --git a/backend/Dockerfile b/backend/Dockerfile
index b64c81f7..ea3f6814 100644
--- a/backend/Dockerfile
+++ b/backend/Dockerfile
@@ -38,6 +38,7 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
 # System deps
 RUN apt-get update && apt-get install -y --no-install-recommends \
     tzdata ca-certificates \
+    qpdf \
     && rm -rf /var/lib/apt/lists/*
 
 WORKDIR /app
diff --git a/backend/src/pdf_filler.py b/backend/src/pdf_filler.py
index bd3b796b..3270ccc2 100644
--- a/backend/src/pdf_filler.py
+++ b/backend/src/pdf_filler.py
@@ -4,6 +4,8 @@ from __future__ import annotations
 import io
 import os
 import re
+import subprocess
+import tempfile
 from typing import Any, Dict, Optional
 
 import PyPDF2
@@ -134,13 +136,128 @@ def _collect_btn_widgets(reader: PyPDF2.PdfReader):
     return btn_widgets_by_name, export_values_by_name
 
 
+# -----------------------------
+# PDF Flattening Helper
+# -----------------------------
+
+def _run_flatten_tool(pdf_bytes: bytes, build_command, ok_returncodes=(0,)) -> Optional[bytes]:
+    """Run an external flattening tool on a temporary copy of the PDF.
+
+    Args:
+        pdf_bytes: PDF document to flatten.
+        build_command: Callable (input_path, output_path) -> argv list.
+        ok_returncodes: Exit codes that mean a usable output was written.
+
+    Returns:
+        The flattened PDF bytes, or None if the tool is missing, fails,
+        times out or produces no output.
+    """
+    try:
+        # A private temp dir defines both paths up front and is removed on
+        # every exit path, so no cleanup code can hit an unbound name.
+        with tempfile.TemporaryDirectory(prefix='pdf_flatten_') as workdir:
+            input_path = os.path.join(workdir, 'input.pdf')
+            output_path = os.path.join(workdir, 'flattened.pdf')
+            with open(input_path, 'wb') as f:
+                f.write(pdf_bytes)
+
+            result = subprocess.run(
+                build_command(input_path, output_path),
+                capture_output=True,
+                timeout=30,
+            )
+            if result.returncode not in ok_returncodes:
+                return None
+
+            with open(output_path, 'rb') as f:
+                flattened_bytes = f.read()
+    except (OSError, subprocess.SubprocessError):
+        # Tool not installed/executable, timeout, or unreadable output:
+        # report "unavailable" so the caller can try the next strategy.
+        return None
+    return flattened_bytes or None
+
+def _flatten_pdf_with_qpdf(pdf_bytes: bytes) -> Optional[bytes]:
+    """Try to flatten PDF using qpdf if available."""
+    return _run_flatten_tool(
+        pdf_bytes,
+        lambda src, dst: ['qpdf', '--flatten-annotations=all',
+                          '--generate-appearances', src, dst],
+        # qpdf exits with 3 when it only emitted warnings; the output file
+        # is still written, so that is a success for our purposes.
+        ok_returncodes=(0, 3),
+    )
+
+def _flatten_pdf_with_pdftk(pdf_bytes: bytes) -> Optional[bytes]:
+    """Try to flatten PDF using pdftk if available."""
+    return _run_flatten_tool(
+        pdf_bytes,
+        lambda src, dst: ['pdftk', src, 'output', dst, 'flatten'],
+    )
+
+def _strip_form_widgets(pdf_bytes: bytes) -> bytes:
+    """Drop the AcroForm and all Widget annotations using PyPDF2 only.
+
+    Last-resort fallback when no external flattener is available. The
+    fields are removed, not rendered, so their values are not visible in
+    the result.
+    """
+    reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
+    writer = PyPDF2.PdfWriter()
+
+    # Copy all pages
+    for page in reader.pages:
+        writer.add_page(page)
+
+    # Remove AcroForm to make fields non-interactive
+    if "/AcroForm" in writer._root_object:
+        del writer._root_object["/AcroForm"]
+
+    # Remove Widget annotations
+    for page in writer.pages:
+        if "/Annots" not in page:
+            continue
+        annots = page["/Annots"]
+        if isinstance(annots, IndirectObject):
+            try:
+                annots = annots.get_object()
+            except Exception:
+                continue
+
+        new_annots = ArrayObject()
+        if isinstance(annots, (list, ArrayObject)):
+            for annot_ref in annots:
+                try:
+                    annot = annot_ref.get_object() if isinstance(annot_ref, IndirectObject) else annot_ref
+                    if isinstance(annot, DictionaryObject):
+                        subtype = _to_str(annot.get("/Subtype"))
+                        if subtype and subtype != "Widget":
+                            new_annots.append(annot_ref)
+                except Exception:
+                    continue
+
+        if len(new_annots) > 0:
+            page[NameObject("/Annots")] = new_annots
+        else:
+            del page["/Annots"]
+
+    bio = io.BytesIO()
+    writer.write(bio)
+    return bio.getvalue()
+
 # -----------------------------
 # Kern: PDF füllen (direktes Widget-Update)
 # -----------------------------
 
-def fill_pdf(payload: Dict[str, Any], variant: str, out_path: Optional[str] = None) -> bytes:
+def fill_pdf(payload: Dict[str, Any], variant: str, out_path: Optional[str] = None, flatten: bool = True) -> bytes:
     """
     Payload (asdict(RootPayload) ODER dein payload["pa"]-ähnliches Dict) -> befüllte PDF-Bytes.
+
+    Args:
+        payload: Dictionary with the form data
+        variant: "QSM" or "VSM"
+        out_path: Optional path to save the PDF to
+        flatten: If True, form fields are converted into static content
     """
     template_path = _get_template(variant)
     if not os.path.isfile(template_path):
@@ -329,15 +446,22 @@ def fill_pdf(payload: Dict[str, Any], variant: str, out_path: Optional[str] = No
         except Exception:
             continue
 
-    # 3) Schreiben
+    # 3) Write the PDF with filled forms
     bio = io.BytesIO()
     writer.write(bio)
     data = bio.getvalue()
+
+    # 4) Flatten if requested
+    if flatten:
+        # Try external tools first for better flattening (qpdf, then pdftk).
+        flattened = _flatten_pdf_with_qpdf(data) or _flatten_pdf_with_pdftk(data)
+        # Last resort: strip the form with PyPDF2 (fields won't be visible).
+        data = flattened or _strip_form_widgets(data)
 
     if out_path:
         with open(out_path, "wb") as out:
             out.write(data)
     return data
 
-def save_pdf(payload: Dict[str, Any], variant: str, out_path: str) -> None:
-    _ = fill_pdf(payload, variant, out_path=out_path)
+def save_pdf(payload: Dict[str, Any], variant: str, out_path: str, flatten: bool = True) -> None:
+    _ = fill_pdf(payload, variant, out_path=out_path, flatten=flatten)
diff --git a/backend/test_flattening.py b/backend/test_flattening.py
new file mode 100644
index 00000000..c7694e0c
--- /dev/null
+++ b/backend/test_flattening.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python3
+"""
+Test script to verify PDF flattening functionality.
+Tests that form fields are properly removed after filling.
+"""
+
+import os
+import sys
+import tempfile
+from pathlib import Path
+
+# Add src to path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
+
+import PyPDF2
+from pdf_filler import fill_pdf
+
+def check_pdf_has_forms(pdf_path):
+    """Check if a PDF has form fields; return (has_forms, message)."""
+    with open(pdf_path, 'rb') as f:
+        reader = PyPDF2.PdfReader(f)
+
+        # Check for AcroForm
+        if '/AcroForm' in reader.trailer.get('/Root', {}):
+            acroform = reader.trailer['/Root']['/AcroForm']
+            if '/Fields' in acroform:
+                fields = acroform['/Fields']
+                if fields and len(fields) > 0:
+                    return True, f"Found {len(fields)} form fields"
+
+        # Check for widget annotations
+        widget_count = 0
+        for page in reader.pages:
+            if '/Annots' in page:
+                annots = page['/Annots']
+                if hasattr(annots, 'get_object'):
+                    annots = annots.get_object()
+
+                if isinstance(annots, (list, PyPDF2.generic.ArrayObject)):
+                    for annot_ref in annots:
+                        try:
+                            annot = annot_ref.get_object() if hasattr(annot_ref, 'get_object') else annot_ref
+                            if isinstance(annot, dict):
+                                subtype = annot.get('/Subtype')
+                                if subtype and str(subtype) == '/Widget':
+                                    widget_count += 1
+                        except Exception:
+                            pass
+
+        if widget_count > 0:
+            return True, f"Found {widget_count} widget annotations"
+
+        return False, "No form fields or widgets found"
+
+def test_flattening():
+    """Test PDF flattening functionality; raise AssertionError on failure."""
+
+    # Test payload
+    test_payload = {
+        "pa": {
+            "meta": {
+                "id": "TEST-001",
+                "key": "test-key-123"
+            },
+            "applicant": {
+                "name": "Test Applicant",
+                "email": "test@example.com"
+            },
+            "project": {
+                "title": "Test Project",
+                "description": "This is a test project",
+                "costs": [
+                    {"description": "Item 1", "amountEur": 100.50},
+                    {"description": "Item 2", "amountEur": 200.75}
+                ]
+            }
+        }
+    }
+
+    # Failure descriptions; raised as one AssertionError at the end so a
+    # broken flattening step fails the run instead of only printing.
+    failures = []
+
+    print("Testing PDF Flattening...")
+    print("-" * 50)
+
+    # Test both variants
+    for variant in ["VSM", "QSM"]:
+        print(f"\nTesting {variant} variant:")
+
+        # Check if template exists
+        template_path = os.path.join(os.path.dirname(__file__), "src", "assets", f"{variant.lower()}.pdf")
+        if not os.path.exists(template_path):
+            print(f"  ⚠️ Template not found at {template_path}, skipping...")
+            continue
+
+        # Check template has forms
+        has_forms, msg = check_pdf_has_forms(template_path)
+        print(f"  Template: {msg}")
+
+        # Generate PDF with flattening (default)
+        with tempfile.NamedTemporaryFile(suffix=f"_{variant}_flattened.pdf", delete=False) as tf:
+            flattened_path = tf.name
+
+        try:
+            fill_pdf(test_payload, variant, out_path=flattened_path, flatten=True)
+            has_forms, msg = check_pdf_has_forms(flattened_path)
+            print(f"  Flattened PDF: {msg}")
+
+            if has_forms:
+                print("  ❌ FAILED: Flattened PDF still has form fields!")
+                failures.append(f"{variant}: flattened PDF still has form fields")
+            else:
+                print("  ✅ SUCCESS: Form fields removed after flattening")
+
+        except Exception as e:
+            print(f"  ❌ ERROR generating flattened PDF: {e}")
+            failures.append(f"{variant}: error generating flattened PDF: {e}")
+        finally:
+            if os.path.exists(flattened_path):
+                os.unlink(flattened_path)
+
+        # Generate PDF without flattening for comparison
+        with tempfile.NamedTemporaryFile(suffix=f"_{variant}_not_flattened.pdf", delete=False) as tf:
+            not_flattened_path = tf.name
+
+        try:
+            fill_pdf(test_payload, variant, out_path=not_flattened_path, flatten=False)
+            has_forms, msg = check_pdf_has_forms(not_flattened_path)
+            print(f"  Non-flattened PDF: {msg}")
+
+            if not has_forms:
+                print("  ⚠️ WARNING: Non-flattened PDF has no form fields (unexpected)")
+            else:
+                print("  ✅ Non-flattened PDF keeps form fields as expected")
+
+        except Exception as e:
+            print(f"  ❌ ERROR generating non-flattened PDF: {e}")
+            failures.append(f"{variant}: error generating non-flattened PDF: {e}")
+        finally:
+            if os.path.exists(not_flattened_path):
+                os.unlink(not_flattened_path)
+
+    print("\n" + "-" * 50)
+    print("Test complete!")
+
+    if failures:
+        raise AssertionError("PDF flattening checks failed: " + "; ".join(failures))
+
+if __name__ == "__main__":
+    test_flattening()