Working mapper

This commit is contained in:
Frederik Beimgraben 2025-08-31 17:46:13 +02:00
parent e1fb0936fc
commit 34589826e5
3 changed files with 621 additions and 0 deletions

3
.gitignore vendored
View File

@ -160,3 +160,6 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# PDF
*.pdf
!assets/*.pdf

133
pdf_field_mapping.py Normal file
View File

@ -0,0 +1,133 @@
#!/usr/bin/env python3
"""
PDF Field Mapping
This module provides a mapping of PDF fields to their corresponding keys in the application data.
"""
# Raw field values that count as "not filled in". pdf_to_json.py imports this
# set and its coercion helpers return None for any value found in it.
#   None, ""          - the field carries no value
#   "-"               - the "-" entry that several enum fields below list as a choice
#   "JJJJ-MM"         - presumably the German "YYYY-MM" hint pre-filled in date fields (TODO confirm)
#   "/\\Fld@default " - presumably a default marker left behind by the form generator
#                       (TODO confirm); note the trailing space
_PLACEHOLDER_VALUES: set = {None, "", "-", "JJJJ-MM", "/\\Fld@default "}
# --- COMMON fields (shared across variants) ---
# Maps a PDF form field name to a spec dict with these entries:
#   'required'   - whether pdf_to_json.py reports the field under
#                  "_validation.missingRequired" when no value was mapped
#   'target-key' - dotted path in the output payload; "name[3]" addresses a list index
#   'type'       - selects the coercion: bool / int / float / str, or the string
#                  'enum'; specs without a 'type' pass the raw value through
#   'values'     - enum only: (key, label) pairs; either form is accepted as input
#                  and the key is what ends up in the payload
TEXT_MAPPING_COMMON: dict = {
    # Applicant
    'pa-applicant-type': {
        'required': True,
        'target-key': 'pa.applicant.type',
        'type': 'enum',
        'values': [('person', 'Person'), ('institution', 'Institution')]
    },
    'pa-institution-type': {
        'required': True,
        'target-key': 'pa.applicant.institution.type',
        'type': 'enum',
        'values': [
            # "-" is a placeholder value, so selecting it maps to None
            ('-', '-'),
            ('stud-fs', 'Fachschaft'),
            ('stud-rf', 'STUPA-Referat'),
            ('stud-hg', 'Studentische Hochschulgruppe'),
            ('faculty', 'Fakultät'),
            ('hs-institution', 'Hochschuleinrichtung'),
        ]
    },
    'pa-institution': {'required': True, 'target-key': 'pa.applicant.institution.name', 'type': str},
    # The following specs carry no 'type': the raw value is kept unless it is a placeholder
    'pa-first-name': {'required': True, 'target-key': 'pa.applicant.name.first'},
    'pa-last-name': {'required': True, 'target-key': 'pa.applicant.name.last'},
    'pa-email': {'required': True, 'target-key': 'pa.applicant.contact.email'},
    'pa-phone': {'required': False, 'target-key': 'pa.applicant.contact.phone'},
    'pa-course': {
        'required': True,
        'target-key': 'pa.applicant.course',
        'type': 'enum',
        'values': [('-', '-'), ('INF', 'INF'), ('ESB', 'ESB'), ('LS', 'LS'), ('TEC', 'TEC'), ('TEX', 'TEX'), ('NXT', 'NXT')]
    },
    'pa-role': {
        'required': True,
        'target-key': 'pa.applicant.role',
        'type': 'enum',
        'values': [
            ('-', '-'),
            ('Student', 'Student'),
            ('Professor', 'Professor'),
            ('Mitarbeiter', 'Mitarbeiter'),
            ('ASTA', 'ASTA'),
            ('Referatsleitung', 'Referatsleitung'),
            ('Fachschaftsvorstand', 'Fachschaftsvorstand'),
        ]
    },
    # Project core
    'pa-project-name': {'required': True, 'target-key': 'pa.project.name', 'type': str},
    # Dates are kept as strings; no date parsing is configured here
    'pa-start-date': {'required': True, 'target-key': 'pa.project.dates.start', 'type': str},
    'pa-end-date': {'required': False, 'target-key': 'pa.project.dates.end', 'type': str},
    'pa-participants': {'required': False, 'target-key': 'pa.project.participants', 'type': int},
    'pa-project-description': {'required': True, 'target-key': 'pa.project.description', 'type': str},
    # Participation (checkboxes)
    'pa-participating-faculties-inf': {'required': False, 'target-key': 'pa.project.participation.faculties.inf', 'type': bool},
    'pa-participating-faculties-esb': {'required': False, 'target-key': 'pa.project.participation.faculties.esb', 'type': bool},
    'pa-participating-faculties-ls': {'required': False, 'target-key': 'pa.project.participation.faculties.ls', 'type': bool},
    'pa-participating-faculties-tec': {'required': False, 'target-key': 'pa.project.participation.faculties.tec', 'type': bool},
    'pa-participating-faculties-tex': {'required': False, 'target-key': 'pa.project.participation.faculties.tex', 'type': bool},
    'pa-participating-faculties-nxt': {'required': False, 'target-key': 'pa.project.participation.faculties.nxt', 'type': bool},
    'pa-participating-faculties-open': {'required': False, 'target-key': 'pa.project.participation.faculties.open', 'type': bool},
    # Costs & totals
    # Templated rows: "{a;1:24}" presumably stands for a row index a = 1..24
    # (TODO confirm). pdf_to_json.py does not expand these keys; it recognises
    # the concrete "pa-cost-<n>-..." field names with regular expressions.
    'pa-cost-{a;1:24}-name': {'required': True, 'target-key': 'pa.project.costs[{a}].name', 'type': str},
    'pa-cost-{a;1:24}-amount-euro': {'required': True, 'target-key': 'pa.project.costs[{a}].amountEur', 'type': float},
    'pa-requested-amount-euro-sum': {'required': True, 'target-key': 'pa.project.totals.requestedAmountEur', 'type': float},
    # Attachments common
    'pa-anh-vergleichsangebote': {'required': False, 'target-key': 'pa.attachments.comparativeOffers', 'type': bool},
    # Misc
    'warning-not-supported': {'required': False, 'target-key': 'warning.notSupported', 'type': str},
}
# --- QSM-specific fields (second variant) ---
# Same spec format as TEXT_MAPPING_COMMON. pdf_to_json.py merges these entries
# over the common ones when the variant is "QSM", and treats the presence of
# the 'pa-qsm-financing' field as the marker for that variant.
TEXT_MAPPING_QSM: dict = {
    'pa-qsm-financing': {
        'required': True,
        'target-key': 'pa.project.financing.qsm.code',
        'type': 'enum',
        # (key, label) pairs; unlike the VSM list there is no "-" placeholder entry
        'values': [
            ('vwv-3-2-1-1', 'Finanzierung zusätzlicher Lehr- und Seminarangebote'),
            ('vwv-3-2-1-2', 'Fachspezifische Studienprojekte'),
            ('vwv-3-2-1-3', 'Hochschuldidaktische Fort- und Weiterbildungsmaßnahmen'),
            ('vwv-3-2-2-1', 'Verbesserung/Ausbau von Serviceeinrichtungen sowie Infrastruktur'),
            ('vwv-3-2-2-2', 'Lehr- und Lernmaterialien'),
            ('vwv-3-2-2-3', 'Durchführung von Exkursionen'),
            ('vwv-3-2-2-4', 'Finanzierung von infrastrukturellen Begleit- und Anpassungsmaßnahmen'),
            ('vwv-3-2-3-1', 'Verbesserung der Beratungsangebote für Studierende'),
            ('vwv-3-2-3-2', 'Studium Generale und fachübergreifende Lehrangebote'),
            ('vwv-3-2-3-3', 'Sonstige Maßnahmen im Interesse der Studierendenschaft'),
        ]
    },
    # Checkbox flags, all optional
    'pa-qsm-stellenfinanzierungen': {'required': False, 'target-key': 'pa.project.financing.qsm.flags.stellenfinanzierungen', 'type': bool},
    'pa-qsm-studierende': {'required': False, 'target-key': 'pa.project.financing.qsm.flags.studierende', 'type': bool},
    'pa-qsm-individuell': {'required': False, 'target-key': 'pa.project.financing.qsm.flags.individuell', 'type': bool},
    'pa-qsm-exkursion-genehmigt': {'required': False, 'target-key': 'pa.project.financing.qsm.flags.exkursionGenehmigt', 'type': bool},
    'pa-qsm-exkursion-bezuschusst': {'required': False, 'target-key': 'pa.project.financing.qsm.flags.exkursionBezuschusst', 'type': bool},
    # Attachment checkbox that only this variant defines
    'pa-anh-fakultaet': {'required': False, 'target-key': 'pa.attachments.fakultaet', 'type': bool},
}
# --- VSM-specific fields (first variant; include for completeness) ---
# Same spec format as TEXT_MAPPING_COMMON. pdf_to_json.py merges these entries
# over the common ones when the variant is "VSM", and treats the presence of
# the 'pa-vsm-financing' field as the marker for that variant.
TEXT_MAPPING_VSM: dict = {
    'pa-vsm-financing': {
        'required': True,
        'target-key': 'pa.project.financing.vsm.code',
        'type': 'enum',
        'values': [
            # "-" is a placeholder value, so selecting it maps to None
            ('-', '-'),
            ('lhg-01', 'Hochschulpolitische, fachliche, soziale, wirtschaftliche und kulturelle Belange'),
            ('lhg-02', 'Mitwirkung an den Aufgaben der Hochschulen nach den §§ 2 bis 7'),
            ('lhg-03', 'Politische Bildung'),
            ('lhg-04', 'Förderung der Chancengleichheit und Abbau von Benachteiligungen'),
            ('lhg-05', 'Förderung der Integration ausländischer Studierender'),
            ('lhg-06', 'Förderung der sportlichen Aktivitäten'),
            ('lhg-07', 'Pflege der überregionalen Studierendenbeziehungen'),
        ]
    },
    # Checkbox flags, all optional
    'pa-vsm-aufgaben': {'required': False, 'target-key': 'pa.project.financing.vsm.flags.aufgaben', 'type': bool},
    'pa-vsm-individuell': {'required': False, 'target-key': 'pa.project.financing.vsm.flags.individuell', 'type': bool},
}

485
pdf_to_json.py Executable file
View File

@ -0,0 +1,485 @@
#!/usr/bin/env python3
"""
Extract PDF Form Data and Convert to JSON
This script extracts form data from a PDF file, maps it using the provided
field mappings, and emits a structured JSON payload.
Requires:
- PyPDF2
- pdf_field_mapping.py containing:
TEXT_MAPPING_COMMON, TEXT_MAPPING_QSM, TEXT_MAPPING_VSM, _PLACEHOLDER_VALUES
"""
from __future__ import annotations
import json
import re
from argparse import ArgumentParser
from dataclasses import dataclass, asdict, field
from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union
import PyPDF2
from pdf_field_mapping import (
TEXT_MAPPING_COMMON,
TEXT_MAPPING_QSM,
TEXT_MAPPING_VSM,
_PLACEHOLDER_VALUES,
)
# =========================
# Types / Data Model
# =========================
@dataclass
class Name:
    """First and last name of a person; both parts default to None."""

    first: Optional[str] = None
    last: Optional[str] = None
@dataclass
class Institution:
    """Institution an applicant belongs to."""

    name: Optional[str] = None
    # Enum key rather than the display label
    type: Optional[str] = None
@dataclass
class Contact:
    """Contact details of an applicant."""

    email: Optional[str] = None
    phone: Optional[str] = None
@dataclass
class Applicant:
    """Applicant section of the payload.

    The nested records are created through default factories, so every
    instance owns its own Institution, Name and Contact objects.
    """

    # Enum key rather than the display label
    type: Optional[str] = None
    institution: Institution = field(default_factory=Institution)
    name: Name = field(default_factory=Name)
    # Enum key rather than the display label
    course: Optional[str] = None
    # Enum key rather than the display label
    role: Optional[str] = None
    contact: Contact = field(default_factory=Contact)
@dataclass
class Dates:
    """Project start and end dates, stored as plain strings."""

    start: Optional[str] = None
    end: Optional[str] = None
@dataclass
class ParticipationFaculties:
    """One optional checkbox flag per participating faculty.

    None means no value was mapped for that checkbox.
    """

    inf: Optional[bool] = None
    esb: Optional[bool] = None
    ls: Optional[bool] = None
    tec: Optional[bool] = None
    tex: Optional[bool] = None
    nxt: Optional[bool] = None
    open: Optional[bool] = None
@dataclass
class Participation:
    """Wrapper around the per-faculty participation flags."""

    faculties: ParticipationFaculties = field(default_factory=ParticipationFaculties)
@dataclass
class Cost:
    """A single cost row: a label and its amount in euros."""

    name: Optional[str] = None
    amountEur: Optional[float] = None
@dataclass
class Totals:
    """Aggregated amounts of a project."""

    requestedAmountEur: Optional[float] = None
@dataclass
class VSMFlags:
    """Checkbox flags of the VSM form variant."""

    aufgaben: Optional[bool] = None
    individuell: Optional[bool] = None
@dataclass
class VSM:
    """Financing details of the VSM form variant."""

    # Enum key rather than the display label
    code: Optional[str] = None
    flags: VSMFlags = field(default_factory=VSMFlags)
@dataclass
class QSMFlags:
    """Checkbox flags of the QSM form variant."""

    stellenfinanzierungen: Optional[bool] = None
    studierende: Optional[bool] = None
    individuell: Optional[bool] = None
    exkursionGenehmigt: Optional[bool] = None
    exkursionBezuschusst: Optional[bool] = None
@dataclass
class QSM:
    """Financing details of the QSM form variant."""

    # Enum key rather than the display label
    code: Optional[str] = None
    flags: QSMFlags = field(default_factory=QSMFlags)
@dataclass
class Financing:
    """Variant-specific financing data; a variant without data stays None."""

    vsm: Optional[VSM] = None
    qsm: Optional[QSM] = None
@dataclass
class Project:
    """Project section of the payload.

    Nested records and the cost list come from default factories, so no
    mutable default is shared between instances.
    """

    name: Optional[str] = None
    dates: Dates = field(default_factory=Dates)
    participants: Optional[int] = None
    description: Optional[str] = None
    participation: Participation = field(default_factory=Participation)
    costs: List[Cost] = field(default_factory=list)
    totals: Totals = field(default_factory=Totals)
    financing: Financing = field(default_factory=Financing)
@dataclass
class Attachments:
    """Attachment checkboxes of the form."""

    comparativeOffers: Optional[bool] = None
    # Only the QSM variant defines this checkbox
    fakultaet: Optional[bool] = None
@dataclass
class WarningInfo:
    """Holds the value of the form's 'warning-not-supported' field."""

    notSupported: Optional[str] = None
@dataclass
class RootPayload:
    """Top-level object that is serialised to JSON."""

    # Holds the applicant, project and attachments sections
    pa: Any = field(default_factory=dict)
    warning: WarningInfo = field(default_factory=WarningInfo)
    # Validation results such as the list of missing required fields
    _validation: Dict[str, Any] = field(default_factory=dict)
# =========================
# Mapping helpers
# =========================
def _to_bool(v: Any) -> bool:
    """Interpret a raw checkbox value as a boolean.

    Strings are true unless they are empty or exactly "/Off"; every other
    value is judged by its ordinary truthiness.
    """
    if not isinstance(v, str):
        return bool(v)
    return v != "/Off" and v != ""
def _to_int(v: Any) -> Optional[int]:
    """Convert a raw field value to an int, or None if that is impossible.

    Placeholder values yield None. The trimmed text is first parsed as an
    integer; if that fails it is parsed as a float (a comma is read as the
    decimal separator, spaces are dropped) and truncated.
    """
    if v in _PLACEHOLDER_VALUES:
        return None
    parsers = (
        lambda text: int(text),
        lambda text: int(float(text.replace(",", ".").replace(" ", ""))),
    )
    for parse in parsers:
        try:
            return parse(str(v).strip())
        except Exception:
            # Best effort: fall through to the next parser, then to None
            pass
    return None
# A German-style grouped integer such as "1.234" or "12.345.678": here the
# dots are thousands separators, not a decimal point.
_de_grouped_int_re = re.compile(r"[+-]?\d{1,3}(?:\.\d{3})+")


def _to_float_de(v: Any) -> Optional[float]:
    """Convert a raw amount to a float, preferring German number notation.

    Args:
        v: Raw field value; usually a string, possibly already a number.

    Returns:
        The parsed amount, or None for placeholders and unparsable input.

    Interpretation rules:
        * Real numbers (int/float, but not bool) are returned as floats.
          Going through their text form would strip the decimal point and
          turn 12.5 into 125.0.
        * Whitespace (including non-breaking spaces) and a euro sign are
          ignored, so "12,50 €" parses.
        * If a comma is present it is the decimal separator and dots are
          thousands separators: "1.234,56" -> 1234.56.
        * Without a comma, dots are thousands separators only when they
          form complete groups of three digits ("1.234" -> 1234.0).
          Otherwise the dot is a decimal point ("12.50" -> 12.5), where
          removing it would inflate the amount by a factor of 100.
    """
    if v is None or isinstance(v, bool):
        return None
    if isinstance(v, (int, float)):
        try:
            return float(v)
        except OverflowError:
            return None

    raw = str(v)
    # Working on the text form keeps unhashable values (e.g. lists) from
    # crashing the set lookup. Both spellings are checked because one
    # placeholder ends in a space.
    if raw in _PLACEHOLDER_VALUES or raw.strip() in _PLACEHOLDER_VALUES:
        return None

    # str.split() without arguments also removes non-breaking spaces
    text = "".join(raw.split()).replace("€", "")
    if "," in text:
        text = text.replace(".", "").replace(",", ".")
    elif _de_grouped_int_re.fullmatch(text):
        text = text.replace(".", "")

    try:
        return float(text)
    except ValueError:
        return None
def _to_str(v: Any) -> Optional[str]:
    """Convert a raw field value to text, mapping placeholders to None.

    Args:
        v: Raw field value.

    Returns:
        ``str(v)`` unchanged (surrounding whitespace is kept), or None when
        the value is None or a placeholder.

    The text is compared with the placeholders both as-is and stripped.
    The as-is comparison is needed because one placeholder ends in a space
    and can never equal a stripped string.
    """
    if v is None:
        return None
    s = str(v)
    if s in _PLACEHOLDER_VALUES or s.strip() in _PLACEHOLDER_VALUES:
        return None
    return s
def _from_enum(v: Any, pairs: Iterable[Tuple[str, str]]) -> Optional[str]:
    """Resolve a raw value to the key of an enum given as (key, label) pairs.

    Placeholder values yield None. Keys are checked before labels, so a
    value that is one entry's key and another entry's label resolves to
    the key match. An unknown value yields None.
    """
    if v in _PLACEHOLDER_VALUES:
        return None
    text = str(v)
    key_hit = next((key for key, _ in pairs if key == text), None)
    if key_hit is not None:
        return key_hit
    return next((key for key, label in pairs if label == text), None)
def _coerce(v: Any, spec: Mapping[str, Any]) -> Any:
    """Convert a raw field value as requested by the spec's "type" entry.

    bool, int, float and str select the matching converter, the string
    "enum" resolves the value against the spec's "values" pairs, and any
    other type passes the value through unless it is a placeholder.
    """
    kind = spec.get("type")
    converters = (
        (bool, _to_bool),
        (int, _to_int),
        (float, _to_float_de),
        (str, _to_str),
    )
    for py_type, convert in converters:
        if kind is py_type:
            return convert(v)
    if kind == "enum":
        return _from_enum(v, spec.get("values", []))
    if v in _PLACEHOLDER_VALUES:
        return None
    return v
# Splits a path segment such as "costs[3]" into its name and list index
_key_index_re = re.compile(r"([^\[\]]+)\[(\d+)\]")


def _set_nested(root: Dict[str, Any], dotted: str, value: Any) -> None:
    """Store *value* in *root* at the dotted path *dotted*, creating parents.

    A segment of the form "name[3]" addresses index 3 of the list stored
    under "name". Missing lists are created and padded with empty dicts up
    to that index. Intermediate entries that are not of the expected
    container type are replaced by a fresh container.
    """

    def list_for(holder: Dict[str, Any], name: str, upto: int) -> List[Any]:
        # Ensure holder[name] is a list that has a slot at position `upto`
        if not isinstance(holder.get(name), list):
            holder[name] = []
        seq = holder[name]
        seq.extend({} for _ in range(upto + 1 - len(seq)))
        return seq

    *ancestors, leaf = dotted.split(".")
    node: Any = root

    # Walk (and create) every container in front of the final segment
    for segment in ancestors:
        hit = _key_index_re.fullmatch(segment)
        if hit is None:
            child = node.get(segment)
            if not isinstance(child, dict):
                child = node[segment] = {}
            node = child
            continue
        pos = int(hit.group(2))
        seq = list_for(node, hit.group(1), pos)
        if not isinstance(seq[pos], dict):
            seq[pos] = {}
        node = seq[pos]

    # Write the value at the final segment
    hit = _key_index_re.fullmatch(leaf)
    if hit is None:
        node[leaf] = value
    else:
        pos = int(hit.group(2))
        list_for(node, hit.group(1), pos)[pos] = value
def _merge_mapping(variant: str) -> Dict[str, Dict[str, Any]]:
    """Return the field mapping that applies to *variant*.

    The result is always a new dict. It holds the common fields, overlaid
    with the QSM or VSM fields when the variant name (case-insensitive,
    surrounding whitespace ignored) selects one of them.
    """
    wanted = (variant or "").strip().upper()
    merged = dict(TEXT_MAPPING_COMMON)
    if wanted == "QSM":
        merged.update(TEXT_MAPPING_QSM)
    elif wanted == "VSM":
        merged.update(TEXT_MAPPING_VSM)
    return merged
# Concrete cost-row field names: "pa-cost-<n>-name" and "pa-cost-<n>-amount-euro".
# The captured group is the row number as it appears in the field name.
_cost_name_pat = re.compile(r"^pa-cost-(\d+)-name$")
_cost_amt_pat = re.compile(r"^pa-cost-(\d+)-amount-euro$")
def detect_variant(form_fields: Mapping[str, Any]) -> str:
    """Guess the form variant from the field names present in the PDF.

    Returns "QSM" or "VSM" when the matching financing field exists (QSM is
    checked first), and "COMMON" otherwise.
    """
    markers = (
        ("pa-qsm-financing", "QSM"),
        ("pa-vsm-financing", "VSM"),
    )
    for field_name, variant in markers:
        if field_name in form_fields:
            return variant
    return "COMMON"
# A templated list segment such as "costs[{a}]", as used by the target keys of
# the cost-row specs; group 1 is the list name.
_template_index_re = re.compile(r"([^\[\]]+)\[\{[^{}]*\}\]")


def _target_present(node: Any, parts: List[str]) -> bool:
    """Return True if the path *parts* leads to a non-None value in *node*.

    A fixed segment "name[3]" must exist at exactly that index. A templated
    segment "name[{a}]" is satisfied when any element of the list provides
    the remainder of the path.
    """
    for pos, part in enumerate(parts):
        if not isinstance(node, dict):
            return False
        templated = _template_index_re.fullmatch(part)
        if templated:
            items = node.get(templated.group(1))
            if not isinstance(items, list):
                return False
            rest = parts[pos + 1:]
            return any(_target_present(item, rest) for item in items)
        indexed = _key_index_re.fullmatch(part)
        if indexed:
            items = node.get(indexed.group(1))
            idx = int(indexed.group(2))
            if not isinstance(items, list) or len(items) <= idx:
                return False
            node = items[idx]
        elif part in node:
            node = node[part]
        else:
            return False
    return node is not None


def map_form_to_payload(form_json: Dict[str, Dict[str, Any]], variant: str) -> Dict[str, Any]:
    """Map raw PDF form fields to the nested payload dict.

    Args:
        form_json: Field name -> field dict; the value is read from "/V".
        variant: Variant name passed to ``_merge_mapping`` to select the
            field mapping.

    Returns:
        The nested payload. Fields without a mapping entry and fields whose
        coerced value is empty are left out. Required fields without a
        value are listed as ``(field_name, target_key)`` tuples under
        ``_validation.missingRequired``.

    Cost rows are recognised by their field-name pattern instead of the
    templated mapping entries. Rows with neither a name nor an amount are
    dropped, and the remaining rows are stored as a gap-free list in row
    order. Writing them at their original index would pad the list with
    empty dicts for every dropped row. A templated required entry counts
    as satisfied once at least one cost row provides the value.
    """
    mapping = _merge_mapping(variant)
    out: Dict[str, Any] = {}
    # Row number (as found in the field name) -> partial cost row
    cost_rows: Dict[int, Dict[str, Any]] = {}

    for field_name, meta in form_json.items():
        raw_val = meta.get("/V")

        m_name = _cost_name_pat.match(field_name)
        if m_name:
            cost_rows.setdefault(int(m_name.group(1)), {})["name"] = _to_str(raw_val)
            continue
        m_amt = _cost_amt_pat.match(field_name)
        if m_amt:
            cost_rows.setdefault(int(m_amt.group(1)), {})["amountEur"] = _to_float_de(raw_val)
            continue

        spec = mapping.get(field_name)
        if not spec:
            continue
        coerced = _coerce(raw_val, spec)
        # Booleans are always kept: False is a real answer, not "empty"
        if not isinstance(coerced, bool) and coerced in (None, "", []):
            continue
        _set_nested(out, spec["target-key"], coerced)

    filled_rows = [
        cost_rows[number]
        for number in sorted(cost_rows)
        if cost_rows[number].get("name") is not None
        or cost_rows[number].get("amountEur") is not None
    ]
    if filled_rows:
        _set_nested(out, "pa.project.costs", filled_rows)

    missing = [
        (fname, spec["target-key"])
        for fname, spec in mapping.items()
        if spec.get("required")
        and not _target_present(out, spec["target-key"].split("."))
    ]
    if missing:
        out.setdefault("_validation", {})["missingRequired"] = missing
    return out
# =========================
# Builders to dataclasses
# =========================
def _get(d: Mapping[str, Any], path: str, default=None):
    """Look up the dotted *path* in nested mappings.

    Returns *default* as soon as a segment is missing or the current value
    is not a mapping; otherwise returns the value found, which may be None.
    """
    node: Any = d
    for key in path.split("."):
        if isinstance(node, Mapping) and key in node:
            node = node[key]
        else:
            return default
    return node
def payload_to_model(payload: Dict[str, Any]) -> RootPayload:
    """Build the dataclass model from a mapped payload dict.

    Missing sections and values become None. A financing variant is only
    instantiated when the payload holds data for it. The "pa" sections are
    stored on the result as plain dicts.
    """
    # --- Applicant ---
    app_src = _get(payload, "pa.applicant", {}) or {}
    institution = Institution(
        name=_get(app_src, "institution.name"),
        type=_get(app_src, "institution.type"),
    )
    person = Name(
        first=_get(app_src, "name.first"),
        last=_get(app_src, "name.last"),
    )
    contact = Contact(
        email=_get(app_src, "contact.email"),
        phone=_get(app_src, "contact.phone"),
    )
    applicant = Applicant(
        type=app_src.get("type"),
        institution=institution,
        name=person,
        course=app_src.get("course"),
        role=app_src.get("role"),
        contact=contact,
    )

    # --- Project ---
    proj_src = _get(payload, "pa.project", {}) or {}

    # Entries that are not mappings are ignored
    cost_rows = proj_src.get("costs", []) or []
    costs = [
        Cost(name=row.get("name"), amountEur=row.get("amountEur"))
        for row in cost_rows
        if isinstance(row, Mapping)
    ]

    vsm_src = _get(proj_src, "financing.vsm", {}) or {}
    vsm = (
        VSM(
            code=vsm_src.get("code"),
            flags=VSMFlags(
                aufgaben=_get(vsm_src, "flags.aufgaben"),
                individuell=_get(vsm_src, "flags.individuell"),
            ),
        )
        if vsm_src
        else None
    )

    qsm_src = _get(proj_src, "financing.qsm", {}) or {}
    qsm = (
        QSM(
            code=qsm_src.get("code"),
            flags=QSMFlags(
                stellenfinanzierungen=_get(qsm_src, "flags.stellenfinanzierungen"),
                studierende=_get(qsm_src, "flags.studierende"),
                individuell=_get(qsm_src, "flags.individuell"),
                exkursionGenehmigt=_get(qsm_src, "flags.exkursionGenehmigt"),
                exkursionBezuschusst=_get(qsm_src, "flags.exkursionBezuschusst"),
            ),
        )
        if qsm_src
        else None
    )

    faculties = ParticipationFaculties(
        inf=_get(proj_src, "participation.faculties.inf"),
        esb=_get(proj_src, "participation.faculties.esb"),
        ls=_get(proj_src, "participation.faculties.ls"),
        tec=_get(proj_src, "participation.faculties.tec"),
        tex=_get(proj_src, "participation.faculties.tex"),
        nxt=_get(proj_src, "participation.faculties.nxt"),
        open=_get(proj_src, "participation.faculties.open"),
    )
    project = Project(
        name=proj_src.get("name"),
        dates=Dates(
            start=_get(proj_src, "dates.start"),
            end=_get(proj_src, "dates.end"),
        ),
        participants=proj_src.get("participants"),
        description=proj_src.get("description"),
        participation=Participation(faculties=faculties),
        costs=costs,
        totals=Totals(requestedAmountEur=_get(proj_src, "totals.requestedAmountEur")),
        financing=Financing(vsm=vsm, qsm=qsm),
    )

    # --- Attachments ---
    att_src = _get(payload, "pa.attachments", {}) or {}
    attachments = Attachments(
        comparativeOffers=att_src.get("comparativeOffers"),
        fakultaet=att_src.get("fakultaet"),
    )

    # --- Warning ---
    warn_src = payload.get("warning", {}) or {}

    sections = {
        "applicant": asdict(applicant),
        "project": asdict(project),
        "attachments": asdict(attachments),
    }
    return RootPayload(
        pa=sections,
        warning=WarningInfo(notSupported=warn_src.get("notSupported")),
        _validation=payload.get("_validation", {}),
    )
# =========================
# PDF reading + end-to-end
# =========================
def read_pdf_fields(pdf_file: str) -> Dict[str, Dict[str, Any]]:
    """Read the form fields of the PDF at *pdf_file*.

    Returns a dict from field name to field data. A falsy entry is replaced
    by an empty dict, and a PDF without form fields yields an empty dict.
    """
    with open(pdf_file, "rb") as handle:
        raw_fields = PyPDF2.PdfReader(handle).get_fields() or {}
        result: Dict[str, Dict[str, Any]] = {}
        for name, data in raw_fields.items():
            # Guarantee a dict per field so callers can use .get() directly
            result[name] = data or {}
        return result
def pdf_to_payload(pdf_file: str, variant: Optional[str] = None) -> RootPayload:
    """Read a PDF form and return its content as a dataclass model.

    When *variant* is empty or None the variant is detected from the field
    names found in the PDF.
    """
    fields = read_pdf_fields(pdf_file)
    chosen = variant if variant else detect_variant(fields)
    return payload_to_model(map_form_to_payload(fields, chosen))
def pdf_to_json(pdf_file: str, variant: Optional[str] = None) -> str:
    """Return the mapped payload of a PDF form as a JSON string.

    The output is indented by two spaces and keeps non-ASCII characters
    unescaped.
    """
    as_dict = asdict(pdf_to_payload(pdf_file, variant=variant))
    return json.dumps(as_dict, ensure_ascii=False, indent=2)
# =========================
# CLI
# =========================
def _build_arg_parser() -> ArgumentParser:
    """Create the command-line parser.

    The variant is upper-cased before it is validated, so every choice is
    accepted in any letter case instead of only "auto".
    """
    parser = ArgumentParser(description="Extract PDF Form Data and Convert to structured JSON")
    parser.add_argument("pdf_file", help="Path to the PDF file")
    parser.add_argument(
        "--variant",
        type=str.upper,
        choices=["QSM", "VSM", "COMMON", "AUTO"],
        default="AUTO",
        help="Form variant (default: AUTO)",
    )
    return parser


def main(argv: Optional[List[str]] = None) -> None:
    """Run the command-line interface and print the JSON to stdout.

    Args:
        argv: Argument list without the program name; None makes argparse
            read ``sys.argv``.
    """
    args = _build_arg_parser().parse_args(argv)
    # "AUTO" means: let pdf_to_json detect the variant from the PDF
    v = None if args.variant == "AUTO" else args.variant
    print(pdf_to_json(args.pdf_file, variant=v))


if __name__ == "__main__":
    main()