Working mapper
This commit is contained in:
parent
e1fb0936fc
commit
34589826e5
3
.gitignore
vendored
3
.gitignore
vendored
@ -160,3 +160,6 @@ cython_debug/
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
||||
|
||||
# PDF
|
||||
*.pdf
|
||||
!assets/*.pdf
|
||||
|
||||
133
pdf_field_mapping.py
Normal file
133
pdf_field_mapping.py
Normal file
@ -0,0 +1,133 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
PDF Field Mapping
|
||||
|
||||
This module provides a mapping of PDF fields to their corresponding keys in the application data.
|
||||
"""
|
||||
|
||||
# Raw form values that count as "nothing entered". The coercion helpers in
# pdf_to_json.py map any of these to ``None``.
# NOTE(review): "JJJJ-MM" looks like the date hint text and "/\\Fld@default "
# like a LaTeX form default that leaked into the PDF -- confirm with the form.
_PLACEHOLDER_VALUES: set = {
    None,
    "",
    "-",
    "JJJJ-MM",
    "/\\Fld@default ",
}
|
||||
|
||||
# --- COMMON fields (shared across variants) ---
|
||||
TEXT_MAPPING_COMMON: dict = {
|
||||
# Applicant
|
||||
'pa-applicant-type': {
|
||||
'required': True,
|
||||
'target-key': 'pa.applicant.type',
|
||||
'type': 'enum',
|
||||
'values': [('person', 'Person'), ('institution', 'Institution')]
|
||||
},
|
||||
'pa-institution-type': {
|
||||
'required': True,
|
||||
'target-key': 'pa.applicant.institution.type',
|
||||
'type': 'enum',
|
||||
'values': [
|
||||
('-', '-'),
|
||||
('stud-fs', 'Fachschaft'),
|
||||
('stud-rf', 'STUPA-Referat'),
|
||||
('stud-hg', 'Studentische Hochschulgruppe'),
|
||||
('faculty', 'Fakultät'),
|
||||
('hs-institution', 'Hochschuleinrichtung'),
|
||||
]
|
||||
},
|
||||
'pa-institution': {'required': True, 'target-key': 'pa.applicant.institution.name', 'type': str},
|
||||
'pa-first-name': {'required': True, 'target-key': 'pa.applicant.name.first'},
|
||||
'pa-last-name': {'required': True, 'target-key': 'pa.applicant.name.last'},
|
||||
'pa-email': {'required': True, 'target-key': 'pa.applicant.contact.email'},
|
||||
'pa-phone': {'required': False, 'target-key': 'pa.applicant.contact.phone'},
|
||||
|
||||
'pa-course': {
|
||||
'required': True,
|
||||
'target-key': 'pa.applicant.course',
|
||||
'type': 'enum',
|
||||
'values': [('-', '-'), ('INF', 'INF'), ('ESB', 'ESB'), ('LS', 'LS'), ('TEC', 'TEC'), ('TEX', 'TEX'), ('NXT', 'NXT')]
|
||||
},
|
||||
'pa-role': {
|
||||
'required': True,
|
||||
'target-key': 'pa.applicant.role',
|
||||
'type': 'enum',
|
||||
'values': [
|
||||
('-', '-'),
|
||||
('Student', 'Student'),
|
||||
('Professor', 'Professor'),
|
||||
('Mitarbeiter', 'Mitarbeiter'),
|
||||
('ASTA', 'ASTA'),
|
||||
('Referatsleitung', 'Referatsleitung'),
|
||||
('Fachschaftsvorstand', 'Fachschaftsvorstand'),
|
||||
]
|
||||
},
|
||||
|
||||
# Project core
|
||||
'pa-project-name': {'required': True, 'target-key': 'pa.project.name', 'type': str},
|
||||
'pa-start-date': {'required': True, 'target-key': 'pa.project.dates.start', 'type': str},
|
||||
'pa-end-date': {'required': False, 'target-key': 'pa.project.dates.end', 'type': str},
|
||||
'pa-participants': {'required': False, 'target-key': 'pa.project.participants', 'type': int},
|
||||
'pa-project-description': {'required': True, 'target-key': 'pa.project.description', 'type': str},
|
||||
|
||||
# Participation (checkboxes)
|
||||
'pa-participating-faculties-inf': {'required': False, 'target-key': 'pa.project.participation.faculties.inf', 'type': bool},
|
||||
'pa-participating-faculties-esb': {'required': False, 'target-key': 'pa.project.participation.faculties.esb', 'type': bool},
|
||||
'pa-participating-faculties-ls': {'required': False, 'target-key': 'pa.project.participation.faculties.ls', 'type': bool},
|
||||
'pa-participating-faculties-tec': {'required': False, 'target-key': 'pa.project.participation.faculties.tec', 'type': bool},
|
||||
'pa-participating-faculties-tex': {'required': False, 'target-key': 'pa.project.participation.faculties.tex', 'type': bool},
|
||||
'pa-participating-faculties-nxt': {'required': False, 'target-key': 'pa.project.participation.faculties.nxt', 'type': bool},
|
||||
'pa-participating-faculties-open': {'required': False, 'target-key': 'pa.project.participation.faculties.open', 'type': bool},
|
||||
|
||||
# Costs & totals
|
||||
'pa-cost-{a;1:24}-name': {'required': True, 'target-key': 'pa.project.costs[{a}].name', 'type': str},
|
||||
'pa-cost-{a;1:24}-amount-euro': {'required': True, 'target-key': 'pa.project.costs[{a}].amountEur', 'type': float},
|
||||
'pa-requested-amount-euro-sum': {'required': True, 'target-key': 'pa.project.totals.requestedAmountEur', 'type': float},
|
||||
|
||||
# Attachments common
|
||||
'pa-anh-vergleichsangebote': {'required': False, 'target-key': 'pa.attachments.comparativeOffers', 'type': bool},
|
||||
|
||||
# Misc
|
||||
'warning-not-supported': {'required': False, 'target-key': 'warning.notSupported', 'type': str},
|
||||
}
|
||||
|
||||
# --- QSM-specific fields (second variant) ---
|
||||
# Field specs that only exist in the QSM form variant: the financing category
# (enum of "vwv-..." codes), its checkbox flags, and the faculty attachment.
TEXT_MAPPING_QSM = {
    "pa-qsm-financing": {
        "required": True,
        "target-key": "pa.project.financing.qsm.code",
        "type": "enum",
        "values": [
            ("vwv-3-2-1-1", "Finanzierung zusätzlicher Lehr- und Seminarangebote"),
            ("vwv-3-2-1-2", "Fachspezifische Studienprojekte"),
            ("vwv-3-2-1-3", "Hochschuldidaktische Fort- und Weiterbildungsmaßnahmen"),
            ("vwv-3-2-2-1", "Verbesserung/Ausbau von Serviceeinrichtungen sowie Infrastruktur"),
            ("vwv-3-2-2-2", "Lehr- und Lernmaterialien"),
            ("vwv-3-2-2-3", "Durchführung von Exkursionen"),
            ("vwv-3-2-2-4", "Finanzierung von infrastrukturellen Begleit- und Anpassungsmaßnahmen"),
            ("vwv-3-2-3-1", "Verbesserung der Beratungsangebote für Studierende"),
            ("vwv-3-2-3-2", "Studium Generale und fachübergreifende Lehrangebote"),
            ("vwv-3-2-3-3", "Sonstige Maßnahmen im Interesse der Studierendenschaft"),
        ],
    },
    # Checkbox flags
    "pa-qsm-stellenfinanzierungen": {
        "required": False,
        "target-key": "pa.project.financing.qsm.flags.stellenfinanzierungen",
        "type": bool,
    },
    "pa-qsm-studierende": {
        "required": False,
        "target-key": "pa.project.financing.qsm.flags.studierende",
        "type": bool,
    },
    "pa-qsm-individuell": {
        "required": False,
        "target-key": "pa.project.financing.qsm.flags.individuell",
        "type": bool,
    },
    "pa-qsm-exkursion-genehmigt": {
        "required": False,
        "target-key": "pa.project.financing.qsm.flags.exkursionGenehmigt",
        "type": bool,
    },
    "pa-qsm-exkursion-bezuschusst": {
        "required": False,
        "target-key": "pa.project.financing.qsm.flags.exkursionBezuschusst",
        "type": bool,
    },
    # Attachment checkbox
    "pa-anh-fakultaet": {
        "required": False,
        "target-key": "pa.attachments.fakultaet",
        "type": bool,
    },
}
|
||||
|
||||
# --- VSM-specific fields (first variant; include for completeness) ---
|
||||
# Field specs that only exist in the VSM form variant: the financing category
# (enum of "lhg-..." codes, with "-" as the unselected entry) and two checkbox flags.
TEXT_MAPPING_VSM = {
    "pa-vsm-financing": {
        "required": True,
        "target-key": "pa.project.financing.vsm.code",
        "type": "enum",
        "values": [
            ("-", "-"),
            ("lhg-01", "Hochschulpolitische, fachliche, soziale, wirtschaftliche und kulturelle Belange"),
            ("lhg-02", "Mitwirkung an den Aufgaben der Hochschulen nach den §§ 2 bis 7"),
            ("lhg-03", "Politische Bildung"),
            ("lhg-04", "Förderung der Chancengleichheit und Abbau von Benachteiligungen"),
            ("lhg-05", "Förderung der Integration ausländischer Studierender"),
            ("lhg-06", "Förderung der sportlichen Aktivitäten"),
            ("lhg-07", "Pflege der überregionalen Studierendenbeziehungen"),
        ],
    },
    # Checkbox flags
    "pa-vsm-aufgaben": {
        "required": False,
        "target-key": "pa.project.financing.vsm.flags.aufgaben",
        "type": bool,
    },
    "pa-vsm-individuell": {
        "required": False,
        "target-key": "pa.project.financing.vsm.flags.individuell",
        "type": bool,
    },
}
|
||||
485
pdf_to_json.py
Executable file
485
pdf_to_json.py
Executable file
@ -0,0 +1,485 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Extract PDF Form Data and Convert to JSON
|
||||
|
||||
This script extracts form data from a PDF file, maps it using the provided
|
||||
field mappings, and emits a structured JSON payload.
|
||||
|
||||
Requires:
|
||||
- PyPDF2
|
||||
- pdf_field_mapping.py containing:
|
||||
TEXT_MAPPING_COMMON, TEXT_MAPPING_QSM, TEXT_MAPPING_VSM, _PLACEHOLDER_VALUES
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from argparse import ArgumentParser
|
||||
from dataclasses import dataclass, asdict, field
|
||||
from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union
|
||||
|
||||
import PyPDF2
|
||||
from pdf_field_mapping import (
|
||||
TEXT_MAPPING_COMMON,
|
||||
TEXT_MAPPING_QSM,
|
||||
TEXT_MAPPING_VSM,
|
||||
_PLACEHOLDER_VALUES,
|
||||
)
|
||||
|
||||
# =========================
|
||||
# Types / Data Model
|
||||
# =========================
|
||||
|
||||
@dataclass
class Name:
    """First and last name of the applicant; both unset by default."""

    first: Optional[str] = None
    last: Optional[str] = None
|
||||
|
||||
@dataclass
class Institution:
    """Institution on whose behalf the application is made."""

    name: Optional[str] = None
    # Key of the selected institution-type enum entry, not its label.
    type: Optional[str] = None
|
||||
|
||||
@dataclass
class Contact:
    """Contact details of the applicant."""

    email: Optional[str] = None
    phone: Optional[str] = None
|
||||
|
||||
@dataclass
class Applicant:
    """Applicant section of the payload.

    ``type``, ``course`` and ``role`` hold enum keys; the nested records each
    get a fresh default instance per applicant.
    """

    type: Optional[str] = None  # enum key
    institution: Institution = field(default_factory=Institution)
    name: Name = field(default_factory=Name)
    course: Optional[str] = None  # enum key
    role: Optional[str] = None  # enum key
    contact: Contact = field(default_factory=Contact)
|
||||
|
||||
@dataclass
class Dates:
    """Project start and end, stored as strings."""

    start: Optional[str] = None
    end: Optional[str] = None
|
||||
|
||||
@dataclass
class ParticipationFaculties:
    """Checkbox state per participating faculty; ``None`` means not present."""

    inf: Optional[bool] = None
    esb: Optional[bool] = None
    ls: Optional[bool] = None
    tec: Optional[bool] = None
    tex: Optional[bool] = None
    nxt: Optional[bool] = None
    # Named after the payload key; intentionally matches the builtin's name.
    open: Optional[bool] = None
|
||||
|
||||
@dataclass
class Participation:
    """Wrapper for the participation section; currently only the faculties."""

    faculties: ParticipationFaculties = field(default_factory=ParticipationFaculties)
|
||||
|
||||
@dataclass
class Cost:
    """One row of the cost table: a description and an amount in euro."""

    name: Optional[str] = None
    amountEur: Optional[float] = None
|
||||
|
||||
@dataclass
class Totals:
    """Aggregate amounts of the application."""

    requestedAmountEur: Optional[float] = None
|
||||
|
||||
@dataclass
class VSMFlags:
    """Checkbox flags of the VSM financing section."""

    aufgaben: Optional[bool] = None
    individuell: Optional[bool] = None
|
||||
|
||||
@dataclass
class VSM:
    """VSM financing: selected category key plus its checkbox flags."""

    code: Optional[str] = None  # enum key
    flags: VSMFlags = field(default_factory=VSMFlags)
|
||||
|
||||
@dataclass
class QSMFlags:
    """Checkbox flags of the QSM financing section."""

    stellenfinanzierungen: Optional[bool] = None
    studierende: Optional[bool] = None
    individuell: Optional[bool] = None
    exkursionGenehmigt: Optional[bool] = None
    exkursionBezuschusst: Optional[bool] = None
|
||||
|
||||
@dataclass
class QSM:
    """QSM financing: selected category key plus its checkbox flags."""

    code: Optional[str] = None  # enum key
    flags: QSMFlags = field(default_factory=QSMFlags)
|
||||
|
||||
@dataclass
class Financing:
    """Financing section; each variant block stays ``None`` unless supplied."""

    vsm: Optional[VSM] = None
    qsm: Optional[QSM] = None
|
||||
|
||||
@dataclass
class Project:
    """Project section of the payload.

    Nested records and the cost list are created fresh for every instance.
    """

    name: Optional[str] = None
    dates: Dates = field(default_factory=Dates)
    participants: Optional[int] = None
    description: Optional[str] = None
    participation: Participation = field(default_factory=Participation)
    costs: List[Cost] = field(default_factory=list)
    totals: Totals = field(default_factory=Totals)
    financing: Financing = field(default_factory=Financing)
|
||||
|
||||
@dataclass
class Attachments:
    """Attachment checkboxes of the application."""

    comparativeOffers: Optional[bool] = None
    # Only the QSM variant's mapping provides this field.
    fakultaet: Optional[bool] = None
|
||||
|
||||
@dataclass
class WarningInfo:
    """Content of the form's ``warning-not-supported`` field, if any."""

    notSupported: Optional[str] = None
|
||||
|
||||
@dataclass
class RootPayload:
    """Top-level result object that is serialised to JSON."""

    # Plain dict holding the applicant, project and attachments sections.
    pa: Any = field(default_factory=dict)
    warning: WarningInfo = field(default_factory=WarningInfo)
    # Validation output of the mapping step (e.g. "missingRequired").
    _validation: Dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
# =========================
|
||||
# Mapping helpers
|
||||
# =========================
|
||||
|
||||
def _to_bool(v: Any) -> bool:
    """Interpret a raw form value as a checkbox state.

    A string counts as checked unless it is ``"/Off"`` or empty. Any other
    value is judged by its ordinary truthiness.
    """
    if not isinstance(v, str):
        return bool(v)
    return v != "/Off" and v != ""
|
||||
|
||||
def _to_int(v: Any) -> Optional[int]:
    """Convert a raw form value to ``int``.

    Returns ``None`` for placeholder values and for anything that cannot be
    parsed. A plain integer string is tried first; otherwise the text is
    parsed as a float (comma accepted as decimal separator, spaces removed)
    and truncated.
    """
    if v is None:
        return None
    # Compare the textual form: raw values may be unhashable (e.g. a list),
    # and a direct set-membership test would raise TypeError for those.
    text = str(v)
    if text in _PLACEHOLDER_VALUES or text.strip() in _PLACEHOLDER_VALUES:
        return None
    text = text.strip()
    try:
        return int(text)
    except ValueError:
        pass
    try:
        return int(float(text.replace(",", ".").replace(" ", "")))
    except (ValueError, OverflowError):
        # ValueError: not numeric (or NaN); OverflowError: infinity.
        return None
|
||||
|
||||
def _to_float_de(v: Any) -> Optional[float]:
    """Convert a raw form value in German number notation to ``float``.

    In text input ``.`` is a thousands separator and ``,`` the decimal
    separator (``"1.234,56"`` -> ``1234.56``). Values that are already
    numeric are converted directly. Returns ``None`` for placeholders and
    unparsable input.
    """
    from decimal import Decimal

    if v is None:
        return None
    # A numeric value must not go through the text path: str(12.5) is "12.5"
    # and dropping the "." as a thousands separator would yield 125.0.
    if isinstance(v, (int, float, Decimal)) and not isinstance(v, bool):
        try:
            return float(v)
        except (OverflowError, ValueError):
            return None
    # Compare the textual form so unhashable raw values cannot raise.
    text = str(v)
    if text in _PLACEHOLDER_VALUES or text.strip() in _PLACEHOLDER_VALUES:
        return None
    normalized = text.replace(".", "").replace(" ", "").replace(",", ".")
    try:
        return float(normalized)
    except ValueError:
        return None
|
||||
|
||||
def _to_str(v: Any) -> Optional[str]:
    """Convert a raw form value to ``str``, mapping placeholders to ``None``.

    The text is returned unmodified (not stripped) when it is a real value.
    """
    if v is None:
        return None
    s = str(v)
    # Test the exact text as well as the stripped text: some placeholders
    # carry significant whitespace themselves (e.g. a trailing space) and
    # would never match if only the stripped form were checked.
    if s in _PLACEHOLDER_VALUES or s.strip() in _PLACEHOLDER_VALUES:
        return None
    return s
|
||||
|
||||
def _from_enum(v: Any, pairs: Iterable[Tuple[str, str]]) -> Optional[str]:
    """Resolve a raw form value to the key of an enum entry.

    ``pairs`` is an iterable of ``(key, label)``. A value equal to any key
    wins over a value equal to a label; among labels the first match wins.
    Returns ``None`` for placeholders and for values matching no entry.
    """
    if v is None:
        return None
    # Compare the textual form so unhashable raw values cannot raise.
    s = str(v)
    if s in _PLACEHOLDER_VALUES:
        return None
    # Single pass so that one-shot iterables work too; a key match anywhere
    # still takes precedence over an earlier label match.
    label_match: Optional[str] = None
    for key, label in pairs:
        if s == key:
            return key
        if label_match is None and s == label:
            label_match = key
    return label_match
|
||||
|
||||
def _coerce(v: Any, spec: Mapping[str, Any]) -> Any:
    """Convert raw value ``v`` according to ``spec["type"]``.

    Supported types are ``bool``, ``int``, ``float``, ``str`` and the string
    ``"enum"`` (which uses ``spec["values"]``). With any other or no type the
    value is passed through unchanged, except that placeholders become ``None``.
    """
    t = spec.get("type")
    if t is bool:
        return _to_bool(v)
    if t is int:
        return _to_int(v)
    if t is float:
        return _to_float_de(v)
    if t is str:
        return _to_str(v)
    if t == "enum":
        return _from_enum(v, spec.get("values", []))
    try:
        is_placeholder = v in _PLACEHOLDER_VALUES
    except TypeError:
        # Unhashable values (e.g. lists) cannot be placeholders.
        is_placeholder = False
    return None if is_placeholder else v
|
||||
|
||||
# Matches one path segment of the form ``name[3]``: group 1 is the key,
# group 2 the list index.
_key_index_re = re.compile(r"([^\[\]]+)\[(\d+)\]")


def _padded_list(node: Dict[str, Any], key: str, index: int) -> List[Any]:
    """Return ``node[key]`` as a list that is long enough to hold ``index``.

    A missing or non-list entry is replaced by a new list; missing slots are
    filled with empty dicts.
    """
    items = node.get(key)
    if not isinstance(items, list):
        items = node[key] = []
    while len(items) <= index:
        items.append({})
    return items


def _set_nested(root: Dict[str, Any], dotted: str, value: Any) -> None:
    """Store ``value`` in ``root`` at the dotted path ``dotted``.

    Segments are separated by ``.``; a segment like ``costs[2]`` addresses a
    list slot. Intermediate dicts and lists are created as needed, and an
    intermediate entry of the wrong kind is overwritten.
    """
    *ancestors, leaf = dotted.split(".")
    node: Any = root

    # Walk (and create) everything above the final segment.
    for segment in ancestors:
        indexed = _key_index_re.fullmatch(segment)
        if indexed is None:
            child = node.get(segment)
            if not isinstance(child, dict):
                child = node[segment] = {}
            node = child
            continue
        position = int(indexed.group(2))
        items = _padded_list(node, indexed.group(1), position)
        if not isinstance(items[position], dict):
            items[position] = {}
        node = items[position]

    # Assign the final segment.
    indexed = _key_index_re.fullmatch(leaf)
    if indexed is None:
        node[leaf] = value
    else:
        position = int(indexed.group(2))
        _padded_list(node, indexed.group(1), position)[position] = value
|
||||
|
||||
def _merge_mapping(variant: str) -> Dict[str, Dict[str, Any]]:
    """Return a new dict with the field specs that apply to ``variant``.

    The common specs are always included; ``"QSM"`` or ``"VSM"`` (matched
    case-insensitively, surrounding whitespace ignored) adds the
    variant-specific specs on top. Any other value yields the common specs only.
    """
    variant_key = (variant or "").strip().upper()
    merged = dict(TEXT_MAPPING_COMMON)
    if variant_key == "QSM":
        merged.update(TEXT_MAPPING_QSM)
    elif variant_key == "VSM":
        merged.update(TEXT_MAPPING_VSM)
    return merged
|
||||
|
||||
# Concrete cost-table field names; group 1 is the row number as it appears
# in the form (used 1-based by the mapper).
_cost_name_pat = re.compile(
    r"^pa-cost-(\d+)-name$"
)
_cost_amt_pat = re.compile(
    r"^pa-cost-(\d+)-amount-euro$"
)
|
||||
|
||||
def detect_variant(form_fields: Mapping[str, Any]) -> str:
    """Guess the form variant from the field names that are present.

    Returns ``"QSM"`` if the QSM financing field exists, otherwise ``"VSM"``
    if the VSM financing field exists, otherwise ``"COMMON"``.
    """
    # Order matters: QSM is checked before VSM.
    markers = (
        ("pa-qsm-financing", "QSM"),
        ("pa-vsm-financing", "VSM"),
    )
    for field_name, variant in markers:
        if field_name in form_fields:
            return variant
    return "COMMON"
|
||||
|
||||
|
||||
# Matches a template slot such as ``{a}`` or ``{a;1:24}`` in a mapping key.
_template_slot_pat = re.compile(r"\{[^{}]*\}")


def _resolve_path(tree: Any, dotted: str) -> Any:
    """Return the value at ``dotted`` inside ``tree``, or ``None`` if absent.

    Understands the same ``key[idx]`` segments as ``_set_nested``. Never
    raises for paths that run into a scalar or a too-short list.
    """
    node = tree
    for part in dotted.split("."):
        indexed = _key_index_re.fullmatch(part)
        key = indexed.group(1) if indexed else part
        if not isinstance(node, dict) or key not in node:
            return None
        node = node[key]
        if indexed:
            position = int(indexed.group(2))
            if not isinstance(node, list) or len(node) <= position:
                return None
            node = node[position]
    return node


def map_form_to_payload(form_json: Dict[str, Dict[str, Any]], variant: str) -> Dict[str, Any]:
    """Map raw PDF form fields to the nested payload dict.

    ``form_json`` maps field names to dicts whose ``"/V"`` entry is the raw
    value. Fields are converted according to the mapping for ``variant``;
    unknown fields and empty values are skipped (booleans are always kept).

    Cost rows (``pa-cost-<n>-name`` / ``pa-cost-<n>-amount-euro``) are
    collected per row number; rows with neither name nor amount are dropped
    and the remaining rows are stored, in row order, as a gap-free list under
    ``pa.project.costs``.

    If required fields are missing, ``_validation.missingRequired`` lists
    them as ``(field name, target key)`` tuples. For the templated cost specs
    every kept row is checked individually.
    """
    mapping = _merge_mapping(variant)
    out: Dict[str, Any] = {}

    # Cost cells keyed by the row number used in the form.
    cost_rows: Dict[int, Dict[str, Any]] = {}

    for field_name, meta in form_json.items():
        raw_val = (meta or {}).get("/V")

        m_name = _cost_name_pat.match(field_name)
        if m_name:
            cost_rows.setdefault(int(m_name.group(1)), {})["name"] = _to_str(raw_val)
            continue
        m_amt = _cost_amt_pat.match(field_name)
        if m_amt:
            cost_rows.setdefault(int(m_amt.group(1)), {})["amountEur"] = _to_float_de(raw_val)
            continue

        spec = mapping.get(field_name)
        if not spec:
            continue
        coerced = _coerce(raw_val, spec)
        # False is a meaningful checkbox state; only drop real "no value" results.
        if not isinstance(coerced, bool) and coerced in (None, "", []):
            continue
        _set_nested(out, spec["target-key"], coerced)

    # Keep only rows that carry data. Building the list directly (instead of
    # writing each row to its original index) avoids empty filler entries for
    # skipped rows and is safe for any row number, including 0.
    kept = [
        (number, row)
        for number, row in sorted(cost_rows.items())
        if row.get("name") is not None or row.get("amountEur") is not None
    ]
    if kept:
        _set_nested(out, "pa.project.costs", [row for _, row in kept])

    missing = []
    for fname, spec in mapping.items():
        if not spec.get("required"):
            continue
        tkey = spec["target-key"]
        if _template_slot_pat.search(tkey):
            # Templated specs describe the cost rows. Expand them against
            # the rows actually present: the literal template path can never
            # exist in ``out`` and would always be reported as missing.
            for position, (number, _row) in enumerate(kept):
                concrete_key = _template_slot_pat.sub(str(position), tkey)
                if _resolve_path(out, concrete_key) is None:
                    concrete_name = _template_slot_pat.sub(str(number), fname)
                    missing.append((concrete_name, concrete_key))
            continue
        if _resolve_path(out, tkey) is None:
            missing.append((fname, tkey))

    if missing:
        out.setdefault("_validation", {})["missingRequired"] = missing

    return out
|
||||
|
||||
|
||||
# =========================
|
||||
# Builders to dataclasses
|
||||
# =========================
|
||||
|
||||
def _get(d: Mapping[str, Any], path: str, default=None):
    """Look up a dot-separated ``path`` in nested mappings.

    Returns ``default`` as soon as a segment is missing or the current value
    is not a mapping; otherwise returns the value found (which may be ``None``).
    """
    node: Any = d
    for key in path.split("."):
        if isinstance(node, Mapping) and key in node:
            node = node[key]
        else:
            return default
    return node
|
||||
|
||||
def payload_to_model(payload: Dict[str, Any]) -> RootPayload:
    """Build a ``RootPayload`` from the nested dict of ``map_form_to_payload``.

    Missing sections or values simply become ``None`` (or an empty cost
    list). The VSM / QSM financing blocks are only created when the payload
    contains data for them. The ``pa`` attribute of the result is a plain
    dict with the keys ``applicant``, ``project`` and ``attachments``.
    """
    applicant_src = _get(payload, "pa.applicant", {}) or {}
    project_src = _get(payload, "pa.project", {}) or {}
    attachments_src = _get(payload, "pa.attachments", {}) or {}
    warning_src = payload.get("warning", {}) or {}

    # --- Applicant ---
    institution = Institution(
        name=_get(applicant_src, "institution.name"),
        type=_get(applicant_src, "institution.type"),
    )
    person_name = Name(
        first=_get(applicant_src, "name.first"),
        last=_get(applicant_src, "name.last"),
    )
    contact = Contact(
        email=_get(applicant_src, "contact.email"),
        phone=_get(applicant_src, "contact.phone"),
    )
    applicant = Applicant(
        type=applicant_src.get("type"),
        institution=institution,
        name=person_name,
        course=applicant_src.get("course"),
        role=applicant_src.get("role"),
        contact=contact,
    )

    # --- Costs: entries that are not mappings are ignored ---
    cost_rows = [
        Cost(name=entry.get("name"), amountEur=entry.get("amountEur"))
        for entry in (project_src.get("costs", []) or [])
        if isinstance(entry, Mapping)
    ]

    # --- Financing: a block stays None when the payload has nothing for it ---
    vsm_src = _get(project_src, "financing.vsm", {}) or {}
    vsm = None
    if vsm_src:
        vsm_flags = VSMFlags(
            **{flag: _get(vsm_src, f"flags.{flag}") for flag in ("aufgaben", "individuell")}
        )
        vsm = VSM(code=vsm_src.get("code"), flags=vsm_flags)

    qsm_src = _get(project_src, "financing.qsm", {}) or {}
    qsm = None
    if qsm_src:
        qsm_flag_names = (
            "stellenfinanzierungen",
            "studierende",
            "individuell",
            "exkursionGenehmigt",
            "exkursionBezuschusst",
        )
        qsm_flags = QSMFlags(
            **{flag: _get(qsm_src, f"flags.{flag}") for flag in qsm_flag_names}
        )
        qsm = QSM(code=qsm_src.get("code"), flags=qsm_flags)

    # --- Project ---
    faculty_codes = ("inf", "esb", "ls", "tec", "tex", "nxt", "open")
    faculties = ParticipationFaculties(
        **{code: _get(project_src, f"participation.faculties.{code}") for code in faculty_codes}
    )
    project = Project(
        name=project_src.get("name"),
        dates=Dates(
            start=_get(project_src, "dates.start"),
            end=_get(project_src, "dates.end"),
        ),
        participants=project_src.get("participants"),
        description=project_src.get("description"),
        participation=Participation(faculties=faculties),
        costs=cost_rows,
        totals=Totals(requestedAmountEur=_get(project_src, "totals.requestedAmountEur")),
        financing=Financing(vsm=vsm, qsm=qsm),
    )

    # --- Attachments and warning ---
    attachments = Attachments(
        comparativeOffers=attachments_src.get("comparativeOffers"),
        fakultaet=attachments_src.get("fakultaet"),
    )
    warning = WarningInfo(notSupported=warning_src.get("notSupported"))

    return RootPayload(
        pa={
            "applicant": asdict(applicant),
            "project": asdict(project),
            "attachments": asdict(attachments),
        },
        warning=warning,
        _validation=payload.get("_validation", {}),
    )
|
||||
|
||||
|
||||
# =========================
|
||||
# PDF reading + end-to-end
|
||||
# =========================
|
||||
|
||||
def read_pdf_fields(pdf_file: str) -> Dict[str, Dict[str, Any]]:
    """Read the form fields of the PDF at ``pdf_file``.

    Returns a dict keyed by field name. All fields reported by the reader
    are kept; a falsy field entry is replaced by an empty dict so callers can
    always call ``.get("/V")`` on the values. A PDF without form fields
    yields an empty dict.
    """
    with open(pdf_file, "rb") as handle:
        found = PyPDF2.PdfReader(handle).get_fields()
        if not found:
            return {}
        return {name: (entry if entry else {}) for name, entry in found.items()}
|
||||
|
||||
def pdf_to_payload(pdf_file: str, variant: Optional[str] = None) -> RootPayload:
    """Read ``pdf_file``, map its form fields and return the dataclass model.

    When ``variant`` is empty or ``None`` the variant is detected from the
    fields found in the PDF.
    """
    raw_fields = read_pdf_fields(pdf_file)
    chosen_variant = variant if variant else detect_variant(raw_fields)
    return payload_to_model(map_form_to_payload(raw_fields, chosen_variant))
|
||||
|
||||
def pdf_to_json(pdf_file: str, variant: Optional[str] = None) -> str:
    """Return the mapped payload of ``pdf_file`` as a JSON string.

    The output is indented by two spaces and keeps non-ASCII characters
    unescaped.
    """
    as_dict = asdict(pdf_to_payload(pdf_file, variant=variant))
    return json.dumps(as_dict, ensure_ascii=False, indent=2)
|
||||
|
||||
|
||||
# =========================
|
||||
# CLI
|
||||
# =========================
|
||||
|
||||
def main(argv: Optional[List[str]] = None) -> None:
    """Command-line entry point: print the structured JSON for one PDF.

    ``argv`` defaults to the process arguments. ``--variant`` is matched
    case-insensitively; ``AUTO`` (the default) detects the variant from the
    PDF's fields.
    """
    parser = ArgumentParser(description="Extract PDF Form Data and Convert to structured JSON")
    parser.add_argument("pdf_file", help="Path to the PDF file")
    parser.add_argument(
        "--variant",
        # Normalise before the choices check so every spelling of every
        # variant is accepted, not just lower-case "auto".
        type=str.upper,
        choices=["QSM", "VSM", "COMMON", "AUTO"],
        default="AUTO",
        help="Form variant (default: AUTO)",
    )
    args = parser.parse_args(argv)

    v = None if args.variant == "AUTO" else args.variant
    print(pdf_to_json(args.pdf_file, variant=v))


if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in New Issue
Block a user