Working mapper
This commit is contained in:
parent
e1fb0936fc
commit
34589826e5
3
.gitignore
vendored
3
.gitignore
vendored
@ -160,3 +160,6 @@ cython_debug/
|
|||||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||||
#.idea/
|
#.idea/
|
||||||
|
|
||||||
|
# PDF
|
||||||
|
*.pdf
|
||||||
|
!assets/*.pdf
|
||||||
|
|||||||
133
pdf_field_mapping.py
Normal file
133
pdf_field_mapping.py
Normal file
@ -0,0 +1,133 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
PDF Field Mapping
|
||||||
|
|
||||||
|
This module provides a mapping of PDF fields to their corresponding keys in the application data.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Raw form values that count as "not filled in": the converters in
# pdf_to_json.py map any of these to None.
# NOTE(review): "JJJJ-MM" and "/\Fld@default " (note the trailing space) look
# like untouched default texts of the PDF template — confirm against the form.
_PLACEHOLDER_VALUES: set = {None, "", "-", "JJJJ-MM", "/\\Fld@default "}
|
||||||
|
|
||||||
|
# --- COMMON fields (shared across variants) ---
|
||||||
|
# Mapping of PDF field name -> spec shared by all form variants.
# Each spec carries:
#   'required'   - whether the field must be present in the mapped payload
#   'target-key' - dotted path in the output payload ("[n]" addresses list items)
#   'type'       - bool / int / float / str, or the string 'enum'
#   'values'     - for 'enum' only: (key, label) pairs; the key is what gets stored
# Every entry declares a 'type' so that all values go through the same
# placeholder/whitespace normalisation in the converter.
TEXT_MAPPING_COMMON: dict = {
    # Applicant
    'pa-applicant-type': {
        'required': True,
        'target-key': 'pa.applicant.type',
        'type': 'enum',
        'values': [('person', 'Person'), ('institution', 'Institution')],
    },
    'pa-institution-type': {
        'required': True,
        'target-key': 'pa.applicant.institution.type',
        'type': 'enum',
        'values': [
            ('-', '-'),
            ('stud-fs', 'Fachschaft'),
            ('stud-rf', 'STUPA-Referat'),
            ('stud-hg', 'Studentische Hochschulgruppe'),
            ('faculty', 'Fakultät'),
            ('hs-institution', 'Hochschuleinrichtung'),
        ],
    },
    'pa-institution': {'required': True, 'target-key': 'pa.applicant.institution.name', 'type': str},
    'pa-first-name': {'required': True, 'target-key': 'pa.applicant.name.first', 'type': str},
    'pa-last-name': {'required': True, 'target-key': 'pa.applicant.name.last', 'type': str},
    'pa-email': {'required': True, 'target-key': 'pa.applicant.contact.email', 'type': str},
    'pa-phone': {'required': False, 'target-key': 'pa.applicant.contact.phone', 'type': str},

    'pa-course': {
        'required': True,
        'target-key': 'pa.applicant.course',
        'type': 'enum',
        'values': [('-', '-'), ('INF', 'INF'), ('ESB', 'ESB'), ('LS', 'LS'), ('TEC', 'TEC'), ('TEX', 'TEX'), ('NXT', 'NXT')],
    },
    'pa-role': {
        'required': True,
        'target-key': 'pa.applicant.role',
        'type': 'enum',
        'values': [
            ('-', '-'),
            ('Student', 'Student'),
            ('Professor', 'Professor'),
            ('Mitarbeiter', 'Mitarbeiter'),
            ('ASTA', 'ASTA'),
            ('Referatsleitung', 'Referatsleitung'),
            ('Fachschaftsvorstand', 'Fachschaftsvorstand'),
        ],
    },

    # Project core
    'pa-project-name': {'required': True, 'target-key': 'pa.project.name', 'type': str},
    'pa-start-date': {'required': True, 'target-key': 'pa.project.dates.start', 'type': str},
    'pa-end-date': {'required': False, 'target-key': 'pa.project.dates.end', 'type': str},
    'pa-participants': {'required': False, 'target-key': 'pa.project.participants', 'type': int},
    'pa-project-description': {'required': True, 'target-key': 'pa.project.description', 'type': str},

    # Participation (checkboxes)
    'pa-participating-faculties-inf': {'required': False, 'target-key': 'pa.project.participation.faculties.inf', 'type': bool},
    'pa-participating-faculties-esb': {'required': False, 'target-key': 'pa.project.participation.faculties.esb', 'type': bool},
    'pa-participating-faculties-ls': {'required': False, 'target-key': 'pa.project.participation.faculties.ls', 'type': bool},
    'pa-participating-faculties-tec': {'required': False, 'target-key': 'pa.project.participation.faculties.tec', 'type': bool},
    'pa-participating-faculties-tex': {'required': False, 'target-key': 'pa.project.participation.faculties.tex', 'type': bool},
    'pa-participating-faculties-nxt': {'required': False, 'target-key': 'pa.project.participation.faculties.nxt', 'type': bool},
    'pa-participating-faculties-open': {'required': False, 'target-key': 'pa.project.participation.faculties.open', 'type': bool},

    # Costs & totals ("{a;1:24}" is a template for the row number, "{a}" its use in the target)
    'pa-cost-{a;1:24}-name': {'required': True, 'target-key': 'pa.project.costs[{a}].name', 'type': str},
    'pa-cost-{a;1:24}-amount-euro': {'required': True, 'target-key': 'pa.project.costs[{a}].amountEur', 'type': float},
    'pa-requested-amount-euro-sum': {'required': True, 'target-key': 'pa.project.totals.requestedAmountEur', 'type': float},

    # Attachments common
    'pa-anh-vergleichsangebote': {'required': False, 'target-key': 'pa.attachments.comparativeOffers', 'type': bool},

    # Misc
    'warning-not-supported': {'required': False, 'target-key': 'warning.notSupported', 'type': str},
}
|
||||||
|
|
||||||
|
# --- QSM-specific fields (second variant) ---
|
||||||
|
# Field specs that exist only in the QSM form variant; merged on top of the
# common mapping when that variant is selected.
TEXT_MAPPING_QSM: dict = {
    "pa-qsm-financing": {
        "required": True,
        "target-key": "pa.project.financing.qsm.code",
        "type": "enum",
        # (stored key, label shown in the PDF)
        "values": [
            ("vwv-3-2-1-1", "Finanzierung zusätzlicher Lehr- und Seminarangebote"),
            ("vwv-3-2-1-2", "Fachspezifische Studienprojekte"),
            ("vwv-3-2-1-3", "Hochschuldidaktische Fort- und Weiterbildungsmaßnahmen"),
            ("vwv-3-2-2-1", "Verbesserung/Ausbau von Serviceeinrichtungen sowie Infrastruktur"),
            ("vwv-3-2-2-2", "Lehr- und Lernmaterialien"),
            ("vwv-3-2-2-3", "Durchführung von Exkursionen"),
            ("vwv-3-2-2-4", "Finanzierung von infrastrukturellen Begleit- und Anpassungsmaßnahmen"),
            ("vwv-3-2-3-1", "Verbesserung der Beratungsangebote für Studierende"),
            ("vwv-3-2-3-2", "Studium Generale und fachübergreifende Lehrangebote"),
            ("vwv-3-2-3-3", "Sonstige Maßnahmen im Interesse der Studierendenschaft"),
        ],
    },
    # Checkbox flags
    "pa-qsm-stellenfinanzierungen": {
        "required": False,
        "target-key": "pa.project.financing.qsm.flags.stellenfinanzierungen",
        "type": bool,
    },
    "pa-qsm-studierende": {
        "required": False,
        "target-key": "pa.project.financing.qsm.flags.studierende",
        "type": bool,
    },
    "pa-qsm-individuell": {
        "required": False,
        "target-key": "pa.project.financing.qsm.flags.individuell",
        "type": bool,
    },
    "pa-qsm-exkursion-genehmigt": {
        "required": False,
        "target-key": "pa.project.financing.qsm.flags.exkursionGenehmigt",
        "type": bool,
    },
    "pa-qsm-exkursion-bezuschusst": {
        "required": False,
        "target-key": "pa.project.financing.qsm.flags.exkursionBezuschusst",
        "type": bool,
    },
    # Attachment checkbox present only in this variant
    "pa-anh-fakultaet": {
        "required": False,
        "target-key": "pa.attachments.fakultaet",
        "type": bool,
    },
}
|
||||||
|
|
||||||
|
# --- VSM-specific fields (first variant; include for completeness) ---
|
||||||
|
# Field specs that exist only in the VSM form variant; merged on top of the
# common mapping when that variant is selected.
TEXT_MAPPING_VSM: dict = {
    "pa-vsm-financing": {
        "required": True,
        "target-key": "pa.project.financing.vsm.code",
        "type": "enum",
        # (stored key, label shown in the PDF)
        "values": [
            ("-", "-"),
            ("lhg-01", "Hochschulpolitische, fachliche, soziale, wirtschaftliche und kulturelle Belange"),
            ("lhg-02", "Mitwirkung an den Aufgaben der Hochschulen nach den §§ 2 bis 7"),
            ("lhg-03", "Politische Bildung"),
            ("lhg-04", "Förderung der Chancengleichheit und Abbau von Benachteiligungen"),
            ("lhg-05", "Förderung der Integration ausländischer Studierender"),
            ("lhg-06", "Förderung der sportlichen Aktivitäten"),
            ("lhg-07", "Pflege der überregionalen Studierendenbeziehungen"),
        ],
    },
    # Checkbox flags
    "pa-vsm-aufgaben": {
        "required": False,
        "target-key": "pa.project.financing.vsm.flags.aufgaben",
        "type": bool,
    },
    "pa-vsm-individuell": {
        "required": False,
        "target-key": "pa.project.financing.vsm.flags.individuell",
        "type": bool,
    },
}
|
||||||
485
pdf_to_json.py
Executable file
485
pdf_to_json.py
Executable file
@ -0,0 +1,485 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Extract PDF Form Data and Convert to JSON
|
||||||
|
|
||||||
|
This script extracts form data from a PDF file, maps it using the provided
|
||||||
|
field mappings, and emits a structured JSON payload.
|
||||||
|
|
||||||
|
Requires:
|
||||||
|
- PyPDF2
|
||||||
|
- pdf_field_mapping.py containing:
|
||||||
|
TEXT_MAPPING_COMMON, TEXT_MAPPING_QSM, TEXT_MAPPING_VSM, _PLACEHOLDER_VALUES
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
from argparse import ArgumentParser
|
||||||
|
from dataclasses import dataclass, asdict, field
|
||||||
|
from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union
|
||||||
|
|
||||||
|
import PyPDF2
|
||||||
|
from pdf_field_mapping import (
|
||||||
|
TEXT_MAPPING_COMMON,
|
||||||
|
TEXT_MAPPING_QSM,
|
||||||
|
TEXT_MAPPING_VSM,
|
||||||
|
_PLACEHOLDER_VALUES,
|
||||||
|
)
|
||||||
|
|
||||||
|
# =========================
|
||||||
|
# Types / Data Model
|
||||||
|
# =========================
|
||||||
|
|
||||||
|
@dataclass
class Name:
    """Applicant's first and last name."""
    first: Optional[str] = None
    last: Optional[str] = None


@dataclass
class Institution:
    """Institution the applicant acts for."""
    name: Optional[str] = None
    type: Optional[str] = None  # enum key


@dataclass
class Contact:
    """Applicant contact details."""
    email: Optional[str] = None
    phone: Optional[str] = None


@dataclass
class Applicant:
    """Everything stored under ``pa.applicant`` in the payload."""
    type: Optional[str] = None  # enum key
    institution: Institution = field(default_factory=Institution)
    name: Name = field(default_factory=Name)
    course: Optional[str] = None  # enum key
    role: Optional[str] = None  # enum key
    contact: Contact = field(default_factory=Contact)


@dataclass
class Dates:
    """Project start/end, kept as the strings read from the form."""
    start: Optional[str] = None
    end: Optional[str] = None


@dataclass
class ParticipationFaculties:
    """One checkbox flag per participating faculty; None when the field was not mapped."""
    inf: Optional[bool] = None
    esb: Optional[bool] = None
    ls: Optional[bool] = None
    tec: Optional[bool] = None
    tex: Optional[bool] = None
    nxt: Optional[bool] = None
    open: Optional[bool] = None


@dataclass
class Participation:
    """Wrapper for the faculty participation flags."""
    faculties: ParticipationFaculties = field(default_factory=ParticipationFaculties)


@dataclass
class Cost:
    """A single cost row (name plus amount in euro)."""
    name: Optional[str] = None
    amountEur: Optional[float] = None


@dataclass
class Totals:
    """Totals section of the project."""
    requestedAmountEur: Optional[float] = None


@dataclass
class VSMFlags:
    """Checkbox flags of the VSM variant."""
    aufgaben: Optional[bool] = None
    individuell: Optional[bool] = None


@dataclass
class VSM:
    """VSM financing: selected code plus its flags."""
    code: Optional[str] = None  # enum key
    flags: VSMFlags = field(default_factory=VSMFlags)


@dataclass
class QSMFlags:
    """Checkbox flags of the QSM variant."""
    stellenfinanzierungen: Optional[bool] = None
    studierende: Optional[bool] = None
    individuell: Optional[bool] = None
    exkursionGenehmigt: Optional[bool] = None
    exkursionBezuschusst: Optional[bool] = None


@dataclass
class QSM:
    """QSM financing: selected code plus its flags."""
    code: Optional[str] = None  # enum key
    flags: QSMFlags = field(default_factory=QSMFlags)


@dataclass
class Financing:
    """Financing section; each side stays None unless its data was mapped."""
    vsm: Optional[VSM] = None
    qsm: Optional[QSM] = None


@dataclass
class Project:
    """Everything stored under ``pa.project`` in the payload."""
    name: Optional[str] = None
    dates: Dates = field(default_factory=Dates)
    participants: Optional[int] = None
    description: Optional[str] = None
    participation: Participation = field(default_factory=Participation)
    costs: List[Cost] = field(default_factory=list)
    totals: Totals = field(default_factory=Totals)
    financing: Financing = field(default_factory=Financing)


@dataclass
class Attachments:
    """Attachment checkboxes."""
    comparativeOffers: Optional[bool] = None
    fakultaet: Optional[bool] = None  # only in QSM variant


@dataclass
class WarningInfo:
    """Content of the form's 'warning-not-supported' field, if any."""
    notSupported: Optional[str] = None


@dataclass
class RootPayload:
    """Top-level result object that gets serialised to JSON."""
    pa: Any = field(default_factory=dict)  # will hold applicant + project + attachments
    warning: WarningInfo = field(default_factory=WarningInfo)
    # Validation results from the mapping step (e.g. "missingRequired")
    _validation: Dict[str, Any] = field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
|
# =========================
|
||||||
|
# Mapping helpers
|
||||||
|
# =========================
|
||||||
|
|
||||||
|
def _to_bool(v: Any) -> bool:
    """Interpret a raw form value as a checkbox state.

    A string counts as checked unless it is empty or the name ``/Off``;
    any other value follows ordinary Python truthiness.
    """
    if not isinstance(v, str):
        return bool(v)
    return v != "/Off" and v != ""
|
||||||
|
|
||||||
|
def _to_int(v: Any) -> Optional[int]:
    """Convert a raw form value to an int, or None if that is not possible.

    Placeholder values yield None. The text is first parsed as a plain
    integer; failing that, as a float (comma read as decimal separator,
    spaces removed) which is then truncated.
    """
    if v in _PLACEHOLDER_VALUES:
        return None

    attempts = (
        lambda text: int(text),
        lambda text: int(float(text.replace(",", ".").replace(" ", ""))),
    )
    for parse in attempts:
        try:
            return parse(str(v).strip())
        except Exception:
            # best effort: fall through to the next strategy
            continue
    return None
|
||||||
|
|
||||||
|
def _to_float_de(v: Any) -> Optional[float]:
    """Convert a raw form value in German number notation to a float.

    Args:
        v: Raw value. Strings are read as German notation ("." groups
            thousands, "," is the decimal separator, spaces are ignored),
            e.g. "1.234,50" -> 1234.5. Real numbers (int/float, not bool)
            are returned as floats unchanged.

    Returns:
        The parsed float, or None for placeholder values and anything that
        cannot be parsed.
    """
    if v in _PLACEHOLDER_VALUES:
        return None
    try:
        # A value that already is a number must not go through the text
        # path: str(12.5) == "12.5" would lose its decimal point below and
        # come back as 125.0.
        if isinstance(v, (int, float)) and not isinstance(v, bool):
            return float(v)
        s = str(v).replace(".", "").replace(" ", "").replace(",", ".")
        return float(s)
    except Exception:
        # deliberate best effort: unparseable input means "no value"
        return None
|
||||||
|
|
||||||
|
def _to_str(v: Any) -> Optional[str]:
    """Convert a raw form value to a string, or None if it is empty.

    Args:
        v: Raw value; anything other than None is passed through ``str``.

    Returns:
        The unmodified string (surrounding whitespace is kept), or None when
        the value is None or equals a placeholder once surrounding
        whitespace is ignored.
    """
    if v is None:
        return None
    s = str(v)
    # Compare stripped against stripped: some placeholders themselves carry
    # surrounding whitespace ("/\\Fld@default "), so stripping only the value
    # would make them impossible to match.
    stripped_placeholders = {p.strip() for p in _PLACEHOLDER_VALUES if isinstance(p, str)}
    if s.strip() in stripped_placeholders:
        return None
    return s
|
||||||
|
|
||||||
|
def _from_enum(v: Any, pairs: Iterable[Tuple[str, str]]) -> Optional[str]:
    """Resolve a raw form value to the key of an enum spec.

    ``pairs`` holds (key, label) tuples. An exact key match is preferred
    over a label match; placeholders and unknown values give None.
    """
    if v in _PLACEHOLDER_VALUES:
        return None
    text = str(v)

    # Pass 1: the value already is one of the keys.
    matched = next((key for key, _ in pairs if text == key), None)
    if matched is not None:
        return matched

    # Pass 2: the value is a label; translate it to its key.
    return next((key for key, label in pairs if text == label), None)
|
||||||
|
|
||||||
|
def _coerce(v: Any, spec: Mapping[str, Any]) -> Any:
    """Convert a raw form value according to ``spec["type"]``.

    bool/int/float/str and "enum" each have a dedicated converter. A spec
    without a recognised type returns the value untouched, except that
    placeholders become None.
    """
    kind = spec.get("type")

    converters = ((bool, _to_bool), (int, _to_int), (float, _to_float_de), (str, _to_str))
    for py_type, convert in converters:
        if kind is py_type:
            return convert(v)

    if kind == "enum":
        return _from_enum(v, spec.get("values", []))

    if v in _PLACEHOLDER_VALUES:
        return None
    return v
|
||||||
|
|
||||||
|
# Matches one target-key segment of the form "name[<digits>]", e.g. "costs[3]";
# group 1 is the name, group 2 the index digits.
_key_index_re = re.compile(r"([^\[\]]+)\[(\d+)\]")
|
||||||
|
|
||||||
|
def _set_nested(root: Dict[str, Any], dotted: str, value: Any) -> None:
    """Store ``value`` in ``root`` at the dotted path ``dotted``.

    Segments of the form ``name[3]`` address list items. Missing or
    wrongly-typed intermediate containers are (re)created, and lists are
    padded with empty dicts up to the requested index.
    """

    def list_slot(container: Dict[str, Any], hit: "re.Match[str]") -> Tuple[List[Any], int]:
        # Make sure container[name] is a list long enough for the index.
        name, pos = hit.group(1), int(hit.group(2))
        if not isinstance(container.get(name), list):
            container[name] = []
        items = container[name]
        items.extend({} for _ in range(pos + 1 - len(items)))
        return items, pos

    *ancestors, leaf = dotted.split(".")
    node: Any = root

    # Descend through every segment except the last, creating dicts on the way.
    for segment in ancestors:
        hit = _key_index_re.fullmatch(segment)
        if hit is None:
            if not isinstance(node.get(segment), dict):
                node[segment] = {}
            node = node[segment]
        else:
            items, pos = list_slot(node, hit)
            if not isinstance(items[pos], dict):
                items[pos] = {}
            node = items[pos]

    # The final segment receives the value itself.
    hit = _key_index_re.fullmatch(leaf)
    if hit is None:
        node[leaf] = value
    else:
        items, pos = list_slot(node, hit)
        items[pos] = value
|
||||||
|
|
||||||
|
def _merge_mapping(variant: str) -> Dict[str, Dict[str, Any]]:
    """Return a fresh field mapping for ``variant``.

    The common mapping is always included; "QSM" or "VSM" (compared
    case-insensitively, surrounding whitespace ignored) adds that variant's
    fields on top. Any other value gives a copy of the common mapping only.
    """
    normalized = (variant or "").strip().upper()
    extras = {"QSM": TEXT_MAPPING_QSM, "VSM": TEXT_MAPPING_VSM}.get(normalized, {})
    merged = dict(TEXT_MAPPING_COMMON)
    merged.update(extras)
    return merged
|
||||||
|
|
||||||
|
# Field names of cost rows: "pa-cost-<n>-name" and "pa-cost-<n>-amount-euro";
# group 1 captures the row number <n>.
_cost_name_pat = re.compile(r"^pa-cost-(\d+)-name$")
_cost_amt_pat = re.compile(r"^pa-cost-(\d+)-amount-euro$")
|
||||||
|
|
||||||
|
def detect_variant(form_fields: Mapping[str, Any]) -> str:
    """Guess the form variant from the names of the raw PDF fields.

    Returns "QSM" or "VSM" when the respective financing field is present
    (QSM is checked first), otherwise "COMMON".
    """
    names = form_fields.keys()
    for marker, variant in (("pa-qsm-financing", "QSM"), ("pa-vsm-financing", "VSM")):
        if marker in names:
            return variant
    return "COMMON"
|
||||||
|
|
||||||
|
|
||||||
|
# Matches a templated list segment of a target key such as "costs[{a}]",
# where the index is still a placeholder; group 1 is the list name.
_templated_index_re = re.compile(r"([^\[\]]+)\[\{[^\[\]{}]+\}\]")


def _target_present(node: Any, parts: List[str]) -> bool:
    """Return True if the target-key path ``parts`` leads to a non-None value under ``node``.

    ``name[3]`` segments must resolve to that exact list item; templated
    ``name[{a}]`` segments are satisfied when at least one list item
    contains the rest of the path.
    """
    if not parts:
        return node is not None
    if not isinstance(node, Mapping):
        return False
    head, rest = parts[0], parts[1:]

    concrete = _key_index_re.fullmatch(head)
    if concrete:
        items = node.get(concrete.group(1))
        pos = int(concrete.group(2))
        return isinstance(items, list) and pos < len(items) and _target_present(items[pos], rest)

    templated = _templated_index_re.fullmatch(head)
    if templated:
        items = node.get(templated.group(1))
        return isinstance(items, list) and any(_target_present(item, rest) for item in items)

    return head in node and _target_present(node[head], rest)


def map_form_to_payload(form_json: Dict[str, Dict[str, Any]], variant: str) -> Dict[str, Any]:
    """Map raw PDF form fields to the nested payload dict.

    Args:
        form_json: Field name -> field dict; the value is read from its
            ``"/V"`` entry.
        variant: Form variant used to select the field mapping
            (see ``_merge_mapping``).

    Returns:
        The nested payload. Fields without a mapping and fields whose
        converted value is empty are skipped. Cost rows are collected in row
        order under ``pa.project.costs``; rows with neither name nor amount
        are dropped, so the list contains no empty entries. If required
        fields are missing, ``_validation.missingRequired`` lists them as
        ``(field name, target key)`` tuples. A required templated field
        (``costs[{a}]``) counts as present when at least one row provides it.
    """
    mapping = _merge_mapping(variant)
    out: Dict[str, Any] = {}

    # Cost rows arrive as separate name/amount fields; collect them by row first.
    costs_tmp: Dict[int, Dict[str, Any]] = {}

    for field_name, meta in form_json.items():
        raw_val = meta.get("/V")

        m_name = _cost_name_pat.match(field_name)
        if m_name:
            costs_tmp.setdefault(int(m_name.group(1)), {})["name"] = _to_str(raw_val)
            continue
        m_amt = _cost_amt_pat.match(field_name)
        if m_amt:
            costs_tmp.setdefault(int(m_amt.group(1)), {})["amountEur"] = _to_float_de(raw_val)
            continue

        spec = mapping.get(field_name)
        if not spec:
            continue
        coerced = _coerce(raw_val, spec)
        # False is a real checkbox value and must be kept; other empties are skipped.
        if not isinstance(coerced, bool) and coerced in (None, "", []):
            continue
        _set_nested(out, spec["target-key"], coerced)

    # Store the filled rows as one compact list. Writing each row to its own
    # index would pad the gaps left by skipped rows with empty dicts again.
    cost_rows = [
        row
        for _, row in sorted(costs_tmp.items(), key=lambda item: item[0])
        if row.get("name") is not None or row.get("amountEur") is not None
    ]
    if cost_rows:
        _set_nested(out, "pa.project.costs", cost_rows)

    # Required check
    missing = [
        (fname, spec["target-key"])
        for fname, spec in mapping.items()
        if spec.get("required") and not _target_present(out, spec["target-key"].split("."))
    ]
    if missing:
        out.setdefault("_validation", {})["missingRequired"] = missing

    return out
|
||||||
|
|
||||||
|
|
||||||
|
# =========================
|
||||||
|
# Builders to dataclasses
|
||||||
|
# =========================
|
||||||
|
|
||||||
|
def _get(d: Mapping[str, Any], path: str, default=None):
    """Look up a dotted ``path`` in nested mappings.

    Returns ``default`` as soon as a level is not a mapping or lacks the
    key; otherwise the value found at the end of the path (which may
    itself be None).
    """
    node: Any = d
    for key in path.split("."):
        if isinstance(node, Mapping) and key in node:
            node = node[key]
        else:
            return default
    return node
|
||||||
|
|
||||||
|
def payload_to_model(payload: Dict[str, Any]) -> RootPayload:
    """Build the dataclass model from the nested payload dict.

    Missing sections and values simply become None (or empty lists); the
    VSM/QSM financing objects are only created when the payload contains
    data for them. The ``_validation`` entry is carried over unchanged.
    """
    # Section dicts; a missing or empty section is treated as {}.
    applicant_dict = _get(payload, "pa.applicant", {}) or {}
    project_dict = _get(payload, "pa.project", {}) or {}
    attachments_dict = _get(payload, "pa.attachments", {}) or {}
    warning_dict = payload.get("warning", {}) or {}
    vsm_dict = _get(project_dict, "financing.vsm", {}) or {}
    qsm_dict = _get(project_dict, "financing.qsm", {}) or {}

    applicant = Applicant(
        type=applicant_dict.get("type"),
        institution=Institution(
            name=_get(applicant_dict, "institution.name"),
            type=_get(applicant_dict, "institution.type"),
        ),
        name=Name(
            first=_get(applicant_dict, "name.first"),
            last=_get(applicant_dict, "name.last"),
        ),
        course=applicant_dict.get("course"),
        role=applicant_dict.get("role"),
        contact=Contact(
            email=_get(applicant_dict, "contact.email"),
            phone=_get(applicant_dict, "contact.phone"),
        ),
    )

    # Cost rows: anything that is not a mapping is ignored.
    costs = [
        Cost(name=row.get("name"), amountEur=row.get("amountEur"))
        for row in (project_dict.get("costs", []) or [])
        if isinstance(row, Mapping)
    ]

    vsm_flag_names = ("aufgaben", "individuell")
    vsm = (
        VSM(
            code=vsm_dict.get("code"),
            flags=VSMFlags(**{flag: _get(vsm_dict, f"flags.{flag}") for flag in vsm_flag_names}),
        )
        if vsm_dict
        else None
    )

    qsm_flag_names = (
        "stellenfinanzierungen",
        "studierende",
        "individuell",
        "exkursionGenehmigt",
        "exkursionBezuschusst",
    )
    qsm = (
        QSM(
            code=qsm_dict.get("code"),
            flags=QSMFlags(**{flag: _get(qsm_dict, f"flags.{flag}") for flag in qsm_flag_names}),
        )
        if qsm_dict
        else None
    )

    faculty_names = ("inf", "esb", "ls", "tec", "tex", "nxt", "open")
    faculties = ParticipationFaculties(
        **{fac: _get(project_dict, f"participation.faculties.{fac}") for fac in faculty_names}
    )

    project = Project(
        name=project_dict.get("name"),
        dates=Dates(
            start=_get(project_dict, "dates.start"),
            end=_get(project_dict, "dates.end"),
        ),
        participants=project_dict.get("participants"),
        description=project_dict.get("description"),
        participation=Participation(faculties=faculties),
        costs=costs,
        totals=Totals(requestedAmountEur=_get(project_dict, "totals.requestedAmountEur")),
        financing=Financing(vsm=vsm, qsm=qsm),
    )

    attachments = Attachments(
        comparativeOffers=attachments_dict.get("comparativeOffers"),
        fakultaet=attachments_dict.get("fakultaet"),
    )

    # "pa" is stored as plain dicts, so the sub-models are flattened here.
    return RootPayload(
        pa={
            "applicant": asdict(applicant),
            "project": asdict(project),
            "attachments": asdict(attachments),
        },
        warning=WarningInfo(notSupported=warning_dict.get("notSupported")),
        _validation=payload.get("_validation", {}),
    )
|
||||||
|
|
||||||
|
|
||||||
|
# =========================
|
||||||
|
# PDF reading + end-to-end
|
||||||
|
# =========================
|
||||||
|
|
||||||
|
def read_pdf_fields(pdf_file: str) -> Dict[str, Dict[str, Any]]:
    """Read all form fields of the PDF at ``pdf_file``.

    Returns a dict of field name -> field data; a PDF without form fields
    gives an empty dict, and falsy field entries are replaced by ``{}``.
    """
    with open(pdf_file, "rb") as handle:
        raw = PyPDF2.PdfReader(handle).get_fields()

    # Normalise so that every value is a dict; no fields are filtered out.
    result: Dict[str, Dict[str, Any]] = {}
    for name, meta in (raw or {}).items():
        result[name] = meta if meta else {}
    return result
|
||||||
|
|
||||||
|
def pdf_to_payload(pdf_file: str, variant: Optional[str] = None) -> RootPayload:
    """Read a PDF form and return it as the dataclass model.

    When ``variant`` is not given (None or empty), it is detected from the
    fields found in the PDF.
    """
    fields = read_pdf_fields(pdf_file)
    chosen = variant if variant else detect_variant(fields)
    return payload_to_model(map_form_to_payload(fields, chosen))
|
||||||
|
|
||||||
|
def pdf_to_json(pdf_file: str, variant: Optional[str] = None) -> str:
    """Return the mapped payload of ``pdf_file`` as a JSON string.

    The output is indented by two spaces and keeps non-ASCII characters
    as they are.
    """
    as_dict = asdict(pdf_to_payload(pdf_file, variant=variant))
    return json.dumps(as_dict, ensure_ascii=False, indent=2)
|
||||||
|
|
||||||
|
|
||||||
|
# =========================
|
||||||
|
# CLI
|
||||||
|
# =========================
|
||||||
|
|
||||||
|
def main(argv: Optional[List[str]] = None) -> None:
    """Command-line entry point: print the JSON for one PDF form.

    Args:
        argv: Argument list without the program name; None means
            ``sys.argv[1:]`` (argparse default).
    """
    parser = ArgumentParser(description="Extract PDF Form Data and Convert to structured JSON")
    parser.add_argument("pdf_file", help="Path to the PDF file")
    parser.add_argument(
        "--variant",
        # Upper-case the input before it is checked against `choices`, so
        # every variant is accepted in any letter case (not just "auto").
        type=str.upper,
        choices=["QSM", "VSM", "COMMON", "AUTO"],
        default="AUTO",
        help="Form variant (default: AUTO)",
    )
    args = parser.parse_args(argv)

    # AUTO means: let pdf_to_json detect the variant from the PDF itself.
    v = None if args.variant == "AUTO" else args.variant
    print(pdf_to_json(args.pdf_file, variant=v))


if __name__ == "__main__":
    main()
|
||||||
Loading…
Reference in New Issue
Block a user