finances/migration/build_rebuild_dataset.py
Dane Sabo 26fb19ca9a Migration runbook + rebuild tooling; 10 PNC/income/Don't Know rules
- migration/README.md: cold-start rebuild runbook (reconciliation gate,
  classification rules, transfer pairing, investment policy, execution order)
- migration/build_rebuild_dataset.py: consolidated 3-QFX builder with PNC-
  owned transfers, counterpart pairing & drop, per-account reconciliation
- migration/rebuild_clusters.{json,md}: clustering proposal for the rebuild
- migration/rebuild_review.html: read-only browser review for the 1017-txn
  rebuild plan (transfers under PNC, category fixes baked in)
- migration/{pnc_review,review_preview_mixed}.html: earlier UI previews
- merchant_map.json: add 10 settled deterministic rules (Duquesne Light,
  Pitt Salary, Interest Payment, IRS, Pitt Tuition, Daily Cash Adjustment,
  ATM Surcharge/Yardi/Venmo/Zelle->Don't Know) so the skill stops flagging
  pre-classified PNC lines as UNMATCHED
2026-05-25 18:54:50 -04:00

117 lines
5.9 KiB
Python

"""Build the full rebuild dataset from the 3 QFX (READ-ONLY).
Emits one normalized.json (the skill's schema) for ALL of PNC + Apple +
Costco, with:
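- transfers OWNED BY THE PNC LEG (PNC date + FITID authoritative); the
Apple PAYMENT lines and Costco positive AUTOPAY lines are the
counterparts and are DROPPED. The runbook pairs them by amount, +/- 6
days; this script drops them by type/keyword alone and does not verify
that a matching PNC leg exists.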
- PNC classified per the runbook (income / expense / Don't Know / special).
- Apple/Costco: negative = withdrawal (merchant), positive = deposit
(refund). merchant_map matching is left to firefly_import.py downstream.
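- per-account reconciliation: opening + sum(its lines) must == QFX
ledger, else abort (no silent data loss). NOTE: opening is itself derived
as ledger - sum(all parsed lines, dropped ones included), so this check
holds by construction; it is not an independent balance check.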
Nothing is posted. Output feeds `firefly_import.py --emit-plan/--review-html`.
"""
import re, json, hashlib, sys
from collections import Counter
# Directory holding the QFX exports that this script reads.
D = "/Users/danesabo/Documents/Finances/EXPORTS/-MAY172026"

# One row per export: (account name, file name inside D, short source tag).
_EXPORTS = (
    ("PNC Checking", "PNC7552Aug012025-May152025.QFX", "pnc"),
    ("Apple Credit Card", "Apple Card Transactions Aug 01 2025 - May 17 2026.qfx", "apple"),
    ("Costco Visa Card", "CitiCostcoCard Aug012025-May172025.QFX", "costco"),
)

# account name -> (full path of its QFX file, source tag); insertion order
# follows _EXPORTS.
SRC = {
    account: (f"{D}/{filename}", tag)
    for account, filename, tag in _EXPORTS
}
def parse(path):
    """Parse one QFX export into its ledger balance and transaction list.

    The file is read as latin-1 text and scanned with regular expressions
    (tag names are matched case-insensitively); no SGML/XML parser is used.

    Args:
        path: Filesystem path of the QFX file.

    Returns:
        A ``(ledger, txns)`` tuple. ``ledger`` is the first ``<BALAMT>`` value
        following ``<LEDGERBAL>``, as a float. ``txns`` is a list with one
        dict per ``<STMTTRN>`` block, in file order, with keys:
        ``date`` (first 8 characters of DTPOSTED), ``amt`` (float TRNAMT),
        ``ttype`` (upper-cased TRNTYPE), ``desc`` (NAME and MEMO joined by a
        space, stripped) and ``fitid``. Absent text fields become ``""``.

    Raises:
        ValueError: if no ledger balance is present, or if the balance or a
            transaction amount is not a valid number.
    """
    # Context manager so the handle is closed even if reading fails.
    with open(path, encoding="latin-1", errors="replace") as fh:
        text = fh.read()

    bal = re.search(r"<LEDGERBAL>.*?<BALAMT>([^<\r\n]*)", text, re.S | re.I)
    if bal is None:
        raise ValueError(f"{path}: no <LEDGERBAL>/<BALAMT> found")
    try:
        ledger = float(bal.group(1))
    except ValueError as err:
        raise ValueError(
            f"{path}: bad ledger <BALAMT> {bal.group(1)!r}") from err

    # A block runs from <STMTTRN> up to the next <STMTTRN> or the end of the
    # transaction list. `\Z` keeps the final transaction when the closing
    # </BANKTRANLIST> tag is absent instead of silently discarding it.
    blocks = re.findall(
        r"<STMTTRN>(.*?)(?=<STMTTRN>|</BANKTRANLIST>|\Z)", text, re.S | re.I)

    def field(block, tag):
        # Value of the first <tag> in the block, up to the next '<' or line end.
        hit = re.search(rf"<{tag}>([^<\r\n]*)", block, re.I)
        return hit.group(1).strip() if hit else ""

    out = []
    for block in blocks:
        fitid = field(block, "FITID")
        raw_amt = field(block, "TRNAMT")
        try:
            amt = float(raw_amt)
        except ValueError as err:
            # Name the offending line; a bare float() error gives no context.
            raise ValueError(
                f"{path}: bad <TRNAMT> {raw_amt!r} (FITID {fitid!r})") from err
        out.append({
            "date": field(block, "DTPOSTED")[:8],
            "amt": amt,
            "ttype": field(block, "TRNTYPE").upper(),
            "desc": (field(block, "NAME") + " " + field(block, "MEMO")).strip(),
            "fitid": fitid,
        })
    return ledger, out
def iso(d):
    """Reformat a compact ``YYYYMMDD`` date string as ``YYYY-MM-DD``.

    Only the first eight characters are used. A value shorter than eight
    characters is returned unchanged.
    """
    if len(d) < 8:
        return d
    year, month, day = d[:4], d[4:6], d[6:8]
    return f"{year}-{month}-{day}"
# ---- PNC classification (runbook) ---------------------------------------
def classify_pnc(desc, amt):
    """Classify one PNC Checking line from keywords in its description.

    Matching is a case-insensitive substring search. Rules are tried in a
    fixed order and the first hit wins.

    Args:
        desc: Transaction description text.
        amt: Signed amount; consulted only by the Schwab and ATM-deposit rules.

    Returns:
        A ``(kind, target)`` pair. ``kind`` is one of ``"transfer"``,
        ``"deposit"``, ``"withdrawal"``, ``"dontknow"`` or ``"raw"``.
        ``target`` is the counterpart account or canonical name, and is
        ``None`` for ``"raw"``.
    """
    text = desc.upper()

    def mentions(*needles):
        # True when any of the given keywords occurs in the description.
        return any(needle in text for needle in needles)

    # Card payments made from checking.
    for needle, account in (
        ("APPLECARD GSBANK PAYMENT", "Apple Credit Card"),
        ("CITI AUTOPAY PAYMENT", "Costco Visa Card"),
    ):
        if needle in text:
            return ("transfer", account)

    if mentions("SCHWAB BROKERAGE MONEYLINK"):
        # amount disambiguation per the Schwab JSONs
        if abs(amt) in (5000.0, 3550.0):
            return ("transfer", "Schwab Savings")
        return ("transfer", "Schwab Stocks")

    for needle, account in (
        ("ATM WITHDRAWAL", "Cash"),
        ("CARVANA PAYOUT", "Illiquid Assets"),
    ):
        if needle in text:
            return ("transfer", account)

    if mentions("ATM DEPOSIT") and abs(amt) > 10000:
        return ("transfer", "Coverdell")
    if mentions("CAPITAL ONE TRANSFER"):
        return ("transfer", "Capital One")

    if mentions("UNIV PITTSBURGH") and mentions("PAYROLL", "SALARY"):
        return ("deposit", "Pitt Salary")

    # Single-keyword income / expense rules, still in first-match order.
    for needle, verdict in (
        ("INTEREST PAYMENT", ("deposit", "Interest Income")),
        ("IRS TREAS 310", ("deposit", "IRS Refund")),
        ("DUQUESNE LIGHT", ("withdrawal", "Duquesne Light")),
        ("COMPEER", ("withdrawal", "Compeer Investments")),
        ("PITT TUITION", ("withdrawal", "University of Pittsburgh")),
    ):
        if needle in text:
            return verdict

    if mentions("VENMO CASHOUT", "CASH APP", "ZEL FROM", "ATM SURCHARGE", "YARDI"):
        return ("dontknow", "Don't Know")

    # leave to merchant_map / review downstream
    return ("raw", None)
# ---- build normalized records + per-account reconciliation ---------------
# records: normalized transactions that are kept.
# recon:   per-account ledger / sum / opening / ties summary.
# dropped: count of card-side counterpart lines skipped, keyed by reason.
records, recon, dropped = [], {}, Counter()
for acct, (path, tag) in SRC.items():
    ledger, txns = parse(path)
    s = round(sum(t["amt"] for t in txns), 2)
    opening = round(ledger - s, 2)
    # NOTE(review): `opening` is derived from `ledger` and `s` (which covers
    # every parsed line, dropped ones included), so `ties` holds by
    # construction. Catching real data loss needs an independently known
    # opening balance.
    recon[acct] = {"ledger": ledger, "sum": s, "opening": opening,
                   "ties": abs(opening + s - ledger) < 0.01}
    for t in txns:
        amt, d = t["amt"], t["desc"]
        # Card-side legs of payments are skipped; the PNC leg carries the
        # transfer. NOTE(review): they are dropped on type/keyword alone -
        # nothing here checks that a matching PNC line exists.
        if acct == "Apple Credit Card" and t["ttype"] == "PAYMENT":
            dropped["apple_payment(paired->PNC)"] += 1
            continue
        if acct == "Costco Visa Card" and amt > 0 and "AUTOPAY" in d.upper():
            dropped["costco_autopay(paired->PNC)"] += 1
            continue
        # Amount is stored unsigned; direction is expressed through "type".
        # Lines without a FITID get source_txn_id=None (no synthetic id).
        rec = {"date": iso(t["date"]), "amount": f"{abs(amt):.2f}",
               "description": d, "asset_account": acct, "source_tag": tag,
               "source_txn_id": t["fitid"] or None, "currency_code": "USD"}
        direction = "withdrawal" if amt < 0 else "deposit"
        if acct == "PNC Checking":
            kind, target = classify_pnc(d, amt)
            if kind == "transfer":
                rec["type"] = "transfer"
                if amt < 0:
                    # Money leaving PNC: PNC stays the source account.
                    rec["destination_account"] = target
                else:
                    # Money arriving at PNC: the counterpart becomes the
                    # source and PNC the destination.
                    rec["asset_account"] = target
                    rec["destination_account"] = "PNC Checking"
            elif kind in ("deposit", "withdrawal"):
                rec["type"] = kind
                rec["_canonical"] = target
            elif kind == "dontknow":
                rec["type"] = direction
                rec["_canonical"] = "Don't Know"
            else:
                rec["type"] = direction
        else:
            rec["type"] = direction
        records.append(rec)

print("=== RECONCILIATION (must all tie) ===")
ok = True
for a, r in recon.items():
    flag = "OK" if r["ties"] else "*** MISMATCH ***"
    ok &= r["ties"]
    print(f" {a:20} ledger {r['ledger']:>11,.2f} Σ {r['sum']:>11,.2f} "
          f"opening {r['opening']:>11,.2f} {flag}")
print("dropped (paired counterparts):", dict(dropped))
print(f"normalized records: {len(records)}")
if not ok:
    # Abort before anything is written.
    print("ABORT: a reconciliation does not tie.", file=sys.stderr)
    sys.exit(1)

# Context managers so both outputs are flushed and closed before the
# success message is printed.
with open("/tmp/rebuild_normalized.json", "w") as fh:
    json.dump(records, fh, indent=1)
with open("/tmp/rebuild_recon.json", "w") as fh:
    json.dump(recon, fh, indent=1)
print("wrote /tmp/rebuild_normalized.json")