finances/migration/build_rebuild_dataset.py
Dane Sabo e446c4097a Migration: rebuild battle-test learnings + opening-balance orphan fix
- build_rebuild_dataset.py: subtract orphan paired-transfer amounts from
  destination card's derived opening; html.unescape descriptions.
- merchant_map.json: +110 auto-tail rules from rebuild long-tail, +20
  recurring rules + 135 auto-cluster acceptances; stripped all cached
  account_ids; Rock Auto -> Z(Mizumi) review:true; Duquesne Light ->
  Utilities; categories stripped from _auto_tail rules per user policy.
- migration/README.md: 'Lessons from the first rebuild' section.
- migration/rebuild_clusters.{json,md}: clustering proposal artifact.
2026-05-25 21:05:38 -04:00

135 lines
7.0 KiB
Python

"""Build the full rebuild dataset from the 3 QFX (READ-ONLY).
Emits one normalized.json (the skill's schema) for ALL of PNC + Apple +
Costco, with:
- transfers OWNED BY THE PNC LEG (PNC date + FITID authoritative); the
Apple PAYMENT lines and Costco positive AUTOPAY lines are the
counterparts and are DROPPED (paired by amount, +/- 6 days).
- PNC classified per the runbook (income / expense / Don't Know / special).
- Apple/Costco: negative = withdrawal (merchant), positive = deposit
(refund). merchant_map matching is left to firefly_import.py downstream.
- per-account reconciliation: opening + sum(its kept lines) must == QFX
ledger, else abort (no silent data loss).
Nothing is posted. Output feeds `firefly_import.py --emit-plan/--review-html`.
"""
import re, json, hashlib, sys, html
from collections import Counter
D = "/Users/danesabo/Documents/Finances/EXPORTS/-MAY172026"
SRC = {
"PNC Checking": (f"{D}/PNC7552Aug012025-May152025.QFX", "pnc"),
"Apple Credit Card": (f"{D}/Apple Card Transactions Aug 01 2025 - May 17 2026.qfx", "apple"),
"Costco Visa Card": (f"{D}/CitiCostcoCard Aug012025-May172025.QFX","costco"),
}
def parse(path):
t = open(path, encoding="latin-1", errors="replace").read()
m = re.search(r"<LEDGERBAL>.*?<BALAMT>([^<\r\n]*)", t, re.S | re.I)
ledger = float(m.group(1))
blocks = re.findall(r"<STMTTRN>(.*?)(?=<STMTTRN>|</BANKTRANLIST>)", t, re.S | re.I)
def g(b, k):
mm = re.search(rf"<{k}>([^<\r\n]*)", b, re.I)
return mm.group(1).strip() if mm else ""
out = []
for b in blocks:
out.append({"date": g(b, "DTPOSTED")[:8], "amt": float(g(b, "TRNAMT")),
"ttype": g(b, "TRNTYPE").upper(),
"desc": html.unescape((g(b, "NAME") + " " + g(b, "MEMO")).strip()),
"fitid": g(b, "FITID")})
return ledger, out
def iso(d): # YYYYMMDD -> YYYY-MM-DD
return f"{d[:4]}-{d[4:6]}-{d[6:8]}" if len(d) >= 8 else d
# ---- PNC classification (runbook) ---------------------------------------
def classify_pnc(desc, amt):
d = desc.upper()
if "APPLECARD GSBANK PAYMENT" in d: return ("transfer", "Apple Credit Card")
if "CITI AUTOPAY PAYMENT" in d: return ("transfer", "Costco Visa Card")
if "SCHWAB BROKERAGE MONEYLINK" in d:
# amount disambiguation per the Schwab JSONs
return ("transfer", "Schwab Savings" if abs(amt) in (5000.0, 3550.0)
else "Schwab Stocks")
if "ATM WITHDRAWAL" in d: return ("transfer", "Cash")
if "CARVANA PAYOUT" in d: return ("transfer", "Illiquid Assets")
if "ATM DEPOSIT" in d and abs(amt) > 10000: return ("transfer", "Coverdell")
if "CAPITAL ONE TRANSFER" in d: return ("transfer", "Capital One")
if "UNIV PITTSBURGH" in d and ("PAYROLL" in d or "SALARY" in d):
return ("deposit", "Pitt Salary")
if "INTEREST PAYMENT" in d: return ("deposit", "Interest Income")
if "IRS TREAS 310" in d: return ("deposit", "IRS Refund")
if "DUQUESNE LIGHT" in d: return ("withdrawal", "Duquesne Light")
if "COMPEER" in d: return ("withdrawal", "Compeer Investments")
if "PITT TUITION" in d: return ("withdrawal", "University of Pittsburgh")
if any(k in d for k in ("VENMO CASHOUT","CASH APP","ZEL FROM","ATM SURCHARGE","YARDI")):
return ("dontknow", "Don't Know")
return ("raw", None) # leave to merchant_map / review downstream
records, recon, dropped = [], {}, Counter()
for acct, (path, tag) in SRC.items():
ledger, txns = parse(path)
s = round(sum(t["amt"] for t in txns), 2)
opening = round(ledger - s, 2)
recon[acct] = {"ledger": ledger, "sum": s, "opening": opening,
"ties": abs(opening + s - ledger) < 0.01}
for t in txns:
amt, d = t["amt"], t["desc"]
ext = f"{tag}:{t['fitid'] or hashlib.sha1((iso(t['date'])+d+str(amt)).encode()).hexdigest()[:16]}"
if acct == "Apple Credit Card" and t["ttype"] == "PAYMENT":
dropped["apple_payment(paired->PNC)"] += 1; continue
if acct == "Costco Visa Card" and amt > 0 and "AUTOPAY" in d.upper():
dropped["costco_autopay(paired->PNC)"] += 1; continue
rec = {"date": iso(t["date"]), "amount": f"{abs(amt):.2f}",
"description": d, "asset_account": acct, "source_tag": tag,
"source_txn_id": t["fitid"] or None, "currency_code": "USD"}
if acct == "PNC Checking":
kind, target = classify_pnc(d, amt)
if kind == "transfer":
rec["type"] = "transfer"
if amt < 0: rec["destination_account"] = target
else: rec["type"] = "transfer"; rec["asset_account"] = target; rec["destination_account"] = "PNC Checking"
elif kind in ("deposit", "withdrawal"):
rec["type"] = kind; rec["_canonical"] = target
elif kind == "dontknow":
rec["type"] = "withdrawal" if amt < 0 else "deposit"
rec["_canonical"] = "Don't Know"
else:
rec["type"] = "withdrawal" if amt < 0 else "deposit"
else:
rec["type"] = "withdrawal" if amt < 0 else "deposit"
records.append(rec)
# --- Orphan adjustment: a PNC->Apple/Costco payment whose date predates the
# card QFX window has its card-side effect already baked into the card's
# DERIVED opening (because opening = ledger - sum_kept_card_lines, and the
# orphan never appeared on the card side). If we ALSO post the PNC->card
# transfer in the rebuild, the card account gets credited twice. So subtract
# orphan transfer amounts from the card opening.
APPLE_WINDOW_START = "2025-08-02"
COSTCO_WINDOW_START = "2025-08-02"
for r in records:
if r.get("type") == "transfer" and r["asset_account"] == "PNC Checking":
dest = r.get("destination_account")
if dest == "Apple Credit Card" and r["date"] < APPLE_WINDOW_START:
recon["Apple Credit Card"]["opening"] -= float(r["amount"])
recon["Apple Credit Card"]["opening"] = round(recon["Apple Credit Card"]["opening"], 2)
elif dest == "Costco Visa Card" and r["date"] < COSTCO_WINDOW_START:
recon["Costco Visa Card"]["opening"] -= float(r["amount"])
recon["Costco Visa Card"]["opening"] = round(recon["Costco Visa Card"]["opening"], 2)
print("=== RECONCILIATION (must all tie) ===")
ok = True
for a, r in recon.items():
flag = "OK" if r["ties"] else "*** MISMATCH ***"
ok &= r["ties"]
print(f" {a:20} ledger {r['ledger']:>11,.2f} Σ {r['sum']:>11,.2f} "
f"opening {r['opening']:>11,.2f} {flag}")
print("dropped (paired counterparts):", dict(dropped))
print(f"normalized records: {len(records)}")
if not ok:
print("ABORT: a reconciliation does not tie.", file=sys.stderr); sys.exit(1)
json.dump(records, open("/tmp/rebuild_normalized.json", "w"), indent=1)
json.dump(recon, open("/tmp/rebuild_recon.json", "w"), indent=1)
print("wrote /tmp/rebuild_normalized.json")