- build_rebuild_dataset.py: subtract orphan paired-transfer amounts from
destination card's derived opening; html.unescape descriptions.
- merchant_map.json: +110 auto-tail rules from rebuild long-tail, +20
recurring rules + 135 auto-cluster acceptances; stripped all cached
account_ids; Rock Auto -> Z(Mizumi) review:true; Duquesne Light ->
Utilities; categories stripped from _auto_tail rules per user policy.
- migration/README.md: 'Lessons from the first rebuild' section.
- migration/rebuild_clusters.{json,md}: clustering proposal artifact.
135 lines
7.0 KiB
Python
135 lines
7.0 KiB
Python
"""Build the full rebuild dataset from the 3 QFX (READ-ONLY).
|
|
|
|
Emits one normalized.json (the skill's schema) for ALL of PNC + Apple +
|
|
Costco, with:
|
|
- transfers OWNED BY THE PNC LEG (PNC date + FITID authoritative); the
|
|
Apple PAYMENT lines and Costco positive AUTOPAY lines are the
|
|
counterparts and are DROPPED (paired by amount, +/- 6 days).
|
|
- PNC classified per the runbook (income / expense / Don't Know / special).
|
|
- Apple/Costco: negative = withdrawal (merchant), positive = deposit
|
|
(refund). merchant_map matching is left to firefly_import.py downstream.
|
|
- per-account reconciliation: opening + sum(its kept lines) must == QFX
|
|
ledger, else abort (no silent data loss).
|
|
|
|
Nothing is posted. Output feeds `firefly_import.py --emit-plan/--review-html`.
|
|
"""
|
|
import re, json, hashlib, sys, html
|
|
from collections import Counter
|
|
|
|
D = "/Users/danesabo/Documents/Finances/EXPORTS/-MAY172026"
|
|
SRC = {
|
|
"PNC Checking": (f"{D}/PNC7552Aug012025-May152025.QFX", "pnc"),
|
|
"Apple Credit Card": (f"{D}/Apple Card Transactions Aug 01 2025 - May 17 2026.qfx", "apple"),
|
|
"Costco Visa Card": (f"{D}/CitiCostcoCard Aug012025-May172025.QFX","costco"),
|
|
}
|
|
|
|
def parse(path):
|
|
t = open(path, encoding="latin-1", errors="replace").read()
|
|
m = re.search(r"<LEDGERBAL>.*?<BALAMT>([^<\r\n]*)", t, re.S | re.I)
|
|
ledger = float(m.group(1))
|
|
blocks = re.findall(r"<STMTTRN>(.*?)(?=<STMTTRN>|</BANKTRANLIST>)", t, re.S | re.I)
|
|
def g(b, k):
|
|
mm = re.search(rf"<{k}>([^<\r\n]*)", b, re.I)
|
|
return mm.group(1).strip() if mm else ""
|
|
out = []
|
|
for b in blocks:
|
|
out.append({"date": g(b, "DTPOSTED")[:8], "amt": float(g(b, "TRNAMT")),
|
|
"ttype": g(b, "TRNTYPE").upper(),
|
|
"desc": html.unescape((g(b, "NAME") + " " + g(b, "MEMO")).strip()),
|
|
"fitid": g(b, "FITID")})
|
|
return ledger, out
|
|
|
|
def iso(d): # YYYYMMDD -> YYYY-MM-DD
|
|
return f"{d[:4]}-{d[4:6]}-{d[6:8]}" if len(d) >= 8 else d
|
|
|
|
# ---- PNC classification (runbook) ---------------------------------------
|
|
def classify_pnc(desc, amt):
|
|
d = desc.upper()
|
|
if "APPLECARD GSBANK PAYMENT" in d: return ("transfer", "Apple Credit Card")
|
|
if "CITI AUTOPAY PAYMENT" in d: return ("transfer", "Costco Visa Card")
|
|
if "SCHWAB BROKERAGE MONEYLINK" in d:
|
|
# amount disambiguation per the Schwab JSONs
|
|
return ("transfer", "Schwab Savings" if abs(amt) in (5000.0, 3550.0)
|
|
else "Schwab Stocks")
|
|
if "ATM WITHDRAWAL" in d: return ("transfer", "Cash")
|
|
if "CARVANA PAYOUT" in d: return ("transfer", "Illiquid Assets")
|
|
if "ATM DEPOSIT" in d and abs(amt) > 10000: return ("transfer", "Coverdell")
|
|
if "CAPITAL ONE TRANSFER" in d: return ("transfer", "Capital One")
|
|
if "UNIV PITTSBURGH" in d and ("PAYROLL" in d or "SALARY" in d):
|
|
return ("deposit", "Pitt Salary")
|
|
if "INTEREST PAYMENT" in d: return ("deposit", "Interest Income")
|
|
if "IRS TREAS 310" in d: return ("deposit", "IRS Refund")
|
|
if "DUQUESNE LIGHT" in d: return ("withdrawal", "Duquesne Light")
|
|
if "COMPEER" in d: return ("withdrawal", "Compeer Investments")
|
|
if "PITT TUITION" in d: return ("withdrawal", "University of Pittsburgh")
|
|
if any(k in d for k in ("VENMO CASHOUT","CASH APP","ZEL FROM","ATM SURCHARGE","YARDI")):
|
|
return ("dontknow", "Don't Know")
|
|
return ("raw", None) # leave to merchant_map / review downstream
|
|
|
|
records, recon, dropped = [], {}, Counter()
|
|
for acct, (path, tag) in SRC.items():
|
|
ledger, txns = parse(path)
|
|
s = round(sum(t["amt"] for t in txns), 2)
|
|
opening = round(ledger - s, 2)
|
|
recon[acct] = {"ledger": ledger, "sum": s, "opening": opening,
|
|
"ties": abs(opening + s - ledger) < 0.01}
|
|
for t in txns:
|
|
amt, d = t["amt"], t["desc"]
|
|
ext = f"{tag}:{t['fitid'] or hashlib.sha1((iso(t['date'])+d+str(amt)).encode()).hexdigest()[:16]}"
|
|
if acct == "Apple Credit Card" and t["ttype"] == "PAYMENT":
|
|
dropped["apple_payment(paired->PNC)"] += 1; continue
|
|
if acct == "Costco Visa Card" and amt > 0 and "AUTOPAY" in d.upper():
|
|
dropped["costco_autopay(paired->PNC)"] += 1; continue
|
|
rec = {"date": iso(t["date"]), "amount": f"{abs(amt):.2f}",
|
|
"description": d, "asset_account": acct, "source_tag": tag,
|
|
"source_txn_id": t["fitid"] or None, "currency_code": "USD"}
|
|
if acct == "PNC Checking":
|
|
kind, target = classify_pnc(d, amt)
|
|
if kind == "transfer":
|
|
rec["type"] = "transfer"
|
|
if amt < 0: rec["destination_account"] = target
|
|
else: rec["type"] = "transfer"; rec["asset_account"] = target; rec["destination_account"] = "PNC Checking"
|
|
elif kind in ("deposit", "withdrawal"):
|
|
rec["type"] = kind; rec["_canonical"] = target
|
|
elif kind == "dontknow":
|
|
rec["type"] = "withdrawal" if amt < 0 else "deposit"
|
|
rec["_canonical"] = "Don't Know"
|
|
else:
|
|
rec["type"] = "withdrawal" if amt < 0 else "deposit"
|
|
else:
|
|
rec["type"] = "withdrawal" if amt < 0 else "deposit"
|
|
records.append(rec)
|
|
|
|
# --- Orphan adjustment: a PNC->Apple/Costco payment whose date predates the
|
|
# card QFX window has its card-side effect already baked into the card's
|
|
# DERIVED opening (because opening = ledger - sum_kept_card_lines, and the
|
|
# orphan never appeared on the card side). If we ALSO post the PNC->card
|
|
# transfer in the rebuild, the card account gets credited twice. So subtract
|
|
# orphan transfer amounts from the card opening.
|
|
APPLE_WINDOW_START = "2025-08-02"
|
|
COSTCO_WINDOW_START = "2025-08-02"
|
|
for r in records:
|
|
if r.get("type") == "transfer" and r["asset_account"] == "PNC Checking":
|
|
dest = r.get("destination_account")
|
|
if dest == "Apple Credit Card" and r["date"] < APPLE_WINDOW_START:
|
|
recon["Apple Credit Card"]["opening"] -= float(r["amount"])
|
|
recon["Apple Credit Card"]["opening"] = round(recon["Apple Credit Card"]["opening"], 2)
|
|
elif dest == "Costco Visa Card" and r["date"] < COSTCO_WINDOW_START:
|
|
recon["Costco Visa Card"]["opening"] -= float(r["amount"])
|
|
recon["Costco Visa Card"]["opening"] = round(recon["Costco Visa Card"]["opening"], 2)
|
|
|
|
print("=== RECONCILIATION (must all tie) ===")
|
|
ok = True
|
|
for a, r in recon.items():
|
|
flag = "OK" if r["ties"] else "*** MISMATCH ***"
|
|
ok &= r["ties"]
|
|
print(f" {a:20} ledger {r['ledger']:>11,.2f} Σ {r['sum']:>11,.2f} "
|
|
f"opening {r['opening']:>11,.2f} {flag}")
|
|
print("dropped (paired counterparts):", dict(dropped))
|
|
print(f"normalized records: {len(records)}")
|
|
if not ok:
|
|
print("ABORT: a reconciliation does not tie.", file=sys.stderr); sys.exit(1)
|
|
json.dump(records, open("/tmp/rebuild_normalized.json", "w"), indent=1)
|
|
json.dump(recon, open("/tmp/rebuild_recon.json", "w"), indent=1)
|
|
print("wrote /tmp/rebuild_normalized.json")
|