#!/usr/bin/env python3
"""
Extract all tables from Kraken 1099-DA PDF into structured Excel file.
One sheet per page (pages 3-32), preserving headers and formatting.
"""

import re
import pdfplumber
import openpyxl
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
from openpyxl.utils import get_column_letter

PDF_PATH = "/Users/bowang/.openclaw/workspace/kraken_1099da_2025.pdf"
OUT_PATH = "/Users/bowang/.openclaw/workspace/kraken_1099da_2025.xlsx"

HEADERS = [
    "Asset Code",
    "Asset Name",
    "Number of Units",
    "Unit Type",
    "Date Acquired",
    "Date Sold/Disposed",
    "Proceeds",
    "Cost/Other Basis",
    "Gain/Loss Amount",
    "Additional Info"
]

# Colors
HDR_FILL = PatternFill("solid", fgColor="1F3864")  # dark blue
HDR_FONT = Font(bold=True, color="FFFFFF", size=10)
ROW_FILL_A = PatternFill("solid", fgColor="DCE6F1")
ROW_FILL_B = PatternFill("solid", fgColor="FFFFFF")
ASSET_FILL = PatternFill("solid", fgColor="E2EFDA")  # light green for asset rows
TOTAL_FILL = PatternFill("solid", fgColor="FFF2CC")  # yellow for totals

thin = Side(style="thin", color="AAAAAA")
thin_border = Border(left=thin, right=thin, top=thin, bottom=thin)


def money(s):
    """Clean up currency string."""
    return s.strip()


def parse_summary_page(text):
    """Parse page 3 - summary table."""
    rows = []
    lines = text.split("\n")
    
    # Find the sections
    current_section = None
    section_headers_added = False
    
    for line in lines:
        line = line.strip()
        if not line:
            continue
        
        # Detect section headers
        if "Short-term Gains" in line or "Short-Term" in line:
            current_section = "Short-Term"
        elif "Long-term Gains" in line or "Long-Term" in line:
            current_section = "Long-Term"
        elif "Unknown Term" in line:
            current_section = "Unknown Term"
        
        # Parse data rows - look for lines with dollar amounts
        # Pattern: label followed by dollar amounts
        dollar_pattern = r'\$[\d,]+\.\d{2}'
        amounts = re.findall(dollar_pattern, line)
        
        if amounts and current_section:
            # Extract the label (everything before the first dollar sign)
            label_match = re.match(r'^(.+?)\s+(\$[\d,\(\)\.]+)', line)
            if label_match:
                label = label_match.group(1).strip()
                row = [current_section, label] + amounts
                rows.append(row)
    
    return rows


def parse_transaction_page(text, page_num):
    """Parse a transaction page (pages 4-32)."""
    transactions = []
    lines = text.split("\n")
    
    current_asset_code = ""
    current_asset_name = ""
    
    # Skip header boilerplate - find where actual data starts
    data_started = False
    
    # Regex for transaction rows
    # Format: [units] (1c) [date_acq or -] [date_sold] $proceeds $cost $gain_loss [Boxes info]
    # Units can have decimals
    tx_pattern = re.compile(
        r'^([\d,]+\.?\d*)\s+\(1c\)\s+'     # units (1c)
        r'(-|[\d/]+)\s+'                     # date acquired or -
        r'([\d/]+)\s+'                       # date sold
        r'(\(\$[\d,]+\.\d{2}\)|\$[\d,]+\.\d{2})\s+'   # proceeds
        r'(\(\$[\d,]+\.\d{2}\)|\$[\d,]+\.\d{2})\s+'   # cost basis
        r'(\(\$[\d,]+\.\d{2}\)|\$[\d,]+\.\d{2})'      # gain/loss
        r'(.*)?$'                             # additional info
    )
    
    # Simpler approach: use word-level extraction with positions
    # Let's parse line by line
    i = 0
    while i < len(lines):
        line = lines[i].strip()
        
        # Skip empty lines and boilerplate
        if not line or line.startswith("THIS IS") or line.startswith("This is") or \
           line.startswith("Page ") or line.startswith("negligence") or \
           line.startswith("2025") or line.startswith("Recipient BO") or \
           line.startswith("528 ") or line.startswith("REDWOOD") or \
           line.startswith("TIN ") or line.startswith("Account No") or \
           line.startswith("FATCA") or line.startswith("The 1099") or \
           line.startswith("box number") or line.startswith("Box 2") or \
           line.startswith("Box 3") or line.startswith("accuracy") or \
           "106 E. LINCOLN" in line or "CHEYENNE" in line or \
           "PAYWARD" in line or line.startswith("Phone number") or \
           "OMB No" in line or "1099-DA:" in line or "Noncovered" in line or \
           "Report on Form" in line or "Code and Name" in line or \
           "Transferred" in line or "(Box 1" in line or \
           "Details for" in line or "continue on" in line or \
           "Unchecked" in line:
            i += 1
            continue
        
        # Check if this is an asset header line (just a code and name, no numbers)
        # Asset lines look like "M8Z9JSG9Q ALGORAND" or just "ALGORAND" continuation
        # They appear before transaction rows
        
        # Try to match a transaction line
        match = tx_pattern.match(line)
        if match:
            units = match.group(1)
            date_acq = match.group(2)
            date_sold = match.group(3)
            proceeds = match.group(4)
            cost = match.group(5)
            gain = match.group(6)
            additional = match.group(7).strip() if match.group(7) else ""
            
            transactions.append({
                "asset_code": current_asset_code,
                "asset_name": current_asset_name,
                "units": units,
                "unit_type": "(1c)",
                "date_acquired": date_acq if date_acq != "-" else "",
                "date_sold": date_sold,
                "proceeds": proceeds,
                "cost_basis": cost,
                "gain_loss": gain,
                "additional": additional
            })
        else:
            # Try to detect asset name/code lines
            # These lines contain asset codes (all caps, alphanumeric) and names
            # e.g., "M8Z9JSG9Q" or "ALGORAND" or "M8Z9JSG9Q ALGORAND"
            
            # Check if line looks like an asset identifier
            # Asset code + optional name, no dollar signs
            if "$" not in line and "(" not in line or \
               (re.match(r'^[A-Z0-9]+\s*[A-Z\s]*$', line) and len(line) < 50):
                # Could be asset header
                parts = line.split()
                if parts and re.match(r'^[A-Z0-9]{5,}$', parts[0]):
                    # Has a code-like prefix
                    current_asset_code = parts[0]
                    current_asset_name = " ".join(parts[1:]) if len(parts) > 1 else parts[0]
                elif parts and all(c.isupper() or c.isspace() for c in line) and len(line) < 40:
                    # All caps - likely asset name continuation
                    if not current_asset_name:
                        current_asset_name = line
        
        i += 1
    
    return transactions


def parse_page_text_carefully(text, page_num):
    """
    More robust parser using regex to find all transaction rows.
    Handles multi-line asset names and continuation pages.
    """
    transactions = []
    
    # Split into lines and clean
    lines = [l.strip() for l in text.split("\n") if l.strip()]
    
    current_asset_code = ""
    current_asset_name = ""
    
    # The key pattern for transaction rows - they always have (1c) in them
    for line in lines:
        if "(1c)" not in line:
            # Check for asset header
            # Asset lines: no $, no (1c), usually short, all caps or code+name
            if "$" not in line and "(1c)" not in line and len(line) < 60:
                # Skip known boilerplate
                skip_words = ["THIS IS", "This is", "Page ", "negligence", "Recipient",
                              "REDWOOD", "TIN ", "Account", "FATCA", "The 1099", "box number",
                              "Box 2", "Box 3", "accuracy", "LINCOLN", "CHEYENNE", "PAYWARD",
                              "Phone", "OMB", "1099-DA", "Noncovered", "Report on", "Code and",
                              "Transferred", "(Box", "Details", "continue", "Unchecked",
                              "528 ", "106 ", "2025", "Refer to", "Wash Sale",
                              "Short-term", "Long-term", "Unknown", "Total", "Basis reported",
                              "Ordinary", "informational", "Proceeds", "Cost Basis"]
                
                should_skip = any(sw in line for sw in skip_words)
                
                if not should_skip:
                    # Parse asset identifier
                    parts = line.split()
                    if parts:
                        # Check if first token looks like a code (alphanumeric, 5+ chars, has digits)
                        if re.match(r'^[A-Z0-9]{5,}$', parts[0]) and any(c.isdigit() for c in parts[0]):
                            current_asset_code = parts[0]
                            current_asset_name = " ".join(parts[1:]) if len(parts) > 1 else ""
                        elif all(c.isupper() or c.isspace() or c == "-" for c in line) and len(parts) <= 4:
                            # All uppercase - asset name
                            current_asset_name = line
            continue
        
        # Parse transaction line containing (1c)
        # Pattern: UNITS (1c) DATE_ACQ DATE_SOLD $PROCEEDS $COST $GAIN [extra]
        # DATE_ACQ can be "-" (not acquired = inherited/unknown)
        
        # Normalize: sometimes asset code/name precedes units on same line
        # e.g., "BITCOIN 0.5 (1c) - 01/01/2025 $100 $90 $10"
        
        # Try to find unit count before (1c)
        m = re.search(
            r'([\d,]+\.?\d*)\s+\(1c\)\s+'
            r'(-|[\d/]{0,10})\s+'
            r'([\d/]{1,10})\s+'
            r'(\(\$[\d,]+\.\d{2}\)|\$[\d,]+\.\d{2})\s+'
            r'(\(\$[\d,]+\.\d{2}\)|\$[\d,]+\.\d{2})\s+'
            r'(\(\$[\d,]+\.\d{2}\)|\$[\d,]+\.\d{2})'
            r'(.*)?$',
            line
        )
        
        if m:
            units = m.group(1)
            date_acq = m.group(2)
            date_sold = m.group(3)
            proceeds = m.group(4)
            cost = m.group(5)
            gain = m.group(6)
            additional = m.group(7).strip() if m.group(7) else ""
            
            # Check if asset info is at start of this line (before units)
            prefix = line[:m.start()].strip()
            if prefix:
                parts = prefix.split()
                if parts and re.match(r'^[A-Z0-9]{5,}$', parts[0]) and any(c.isdigit() for c in parts[0]):
                    current_asset_code = parts[0]
                    current_asset_name = " ".join(parts[1:]) if len(parts) > 1 else ""
                elif prefix and all(c.isupper() or c.isspace() for c in prefix):
                    current_asset_name = prefix
            
            transactions.append({
                "asset_code": current_asset_code,
                "asset_name": current_asset_name,
                "units": units,
                "unit_type": "(1c)",
                "date_acquired": date_acq if date_acq != "-" else "",
                "date_sold": date_sold,
                "proceeds": proceeds,
                "cost_basis": cost,
                "gain_loss": gain,
                "additional": additional
            })
    
    return transactions


def style_header_row(ws, row_num, ncols):
    for col in range(1, ncols + 1):
        cell = ws.cell(row=row_num, column=col)
        cell.fill = HDR_FILL
        cell.font = HDR_FONT
        cell.alignment = Alignment(horizontal="center", vertical="center", wrap_text=True)
        cell.border = thin_border


def style_data_row(ws, row_num, ncols, fill):
    for col in range(1, ncols + 1):
        cell = ws.cell(row=row_num, column=col)
        cell.fill = fill
        cell.border = thin_border
        cell.alignment = Alignment(vertical="center")


def auto_width(ws):
    for col in ws.columns:
        max_len = 0
        col_letter = get_column_letter(col[0].column)
        for cell in col:
            try:
                val = str(cell.value or "")
                if "\n" in val:
                    val = max(val.split("\n"), key=len)
                max_len = max(max_len, len(val))
            except:
                pass
        ws.column_dimensions[col_letter].width = min(max_len + 2, 35)


def main():
    wb = openpyxl.Workbook()
    wb.remove(wb.active)  # remove default sheet
    
    all_transactions = []  # for summary/all-data sheet
    
    with pdfplumber.open(PDF_PATH) as pdf:
        total_pages = len(pdf.pages)
        print(f"Total pages: {total_pages}")
        
        # Page 3 (index 2): Summary
        page3 = pdf.pages[2]
        text3 = page3.extract_text()
        
        ws_summary = wb.create_sheet(title="Page 3 - Summary")
        
        # Write summary header
        ws_summary.append(["2025 | 1099-DA Totals Summary"])
        ws_summary["A1"].font = Font(bold=True, size=13)
        ws_summary.append([])
        
        # Parse and write summary sections
        sections = [
            ("Short-Term Gains or (Losses)", [
                "Short-term",
                "Basis reported to IRS",
                "Ordinary - (basis reported to IRS)",
                "Basis not reported to IRS",
                "Ordinary - (basis not reported to IRS)",
                "Total Short-Term"
            ]),
            ("Long-Term Gains or (Losses)", [
                "Long-term",
                "Basis reported to IRS",
                "Ordinary - (basis reported to IRS)",
                "Basis not reported to IRS",
                "Ordinary - (basis not reported to IRS)",
                "Total Long-Term"
            ]),
            ("Unknown Term", [
                "Unknown",
                "Basis not reported to IRS",
                "Ordinary - (basis not reported to IRS)",
                "Total"
            ])
        ]
        
        sum_headers = ["Category", "Type", "Proceeds", "Cost Basis", "Market Discount", "Wash Sale Loss Disallowed", "Realized Gain/Loss"]
        ws_summary.append(sum_headers)
        style_header_row(ws_summary, 3, len(sum_headers))
        
        # Extract data using regex from text
        lines = text3.split("\n")
        current_category = ""
        data_row = 3
        
        dollar_re = re.compile(r'\(?\$[\d,]+\.\d{2}\)?')
        
        for line in lines:
            line = line.strip()
            if not line:
                continue
            
            if "Short-term Gains" in line or "Short-Term" in line:
                current_category = "Short-Term"
                continue
            elif "Long-term Gains" in line or "Long-Term" in line:
                current_category = "Long-Term"
                continue
            elif line.startswith("Unknown Term"):
                current_category = "Unknown Term"
                continue
            
            amounts = dollar_re.findall(line)
            if len(amounts) >= 3 and current_category:
                # Extract label
                label = dollar_re.sub("", line).strip()
                label = re.sub(r'\s+', ' ', label).strip()
                
                data_row += 1
                row_data = [current_category, label] + amounts
                # Pad to 7 columns
                while len(row_data) < 7:
                    row_data.append("")
                ws_summary.append(row_data[:7])
                
                fill = TOTAL_FILL if "Total" in label else (ROW_FILL_A if data_row % 2 == 0 else ROW_FILL_B)
                style_data_row(ws_summary, ws_summary.max_row, 7, fill)
                if "Total" in label:
                    for col in range(1, 8):
                        ws_summary.cell(row=ws_summary.max_row, column=col).font = Font(bold=True)
        
        auto_width(ws_summary)
        ws_summary.freeze_panes = "A4"
        print("✓ Page 3 (Summary) done")
        
        # Pages 4-30 (indices 3-29): Transaction detail pages
        for page_idx in range(3, min(30, total_pages)):
            page_num = page_idx + 1
            page = pdf.pages[page_idx]
            text = page.extract_text()
            
            if not text:
                print(f"  Page {page_num}: no text, skipping")
                continue
            
            transactions = parse_page_text_carefully(text, page_num)
            
            # Create sheet
            sheet_name = f"Page {page_num}"
            ws = wb.create_sheet(title=sheet_name)
            
            # Title row
            ws.append([f"Page {page_num} - 1099-DA Transactions"])
            ws["A1"].font = Font(bold=True, size=11)
            ws.append([])
            
            # Headers
            ws.append(HEADERS)
            style_header_row(ws, 3, len(HEADERS))
            ws.freeze_panes = "A4"
            
            if transactions:
                for j, tx in enumerate(transactions):
                    row = [
                        tx["asset_code"],
                        tx["asset_name"],
                        tx["units"],
                        tx["unit_type"],
                        tx["date_acquired"],
                        tx["date_sold"],
                        tx["proceeds"],
                        tx["cost_basis"],
                        tx["gain_loss"],
                        tx["additional"]
                    ]
                    ws.append(row)
                    fill = ROW_FILL_A if j % 2 == 0 else ROW_FILL_B
                    style_data_row(ws, ws.max_row, len(HEADERS), fill)
                    all_transactions.append({"page": page_num, **tx})
                
                # Add subtotals row
                ws.append([])
                try:
                    # Sum proceeds, cost, gain/loss
                    def parse_dollar(s):
                        s = str(s).replace("$", "").replace(",", "").strip()
                        if s.startswith("(") and s.endswith(")"):
                            return -float(s[1:-1])
                        try:
                            return float(s)
                        except:
                            return 0.0
                    
                    total_proc = sum(parse_dollar(t["proceeds"]) for t in transactions)
                    total_cost = sum(parse_dollar(t["cost_basis"]) for t in transactions)
                    total_gain = sum(parse_dollar(t["gain_loss"]) for t in transactions)
                    
                    def fmt_dollar(v):
                        if v < 0:
                            return f"(${abs(v):,.2f})"
                        return f"${v:,.2f}"
                    
                    total_row = ["", f"Page {page_num} Totals ({len(transactions)} transactions)", "", "", "", "",
                                 fmt_dollar(total_proc), fmt_dollar(total_cost), fmt_dollar(total_gain), ""]
                    ws.append(total_row)
                    total_row_num = ws.max_row
                    for col in range(1, len(HEADERS) + 1):
                        cell = ws.cell(row=total_row_num, column=col)
                        cell.fill = TOTAL_FILL
                        cell.font = Font(bold=True)
                        cell.border = thin_border
                except Exception as e:
                    print(f"  Warning: could not compute totals for page {page_num}: {e}")
            else:
                ws.append(["No transactions found on this page"])
            
            auto_width(ws)
            print(f"✓ Page {page_num}: {len(transactions)} transactions")
        
        # Page 31 (index 30): Unknown Term transactions (N/A cost/gain)
        if total_pages >= 31:
            page31 = pdf.pages[30]
            text31 = page31.extract_text()
            ws31 = wb.create_sheet(title="Page 31 - Unknown Term")
            ws31.append(["Page 31 - Unknown Term Transactions"])
            ws31["A1"].font = Font(bold=True, size=11)
            ws31.append([])
            ut_headers = ["Asset Code", "Asset Name", "Number of Units", "Unit Type", "Date Acquired", "Date Sold/Disposed", "Proceeds", "Cost/Other Basis", "Gain/Loss Amount", "Additional Info", "Notes"]
            ws31.append(ut_headers)
            style_header_row(ws31, 3, len(ut_headers))
            ws31.freeze_panes = "A4"
            
            unk_transactions = []
            if text31:
                lines31 = text31.split("\n")
                # Pattern: CODE UNITS (1c) - DATE $PROCEEDS N/A [Boxes...]
                # Next line may have: ASSETNAME Box 12b: DATE
                ut_pattern = re.compile(
                    r'^([A-Z0-9]{5,})\s+([\d,]+\.?\d*)\s+\(1c\)\s+(-|[\d/]+)\s+([\d/]+)\s+(\$[\d,]+\.\d{2})\s+N/A(.*)$'
                )
                for li, line in enumerate(lines31):
                    line = line.strip()
                    m = ut_pattern.match(line)
                    if m:
                        code = m.group(1)
                        units = m.group(2)
                        date_acq = m.group(3)
                        date_sold = m.group(4)
                        proceeds = m.group(5)
                        additional = m.group(6).strip()
                        # Next line likely has the asset name
                        asset_name = ""
                        notes = ""
                        if li + 1 < len(lines31):
                            next_line = lines31[li + 1].strip()
                            # Parse: "SONGBIRD Box 12b: 02/18/2022"
                            nm = re.match(r'^([A-Z\s]+)\s+Box 12b:\s+([\d/]+)$', next_line)
                            if nm:
                                asset_name = nm.group(1).strip()
                                notes = f"Box 12b: {nm.group(2)}"
                            elif not re.search(r'\$|\(1c\)', next_line):
                                asset_name = next_line
                        
                        row = [code, asset_name, units, "(1c)",
                               date_acq if date_acq != "-" else "", date_sold,
                               proceeds, "N/A", "N/A", additional, notes]
                        ws31.append(row)
                        style_data_row(ws31, ws31.max_row, len(ut_headers), ROW_FILL_A)
                        unk_transactions.append({
                            "page": 31,
                            "asset_code": code,
                            "asset_name": asset_name,
                            "units": units,
                            "unit_type": "(1c)",
                            "date_acquired": date_acq if date_acq != "-" else "",
                            "date_sold": date_sold,
                            "proceeds": proceeds,
                            "cost_basis": "N/A",
                            "gain_loss": "N/A",
                            "additional": additional + (" | " + notes if notes else "")
                        })
                
                # Total row
                if unk_transactions:
                    ws31.append([])
                    total_proc_ut = sum(float(t["proceeds"].replace("$","").replace(",","")) for t in unk_transactions)
                    ws31.append(["", f"Total ({len(unk_transactions)} items)", "", "", "", "", f"${total_proc_ut:,.2f}", "", "", "", ""])
                    tr = ws31.max_row
                    for col in range(1, len(ut_headers)+1):
                        ws31.cell(row=tr, column=col).fill = TOTAL_FILL
                        ws31.cell(row=tr, column=col).font = Font(bold=True)
                        ws31.cell(row=tr, column=col).border = thin_border
            
            auto_width(ws31)
            all_transactions.extend(unk_transactions)
            print(f"✓ Page 31 (Unknown Term): {len(unk_transactions)} transactions")
        
        # Page 32 (index 31): Qualifying Stablecoins
        if total_pages >= 32:
            page32 = pdf.pages[31]
            text32 = page32.extract_text()
            ws32 = wb.create_sheet(title="Page 32 - Stablecoins")
            ws32.append(["Page 32 - Qualifying Stablecoins & Specified NFTs"])
            ws32["A1"].font = Font(bold=True, size=11)
            ws32.append([])
            sc_headers = ["Asset Code", "Asset Name", "Number of Transactions", "Number of Units", "Unit Type", "Proceeds", "Additional Info"]
            ws32.append(sc_headers)
            style_header_row(ws32, 3, len(sc_headers))
            ws32.freeze_panes = "A4"
            
            if text32:
                lines32 = text32.split("\n")
                # Pattern: CODE NUM_TX UNITS (1c) $PROCEEDS
                sc_pattern = re.compile(
                    r'^([A-Z0-9]{5,})\s+(\d+)\s+([\d,]+\.?\d*)\s+\(1c\)\s+(\$[\d,]+\.\d{2})(.*)$'
                )
                sc_idx = 0
                for li, line in enumerate(lines32):
                    line = line.strip()
                    m = sc_pattern.match(line)
                    if m:
                        code = m.group(1)
                        num_tx = m.group(2)
                        units = m.group(3)
                        proceeds = m.group(4)
                        additional = m.group(5).strip()
                        # Next line has asset name
                        asset_name = ""
                        if li + 1 < len(lines32):
                            next_line = lines32[li+1].strip()
                            if next_line and not re.search(r'\$|\(1c\)|Box |FOOTNOTES|THIS IS|negligence', next_line):
                                asset_name = next_line
                        
                        row = [code, asset_name, num_tx, units, "(1c)", proceeds, additional]
                        ws32.append(row)
                        style_data_row(ws32, ws32.max_row, len(sc_headers), ROW_FILL_A if sc_idx % 2 == 0 else ROW_FILL_B)
                        sc_idx += 1
            
            auto_width(ws32)
            print(f"✓ Page 32 (Stablecoins) done")

        # Page 33 (index 32) if it exists
        if total_pages >= 33:
            page33 = pdf.pages[32]
            text33 = page33.extract_text()
            ws33 = wb.create_sheet(title="Page 33 - Notes")
            ws33.append(["Page 33 - Additional Notes"])
            ws33["A1"].font = Font(bold=True, size=11)
            ws33.append([])
            if text33:
                for line in text33.split("\n"):
                    ws33.append([line])
            print("✓ Page 33 done")
    
    # Create an "All Transactions" consolidated sheet
    ws_all = wb.create_sheet(title="All Transactions", index=1)
    ws_all.append(["All Transactions - Consolidated"])
    ws_all["A1"].font = Font(bold=True, size=13)
    ws_all.append([])
    
    all_headers = ["Page"] + HEADERS
    ws_all.append(all_headers)
    style_header_row(ws_all, 3, len(all_headers))
    ws_all.freeze_panes = "A4"
    
    for j, tx in enumerate(all_transactions):
        row = [
            tx["page"],
            tx["asset_code"],
            tx["asset_name"],
            tx["units"],
            tx["unit_type"],
            tx["date_acquired"],
            tx["date_sold"],
            tx["proceeds"],
            tx["cost_basis"],
            tx["gain_loss"],
            tx["additional"]
        ]
        ws_all.append(row)
        fill = ROW_FILL_A if j % 2 == 0 else ROW_FILL_B
        style_data_row(ws_all, ws_all.max_row, len(all_headers), fill)
    
    # Grand total row
    if all_transactions:
        def parse_dollar(s):
            s = str(s).replace("$", "").replace(",", "").strip()
            if s.startswith("(") and s.endswith(")"):
                return -float(s[1:-1])
            try:
                return float(s)
            except:
                return 0.0
        
        def fmt_dollar(v):
            if v < 0:
                return f"(${abs(v):,.2f})"
            return f"${v:,.2f}"
        
        total_proc = sum(parse_dollar(t["proceeds"]) for t in all_transactions)
        total_cost = sum(parse_dollar(t["cost_basis"]) for t in all_transactions)
        total_gain = sum(parse_dollar(t["gain_loss"]) for t in all_transactions)
        
        ws_all.append([])
        grand_row = ["", "", f"GRAND TOTAL ({len(all_transactions)} transactions)", "", "", "", "",
                     fmt_dollar(total_proc), fmt_dollar(total_cost), fmt_dollar(total_gain), ""]
        ws_all.append(grand_row)
        gr = ws_all.max_row
        for col in range(1, len(all_headers) + 1):
            cell = ws_all.cell(row=gr, column=col)
            cell.fill = PatternFill("solid", fgColor="1F3864")
            cell.font = Font(bold=True, color="FFFFFF")
            cell.border = thin_border
    
    auto_width(ws_all)
    
    wb.save(OUT_PATH)
    print(f"\n✅ Saved to: {OUT_PATH}")
    print(f"   Total transactions extracted: {len(all_transactions)}")
    
    # Validation
    print("\n📊 Validation:")
    print("  Expected from summary (page 3):")
    print("    Short-Term Proceeds: $558,062.90, Cost: $539,884.07, Gain: $18,178.73")
    print("    Long-Term  Proceeds: $30,263.00,  Cost: $3,984.66,  Gain: $26,278.33")
    print("    Unknown    Proceeds: $47.22,       Cost: $0.00,      Gain: $0.00")
    
    def parse_dollar(s):
        s = str(s).replace("$", "").replace(",", "").strip()
        if s.startswith("(") and s.endswith(")"):
            return -float(s[1:-1])
        try:
            return float(s)
        except:
            return 0.0
    
    total_proc = sum(parse_dollar(t["proceeds"]) for t in all_transactions)
    total_cost = sum(parse_dollar(t["cost_basis"]) for t in all_transactions)
    total_gain = sum(parse_dollar(t["gain_loss"]) for t in all_transactions)
    
    print(f"\n  Extracted totals:")
    print(f"    Total Proceeds:  ${total_proc:,.2f}")
    print(f"    Total Cost:      ${total_cost:,.2f}")
    print(f"    Total Gain/Loss: ${total_gain:,.2f}")
    
    expected_proc = 558062.90 + 30263.00 + 47.22
    expected_cost = 539884.07 + 3984.66 + 0.00
    expected_gain = 18178.73 + 26278.33 + 0.00
    
    proc_ok = abs(total_proc - expected_proc) < 1.0
    cost_ok = abs(total_cost - expected_cost) < 1.0
    gain_ok = abs(total_gain - expected_gain) < 1.0
    
    print(f"\n  Expected total: Proc=${expected_proc:,.2f}, Cost=${expected_cost:,.2f}, Gain=${expected_gain:,.2f}")
    print(f"  Proceeds match: {'✅' if proc_ok else '❌'}")
    print(f"  Cost match:     {'✅' if cost_ok else '❌'}")
    print(f"  Gain/Loss match:{'✅' if gain_ok else '❌'}")


if __name__ == "__main__":
    main()
