"""
Automatically consolidate similar project names using smart matching
No hardcoding - uses similarity algorithms
"""
from sqlalchemy import create_engine, text
import os
from difflib import SequenceMatcher

DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://postgres:asdf1234@localhost:5432/timesheet")
engine = create_engine(DATABASE_URL)

def similarity(a, b):
    """Calculate similarity between two strings (0-1)"""
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()

def normalize_name(name):
    """Normalize project name to canonical form"""
    # Remove common suffixes/variations
    name = name.strip()
    name = name.replace(' (Workspace)', '')
    name = name.replace(' (Working Tree)', '')
    name = name.replace(' (Index)', '')
    name = name.replace('-copy', '')
    name = name.replace('_copy', '')
    return name

print("\n" + "="*80)
print("AUTO-CONSOLIDATING SIMILAR PROJECT NAMES")
print("="*80 + "\n")

with engine.connect() as conn:
    # Get all project names with their stats
    result = conn.execute(text("""
        SELECT
            project_name,
            COUNT(*) as count,
            SUM(duration) / 3600.0 as hours
        FROM activity_records
        WHERE project_name IS NOT NULL
        AND project_name != ''
        AND project_name != 'general'
        GROUP BY project_name
        HAVING COUNT(*) > 1
        ORDER BY hours DESC
    """))

    projects = {}
    for row in result:
        pname, count, hours = row
        normalized = normalize_name(pname)
        projects[pname] = {
            'normalized': normalized,
            'count': count,
            'hours': hours or 0
        }

    print(f"Found {len(projects)} projects")
    print("\n1. Finding similar project name groups...")
    print("-" * 80)

    # Find groups of similar names
    groups = {}
    processed = set()

    for name1 in projects:
        if name1 in processed:
            continue

        # Start a new group
        group = [name1]
        processed.add(name1)

        # Find similar names
        for name2 in projects:
            if name2 in processed:
                continue

            # Check similarity (>0.85 means very similar)
            sim = similarity(projects[name1]['normalized'], projects[name2]['normalized'])

            if sim > 0.85:  # 85% similar
                group.append(name2)
                processed.add(name2)

        # Only create group if there are multiple similar names
        if len(group) > 1:
            # Choose canonical name: the one with most hours
            canonical = max(group, key=lambda x: projects[x]['hours'])
            groups[canonical] = group

    print(f"Found {len(groups)} groups of similar names\n")

    if not groups:
        print("No similar project names found to consolidate.")
        exit(0)

    # Show groups
    print("2. Similar name groups (will consolidate to first name):")
    print("-" * 80)

    consolidation_map = {}
    total_affected = 0

    for canonical, variations in groups.items():
        total_hours = sum(projects[v]['hours'] for v in variations)
        total_count = sum(projects[v]['count'] for v in variations)

        print(f"\nGroup: {canonical}")
        print(f"  Total: {total_count:,} records, {total_hours:.2f}h")

        for var in variations:
            if var != canonical:
                print(f"  <- {var:40} ({projects[var]['count']:,} records, {projects[var]['hours']:.2f}h)")
                consolidation_map[var] = canonical
                total_affected += projects[var]['count']

    # Perform consolidation
    print("\n" + "="*80)
    print(f"Consolidating {len(consolidation_map)} variations...")
    print("-" * 80)

    updated = 0
    for old_name, new_name in consolidation_map.items():
        result = conn.execute(text("""
            UPDATE activity_records
            SET project_name = :new_name
            WHERE project_name = :old_name
        """), {"old_name": old_name, "new_name": new_name})

        if result.rowcount > 0:
            updated += result.rowcount
            print(f"  {old_name} -> {new_name} ({result.rowcount:,} records)")

    conn.commit()

    print(f"\n[OK] Consolidated {updated:,} records")

    # Show top projects after consolidation
    print("\n3. Top 30 projects after consolidation:")
    print("-" * 80)

    result = conn.execute(text("""
        SELECT
            project_name,
            COUNT(*) as count,
            SUM(duration) / 3600.0 as hours
        FROM activity_records
        WHERE project_name != 'general'
        AND project_name IS NOT NULL
        GROUP BY project_name
        ORDER BY hours DESC
        LIMIT 30
    """))

    for row in result:
        pname, count, hours = row
        print(f"{pname:50} {count:>7,} records  {hours:>8.2f}h")

print("\n" + "="*80)
print("COMPLETE!")
print("="*80 + "\n")