#!/usr/bin/env python3
"""Download and convert the custom all-MiniLM-L6-v2 model to ONNX format.

This script:
1. Downloads the model from HuggingFace
   (jarredparrett/all-MiniLM-L6-v2_tuned_on_deepparse_address_mutations_comb_3)
2. Converts it to ONNX format using the optimum library
3. Saves the ONNX model to the Models directory

Requirements:
    pip install optimum[exporters] transformers torch

Usage:
    python download-convert-model.py
"""

import os
import shutil
import subprocess
import sys
import traceback


def check_requirements():
    """Ensure optimum/transformers/torch are importable, pip-installing any that are missing.

    Returns:
        bool: True if every required package is importable (possibly after
        installation), False if a package could not be installed.
    """
    # (importable module name, pip install spec) pairs — the install spec may
    # carry extras (e.g. "optimum[exporters]") and so differ from the module name.
    required_packages = [
        ("optimum", "optimum[exporters]"),
        ("transformers", "transformers"),
        ("torch", "torch"),
    ]

    missing = []
    for package_name, install_command in required_packages:
        try:
            # Hyphenated distribution names import with underscores.
            __import__(package_name.replace("-", "_"))
            print(f"[OK] {package_name} is installed")
        except ImportError:
            missing.append((package_name, install_command))

    if missing:
        print("\nMissing required packages. Installing...")
        for package_name, install_command in missing:
            print(f"Installing {package_name}...")
            # Use the current interpreter's pip so the install lands in the
            # same environment this script runs in.
            subprocess.check_call([sys.executable, "-m", "pip", "install", install_command])
            # Verify the install actually made the package importable instead
            # of failing later with a confusing ImportError mid-conversion.
            try:
                __import__(package_name.replace("-", "_"))
            except ImportError:
                print(f"[ERROR] {package_name} still not importable after install")
                return False

    return True


def download_and_convert_model():
    """Download the model from HuggingFace and export it to ONNX.

    Saves the exported model plus tokenizer files to Models/custom-model,
    and copies the main model.onnx to Models/address-embedding-model.onnx
    for easy access.

    Returns:
        bool: True on success, False if the download/export failed.
    """
    # Imported lazily: these are the heavyweight packages that
    # check_requirements() may have just installed.
    from optimum.onnxruntime import ORTModelForFeatureExtraction
    from transformers import AutoTokenizer

    model_id = "jarredparrett/all-MiniLM-L6-v2_tuned_on_deepparse_address_mutations_comb_3"
    output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "Models", "custom-model")

    print(f"\n{'='*60}")
    print(f"Downloading model: {model_id}")
    print(f"Output directory: {output_dir}")
    print(f"{'='*60}\n")

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    try:
        print("Downloading model and tokenizer from HuggingFace...")
        print("This may take a few minutes on first run...\n")

        # Download model and tokenizer, then export to ONNX.
        # export=True makes optimum run the PyTorch -> ONNX conversion.
        model = ORTModelForFeatureExtraction.from_pretrained(
            model_id,
            export=True,
            provider="CPUExecutionProvider",
        )
        tokenizer = AutoTokenizer.from_pretrained(model_id)

        # Save ONNX model
        print(f"Saving ONNX model to {output_dir}...")
        model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)

        print("\n[OK] Model successfully converted to ONNX format!")
        print("\nOutput files:")
        for f in os.listdir(output_dir):
            filepath = os.path.join(output_dir, f)
            size_mb = os.path.getsize(filepath) / (1024 * 1024)
            print(f"  - {f} ({size_mb:.2f} MB)")

        # Copy the main model file to a simpler location for easy access
        main_model_file = os.path.join(output_dir, "model.onnx")
        if os.path.exists(main_model_file):
            simple_output = os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                "Models",
                "address-embedding-model.onnx",
            )
            shutil.copy(main_model_file, simple_output)
            print(f"\n[OK] Copied model to: {simple_output}")

        return True

    except Exception as e:
        # Broad catch is deliberate: this is the script's top-level boundary
        # for the conversion step; report and signal failure to main().
        print(f"\n[ERROR] {e}")
        traceback.print_exc()
        return False


def create_powershell_download_script():
    """Write a PowerShell fallback script for Windows users who can't run Python.

    The generated script tries to fetch a pre-converted ONNX model directly
    from the HuggingFace Hub and tells the user to run this Python script
    if no pre-converted model exists.
    """
    ps_script = '''#!/usr/bin/env pwsh
# Download script for custom address embedding model (ONNX format)
# This script downloads a pre-converted ONNX model if available

$ModelRepo = "jarredparrett/all-MiniLM-L6-v2_tuned_on_deepparse_address_mutations_comb_3"
$OutputPath = "Models/address-embedding-model.onnx"

Write-Host "Attempting to download pre-converted ONNX model..." -ForegroundColor Cyan

# Try to download from HuggingFace Hub (ONNX format if available)
$OnnxUrl = "https://huggingface.co/$ModelRepo/resolve/main/onnx/model.onnx"

try {
    $ProgressPreference = 'SilentlyContinue'
    if (Get-Command curl -ErrorAction SilentlyContinue) {
        curl -L -o $OutputPath $OnnxUrl --fail
    } else {
        Invoke-WebRequest -Uri $OnnxUrl -OutFile $OutputPath -UseBasicParsing
    }
    if (Test-Path $OutputPath) {
        $size = (Get-Item $OutputPath).Length / 1MB
        Write-Host "Successfully downloaded model to $OutputPath" -ForegroundColor Green
        Write-Host "File size: $([math]::Round($size, 2)) MB" -ForegroundColor Gray
    }
} catch {
    Write-Host "Could not download pre-converted ONNX model." -ForegroundColor Yellow
    Write-Host ""
    Write-Host "The model does not have a pre-converted ONNX format." -ForegroundColor White
    Write-Host "Please run the Python conversion script instead:" -ForegroundColor White
    Write-Host "  python download-convert-model.py" -ForegroundColor Gray
    exit 1
}
'''
    script_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "Models", "download-custom-model.ps1"
    )
    with open(script_path, "w") as f:
        f.write(ps_script)
    print(f"\n[OK] Created PowerShell fallback script: {script_path}")


def main():
    """Entry point: check requirements, convert the model, write the fallback script.

    Returns:
        int: process exit code (0 on success, 1 on conversion failure).
    """
    print("\n" + "="*60)
    print("Custom Address Embedding Model - ONNX Converter")
    print("="*60)

    # --check-only: report/install requirements without downloading anything.
    if len(sys.argv) > 1 and sys.argv[1] == "--check-only":
        check_requirements()
        return 0

    # Check requirements first
    print("Checking requirements...")
    check_requirements()

    # Download and convert
    if download_and_convert_model():
        # Create fallback PowerShell script
        create_powershell_download_script()

        print("\n" + "="*60)
        print("Conversion complete!")
        print("="*60)
        print("\nNext steps:")
        print("1. Update appsettings.json to use the new model:")
        print('   "Embedding": { "ModelName": "custom-all-MiniLM-L6-v2-address" }')
        print("2. Update EmbeddingService.cs to support the new model path")
        print("3. Run the application")
        return 0
    else:
        return 1


if __name__ == "__main__":
    sys.exit(main())