This commit includes more files than intended, but the app runs — possibly with the new model.

This commit is contained in:
2026-01-16 23:07:45 -05:00
parent 12fd2ef45e
commit cb4488ee58
14 changed files with 61686 additions and 6 deletions

View File

@@ -0,0 +1,177 @@
#!/usr/bin/env python3
"""
Download and convert the custom all-MiniLM-L6-v2 model to ONNX format.
This script:
1. Downloads the model from HuggingFace (jarredparrett/all-MiniLM-L6-v2_tuned_on_deepparse_address_mutations_comb_3)
2. Converts it to ONNX format using optimum library
3. Saves the ONNX model to the Models directory
Requirements:
pip install optimum[exporters] transformers torch
Usage:
python download-convert-model.py
"""
import os
import subprocess
import sys
def check_requirements():
    """Ensure the packages needed for the ONNX conversion are importable.

    Any package that cannot be imported is installed on the spot with pip
    (using the current interpreter). Returns True once everything is present.
    """
    # (display/import name, pip requirement spec) — the spec may carry extras.
    requirements = [
        ("optimum", "optimum[exporters]"),
        ("transformers", "transformers"),
        ("torch", "torch"),
    ]
    to_install = []
    for name, spec in requirements:
        try:
            # Import names use underscores even when the pip name has dashes.
            __import__(name.replace("-", "_"))
        except ImportError:
            to_install.append((name, spec))
        else:
            print(f"[OK] {name} is installed")
    if to_install:
        print("\nMissing required packages. Installing...")
        for name, spec in to_install:
            print(f"Installing {name}...")
            # check_call raises CalledProcessError if pip fails, aborting the run.
            subprocess.check_call([sys.executable, "-m", "pip", "install", spec])
    return True
def download_and_convert_model():
    """Download the fine-tuned MiniLM model from HuggingFace and export it to ONNX.

    The exported model and tokenizer are saved under Models/custom-model next to
    this script; if a model.onnx was produced, it is also copied to
    Models/address-embedding-model.onnx for simpler lookup. Returns True on
    success, False on any failure (the traceback is printed).
    """
    # Imported lazily so check_requirements can install these first.
    from optimum.onnxruntime import ORTModelForFeatureExtraction
    from transformers import AutoTokenizer

    model_id = "jarredparrett/all-MiniLM-L6-v2_tuned_on_deepparse_address_mutations_comb_3"
    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, "Models", "custom-model")

    banner = "=" * 60
    print(f"\n{banner}")
    print(f"Downloading model: {model_id}")
    print(f"Output directory: {output_dir}")
    print(f"{banner}\n")

    # Make sure the destination exists before the export writes into it.
    os.makedirs(output_dir, exist_ok=True)

    try:
        print("Downloading model and tokenizer from HuggingFace...")
        print("This may take a few minutes on first run...\n")

        # export=True makes optimum convert the PyTorch weights to ONNX on load.
        model = ORTModelForFeatureExtraction.from_pretrained(
            model_id,
            export=True,
            provider="CPUExecutionProvider",
        )
        tokenizer = AutoTokenizer.from_pretrained(model_id)

        print(f"Saving ONNX model to {output_dir}...")
        model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)

        print("\n[OK] Model successfully converted to ONNX format!")
        print(f"\nOutput files:")
        for entry in os.listdir(output_dir):
            size_mb = os.path.getsize(os.path.join(output_dir, entry)) / (1024 * 1024)
            print(f" - {entry} ({size_mb:.2f} MB)")

        # Mirror the main ONNX file to a flat, predictable path for consumers.
        main_model_file = os.path.join(output_dir, "model.onnx")
        if os.path.exists(main_model_file):
            import shutil
            simple_output = os.path.join(script_dir, "Models", "address-embedding-model.onnx")
            shutil.copy(main_model_file, simple_output)
            print(f"\n[OK] Copied model to: {simple_output}")
        return True
    except Exception as e:
        print(f"\n[ERROR] {e}")
        import traceback
        traceback.print_exc()
        return False
def create_powershell_download_script():
    """Create a PowerShell script for Windows users who can't run Python.

    Writes Models/download-custom-model.ps1 next to this file. The generated
    script attempts to download a pre-converted ONNX model directly from the
    HuggingFace Hub and, if none is published, tells the user to run this
    Python converter instead.
    """
    ps_script = '''#!/usr/bin/env pwsh
# Download script for custom address embedding model (ONNX format)
# This script downloads a pre-converted ONNX model if available
$ModelRepo = "jarredparrett/all-MiniLM-L6-v2_tuned_on_deepparse_address_mutations_comb_3"
$OutputPath = "Models/address-embedding-model.onnx"
Write-Host "Attempting to download pre-converted ONNX model..." -ForegroundColor Cyan
# Try to download from HuggingFace Hub (ONNX format if available)
$OnnxUrl = "https://huggingface.co/$ModelRepo/resolve/main/onnx/model.onnx"
try {
$ProgressPreference = 'SilentlyContinue'
if (Get-Command curl -ErrorAction SilentlyContinue) {
curl -L -o $OutputPath $OnnxUrl --fail
} else {
Invoke-WebRequest -Uri $OnnxUrl -OutFile $OutputPath -UseBasicParsing
}
if (Test-Path $OutputPath) {
$size = (Get-Item $OutputPath).Length / 1MB
Write-Host "Successfully downloaded model to $OutputPath" -ForegroundColor Green
Write-Host "File size: $([math]::Round($size, 2)) MB" -ForegroundColor Gray
}
}
catch {
Write-Host "Could not download pre-converted ONNX model." -ForegroundColor Yellow
Write-Host ""
Write-Host "The model does not have a pre-converted ONNX format." -ForegroundColor White
Write-Host "Please run the Python conversion script instead:" -ForegroundColor White
Write-Host " python download-convert-model.py" -ForegroundColor Gray
exit 1
}
'''
    script_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "Models", "download-custom-model.ps1")
    # The Models directory may not exist yet when this function runs on its own
    # (it is otherwise only created as a side effect of the download step).
    os.makedirs(os.path.dirname(script_path), exist_ok=True)
    # Pin the encoding so the written script is byte-stable regardless of locale.
    with open(script_path, "w", encoding="utf-8") as f:
        f.write(ps_script)
    print(f"\n[OK] Created PowerShell fallback script: {script_path}")
def main():
    """Entry point: check dependencies, run the conversion, print next steps.

    Returns a process exit code: 0 on success, 1 if the conversion failed.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("Custom Address Embedding Model - ONNX Converter")
    print(banner)

    # --check-only mode: verify (and install) dependencies, then stop.
    if sys.argv[1:2] == ["--check-only"]:
        check_requirements()
        return 0

    print("Checking requirements...")
    check_requirements()

    # Guard clause: bail out early if the download/conversion failed.
    if not download_and_convert_model():
        return 1

    # Provide a PowerShell fallback for environments without Python.
    create_powershell_download_script()

    print("\n" + banner)
    print("Conversion complete!")
    print(banner)
    print("\nNext steps:")
    print("1. Update appsettings.json to use the new model:")
    print(' "Embedding": { "ModelName": "custom-all-MiniLM-L6-v2-address" }')
    print("2. Update EmbeddingService.cs to support the new model path")
    print("3. Run the application")
    return 0
# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    raise SystemExit(main())