Ollama Local Model Integration
The Rizk SDK provides seamless integration with Ollama for running LLMs locally, automatically adding observability, guardrails, and policy enforcement to your locally hosted AI applications.
Quick Start
```python
import ollama

from rizk.sdk import Rizk
from rizk.sdk.decorators import workflow, guardrails

# Initialize Rizk SDK
rizk = Rizk.init(app_name="Ollama-App", enabled=True)

@workflow(name="ollama_chat", organization_id="acme", project_id="local_ai")
@guardrails(enforcement_level="strict")
def ollama_completion(user_message: str, model: str = "llama2") -> str:
    """Create an Ollama completion with monitoring and governance."""
    response = ollama.chat(
        model=model,
        messages=[{"role": "user", "content": user_message}]
    )
    return response['message']['content']

# Usage
result = ollama_completion("Explain quantum computing in simple terms")
print(result)
```
Installation and Setup
Installing Ollama
```bash
# Windows (PowerShell as Administrator)
# Download from https://ollama.ai and run the installer

# Linux
curl -fsSL https://ollama.ai/install.sh | sh

# macOS
brew install ollama
```
Installing Python Client
```bash
pip install ollama
```
Pulling Models
```bash
# Pull popular models
ollama pull llama2
ollama pull codellama
ollama pull mistral
ollama pull phi

# List available models
ollama list
```
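The same operations are available from the Python client used throughout this guide; a minimal sketch (the model name is only an example):

```python
import ollama

# Pull a model if it is not already available locally
ollama.pull("llama2")

# List locally installed models
for model in ollama.list()["models"]:
    print(model["name"])
```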
Supported Models
@workflow(name="ollama_multi_model")@guardrails(enforcement_level="moderate")def ollama_multi_model_completion(message: str, use_case: str = "general") -> dict: """Ollama completion with model selection based on use case."""
models = { "general": "llama2", # General purpose conversations "code": "codellama", # Code generation and analysis "math": "mistral", # Mathematical reasoning "small": "phi", # Lightweight, fast responses "creative": "llama2:13b", # Creative writing (if available) "instruct": "llama2:7b-chat" # Instruction following }
selected_model = models.get(use_case, models["general"])
response = ollama.chat( model=selected_model, messages=[{"role": "user", "content": message}] )
return { "content": response['message']['content'], "model": selected_model, "use_case": use_case }
Configuration
@workflow(name="configured_ollama")@guardrails(enforcement_level="moderate")def configured_ollama_completion( message: str, model: str = "llama2", temperature: float = 0.7, max_tokens: int = 500) -> str: """Ollama completion with custom configuration."""
response = ollama.chat( model=model, messages=[{"role": "user", "content": message}], options={ "temperature": temperature, "num_predict": max_tokens, "top_p": 0.9, "top_k": 40 } )
return response['message']['content']
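For example, a lower temperature and a smaller token budget suit short, factual answers; the values below are illustrative, not recommendations:

```python
# Short, low-temperature completion for a factual question
answer = configured_ollama_completion(
    "What is the capital of France?",
    model="llama2",
    temperature=0.1,
    max_tokens=50
)
print(answer)
```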
Streaming
@workflow(name="ollama_streaming")@guardrails(enforcement_level="moderate")def ollama_streaming_completion(message: str, model: str = "llama2") -> str: """Ollama completion with streaming response."""
response_chunks = []
stream = ollama.chat( model=model, messages=[{"role": "user", "content": message}], stream=True )
for chunk in stream: content = chunk['message']['content'] response_chunks.append(content) print(content, end="", flush=True)
print() return "".join(response_chunks)
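The helper prints each chunk as it arrives and returns the assembled text, so it can be called like a regular completion (a minimal usage sketch with an illustrative prompt):

```python
# Chunks are printed as they stream in; the full text is returned at the end
full_text = ollama_streaming_completion("Tell me a short story about a robot.")
print(f"Received {len(full_text)} characters")
```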
Error Handling
```python
import time
from ollama import ResponseError

@workflow(name="robust_ollama")
@guardrails(enforcement_level="moderate")
def robust_ollama_completion(message: str, model: str = "llama2", max_retries: int = 3) -> str:
    """Ollama completion with robust error handling."""
    for attempt in range(max_retries):
        try:
            response = ollama.chat(
                model=model,
                messages=[{"role": "user", "content": message}]
            )
            return response['message']['content']

        except ResponseError as e:
            # Pull the model if it is not available locally, then retry
            if "model not found" in str(e).lower():
                try:
                    print(f"Pulling model: {model}")
                    ollama.pull(model)
                    continue
                except Exception as pull_error:
                    raise Exception(f"Failed to pull model {model}: {pull_error}")

            # Back off exponentially before retrying other response errors
            if attempt < max_retries - 1:
                wait_time = 2 ** attempt
                time.sleep(wait_time)
                continue
            raise

        except ConnectionError:
            if attempt < max_retries - 1:
                time.sleep(2)
                continue
            raise Exception("Could not connect to Ollama server. Is it running?")
```
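Because the helper pulls missing models and retries transient failures, callers can usually invoke it directly; a usage sketch (the prompt and model are illustrative):

```python
# A first call for a model that is not yet downloaded triggers a pull,
# then the completion is retried automatically
summary = robust_ollama_completion(
    "Summarize the plot of Hamlet in two sentences.",
    model="mistral"
)
print(summary)
```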
Model Management
@workflow(name="ollama_model_management")def manage_ollama_models() -> dict: """Manage Ollama models programmatically."""
models = ollama.list()
def model_exists(model_name: str) -> bool: return any(model['name'].startswith(model_name) for model in models['models'])
def ensure_model(model_name: str) -> bool: if not model_exists(model_name): try: ollama.pull(model_name) return True except Exception as e: print(f"Failed to pull model {model_name}: {e}") return False return True
required_models = ["llama2", "codellama"] results = {}
for model in required_models: results[model] = ensure_model(model)
return { "available_models": [model['name'] for model in models['models']], "pull_results": results }
Best Practices
Resource Management
```python
import psutil

@workflow(name="resource_aware_ollama")
def resource_aware_ollama_completion(message: str) -> dict:
    """Ollama completion with resource monitoring."""
    cpu_percent = psutil.cpu_percent(interval=1)
    memory_percent = psutil.virtual_memory().percent

    # Refuse the request when the host is already under heavy load
    if cpu_percent > 90 or memory_percent > 85:
        return {
            "success": False,
            "error": "system_overloaded",
            "message": f"System resources too high: CPU {cpu_percent}%, Memory {memory_percent}%"
        }

    response = ollama.chat(
        model="llama2",
        messages=[{"role": "user", "content": message}]
    )

    return {
        "success": True,
        "content": response['message']['content']
    }
```
Server Health Check
```python
import requests

def check_ollama_server() -> bool:
    """Check if the local Ollama server is running."""
    try:
        response = requests.get("http://localhost:11434/api/version", timeout=5)
        return response.status_code == 200
    except requests.RequestException:
        return False
```
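A common pattern is to fail fast before initializing any workflows if the server is unreachable; a minimal sketch (the default local endpoint and the `ollama serve` hint assume a standard install):

```python
# Abort early with a clear message if the local server is not reachable
if not check_ollama_server():
    raise RuntimeError(
        "Ollama server is not reachable at http://localhost:11434. "
        "Start it with `ollama serve` and try again."
    )

print("Ollama server is up.")
```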
Related Documentation
- OpenAI Integration - OpenAI API integration
- Anthropic Integration - Claude API integration
- Custom LLM Providers - Adding new LLM support