Ollama Local Model Integration

The Rizk SDK provides seamless integration with Ollama for running local LLM models, automatically adding observability, guardrails, and policy enforcement to your locally-hosted AI applications.

Quick Start

import ollama
from rizk.sdk import Rizk
from rizk.sdk.decorators import workflow, guardrails

# Initialize Rizk SDK
rizk = Rizk.init(app_name="Ollama-App", enabled=True)

@workflow(name="ollama_chat", organization_id="acme", project_id="local_ai")
@guardrails(enforcement_level="strict")
def ollama_completion(user_message: str, model: str = "llama2") -> str:
    """Create an Ollama completion with monitoring and governance."""
    response = ollama.chat(
        model=model,
        messages=[{"role": "user", "content": user_message}]
    )
    return response['message']['content']

# Usage
result = ollama_completion("Explain quantum computing in simple terms")
print(result)

Installation and Setup

Installing Ollama

# Windows (PowerShell as Administrator)
# Download from https://ollama.ai and run installer
# Linux
curl -fsSL https://ollama.ai/install.sh | sh
# macOS
brew install ollama

Installing Python Client

pip install ollama

Pulling Models

# Pull popular models
ollama pull llama2
ollama pull codellama
ollama pull mistral
ollama pull phi
# List available models
ollama list
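
Models can also be pulled from Python. A minimal sketch, assuming the Python client's ollama.pull supports stream=True and yields progress updates with a status field (the exact shape of the progress updates may vary between client versions):

import ollama

def pull_with_progress(model_name: str) -> None:
    """Pull a model and print coarse progress as it downloads."""
    # stream=True (assumed here) turns the pull into an iterator of progress updates.
    for update in ollama.pull(model_name, stream=True):
        print(update.get("status", ""))

pull_with_progress("phi")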

Supported Models

@workflow(name="ollama_multi_model")
@guardrails(enforcement_level="moderate")
def ollama_multi_model_completion(message: str, use_case: str = "general") -> dict:
    """Ollama completion with model selection based on use case."""
    models = {
        "general": "llama2",          # General purpose conversations
        "code": "codellama",          # Code generation and analysis
        "math": "mistral",            # Mathematical reasoning
        "small": "phi",               # Lightweight, fast responses
        "creative": "llama2:13b",     # Creative writing (if available)
        "instruct": "llama2:7b-chat"  # Instruction following
    }
    selected_model = models.get(use_case, models["general"])
    response = ollama.chat(
        model=selected_model,
        messages=[{"role": "user", "content": message}]
    )
    return {
        "content": response['message']['content'],
        "model": selected_model,
        "use_case": use_case
    }
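
Example usage, routing a coding question to codellama via the use_case key:

code_answer = ollama_multi_model_completion(
    "Write a Python function that reverses a string.",
    use_case="code"
)
print(code_answer["model"])    # codellama
print(code_answer["content"])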

Configuration

@workflow(name="configured_ollama")
@guardrails(enforcement_level="moderate")
def configured_ollama_completion(
    message: str,
    model: str = "llama2",
    temperature: float = 0.7,
    max_tokens: int = 500
) -> str:
    """Ollama completion with custom configuration."""
    response = ollama.chat(
        model=model,
        messages=[{"role": "user", "content": message}],
        options={
            "temperature": temperature,
            "num_predict": max_tokens,
            "top_p": 0.9,
            "top_k": 40
        }
    )
    return response['message']['content']
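
A lower temperature and a tighter num_predict budget are a common combination for short, repeatable answers, for example:

summary = configured_ollama_completion(
    "Summarize the benefits of running models locally.",
    model="llama2",
    temperature=0.2,
    max_tokens=150
)
print(summary)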

Streaming

@workflow(name="ollama_streaming")
@guardrails(enforcement_level="moderate")
def ollama_streaming_completion(message: str, model: str = "llama2") -> str:
    """Ollama completion with streaming response."""
    response_chunks = []
    stream = ollama.chat(
        model=model,
        messages=[{"role": "user", "content": message}],
        stream=True
    )
    for chunk in stream:
        content = chunk['message']['content']
        response_chunks.append(content)
        print(content, end="", flush=True)
    print()
    return "".join(response_chunks)

Error Handling

import time
from ollama import ResponseError

@workflow(name="robust_ollama")
@guardrails(enforcement_level="moderate")
def robust_ollama_completion(message: str, model: str = "llama2", max_retries: int = 3) -> str:
    """Ollama completion with robust error handling."""
    for attempt in range(max_retries):
        try:
            response = ollama.chat(
                model=model,
                messages=[{"role": "user", "content": message}]
            )
            return response['message']['content']
        except ResponseError as e:
            if "model not found" in str(e).lower():
                try:
                    print(f"Pulling model: {model}")
                    ollama.pull(model)
                    continue
                except Exception as pull_error:
                    raise Exception(f"Failed to pull model {model}: {pull_error}")
            if attempt < max_retries - 1:
                wait_time = 2 ** attempt  # Exponential backoff: 1s, 2s, 4s, ...
                time.sleep(wait_time)
                continue
            raise
        except ConnectionError:
            if attempt < max_retries - 1:
                time.sleep(2)
                continue
            raise Exception("Could not connect to Ollama server. Is it running?")
    raise Exception(f"Ollama completion failed after {max_retries} attempts")
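
Example usage: transient failures are retried with exponential backoff, and a missing model is pulled automatically before retrying:

answer = robust_ollama_completion("Explain retrieval-augmented generation in two sentences", model="mistral")
print(answer)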

Model Management

@workflow(name="ollama_model_management")
def manage_ollama_models() -> dict:
    """Manage Ollama models programmatically."""
    models = ollama.list()

    def model_exists(model_name: str) -> bool:
        return any(model['name'].startswith(model_name) for model in models['models'])

    def ensure_model(model_name: str) -> bool:
        if not model_exists(model_name):
            try:
                ollama.pull(model_name)
                return True
            except Exception as e:
                print(f"Failed to pull model {model_name}: {e}")
                return False
        return True

    required_models = ["llama2", "codellama"]
    results = {}
    for model in required_models:
        results[model] = ensure_model(model)
    return {
        "available_models": [model['name'] for model in models['models']],
        "pull_results": results
    }
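
Example usage, for instance at application startup:

status = manage_ollama_models()
print(status["available_models"])   # e.g. ['llama2:latest', 'codellama:latest']
print(status["pull_results"])       # e.g. {'llama2': True, 'codellama': True}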

Best Practices

Resource Management

import psutil  # Third-party dependency: pip install psutil

@workflow(name="resource_aware_ollama")
def resource_aware_ollama_completion(message: str) -> dict:
    """Ollama completion with resource monitoring."""
    cpu_percent = psutil.cpu_percent(interval=1)
    memory_percent = psutil.virtual_memory().percent
    if cpu_percent > 90 or memory_percent > 85:
        return {
            "success": False,
            "error": "system_overloaded",
            "message": f"System resources too high: CPU {cpu_percent}%, Memory {memory_percent}%"
        }
    response = ollama.chat(
        model="llama2",
        messages=[{"role": "user", "content": message}]
    )
    return {
        "success": True,
        "content": response['message']['content']
    }
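
Callers can branch on the success flag and defer or reject work while the host is saturated:

result = resource_aware_ollama_completion("Give me three tips for writing good prompts")
if result["success"]:
    print(result["content"])
else:
    print(f"Deferred: {result['message']}")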

Server Health Check

import requests

def check_ollama_server() -> bool:
    """Check if the Ollama server is running."""
    try:
        response = requests.get("http://localhost:11434/api/version", timeout=5)
        return response.status_code == 200
    except requests.exceptions.RequestException:
        return False
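
A simple guarded-call pattern built on the check above, reusing the ollama_completion workflow from the Quick Start:

if check_ollama_server():
    print(ollama_completion("What is retrieval-augmented generation?"))
else:
    print("Ollama server is not reachable at localhost:11434. Start it with `ollama serve`.")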