เมื่อคุณ deploy LLM API ขึ้น production แล้วพบว่า request แรกทำงานได้ดี แต่พอมี concurrent connections 50-100 รายการพร้อมกัน ระบบกลับ timeout หมด นั่นคือเหตุผลที่คุณต้องทำ load testing ก่อนเปิดให้บริการจริง
บทความนี้จะสอนวิธีใช้ Locust และ k6 ในการทำ load test กับ AI API ตั้งแต่พื้นฐานจนถึง advanced scenarios โดยใช้ HolySheep AI เป็น endpoint ตัวอย่าง (base_url: https://api.holysheep.ai/v1)
ทำไมต้อง Load Test AI API?
AI API มีความแตกต่างจาก REST API ทั่วไป:
- Latency สูง — LLM inference ใช้เวลา 1-30 วินาทีต่อ request
- Resource intensive — GPU VRAM, compute มีจำกัด
- Concurrent limits — provider กำหนด max concurrent requests
- Token-based pricing — ต้องควบคุม input/output tokens
เครื่องมือที่ใช้
1. Locust (Python-based)
Locust เป็น open-source load testing tool เขียนด้วย Python รองรับ distributed testing ได้ง่าย มี web UI สำหรับ monitor แบบ real-time
2. k6 (Go-based)
k6 เร็วกว่า Locust มาก รองรับ JavaScript/TypeScript scripting และมี built-in integrations กับ cloud platforms หลายตัว
ติดตั้งและเตรียม Environment
# Install Locust
pip install locust

# Install k6 (macOS)
brew install k6

# Install k6 (Ubuntu/Debian)
# `gpg -k` is run first so that gpg creates its home directory before the
# keyring import below.
sudo gpg -k
sudo gpg --no-default-keyring --keyring /usr/share/keyrings/k6-archive-keyring.gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys C5AD17C747E3415A3642D57D77C6C491D6AC1D69
echo "deb [signed-by=/usr/share/keyrings/k6-archive-keyring.gpg] https://dl.k6.io/deb stable main" | sudo tee /etc/apt/sources.list.d/k6.list
sudo apt-get update
sudo apt-get install k6
# Verify the installation
locust --version
k6 version

# Create a virtual environment for the project
python -m venv load-test-env

# Activate it: run only the line that matches your operating system
source load-test-env/bin/activate # Linux/Mac
load-test-env\Scripts\activate # Windows
Basic Locust Script สำหรับ AI API
from locust import HttpUser, task, between
import json
class AIAbstractUser(HttpUser):
    """Simulated API client that mixes chat-completion and embedding calls.

    Chat completions are weighted 3:1 against embeddings. The API key is read
    from the ``HOLYSHEEP_API_KEY`` environment variable.
    """

    wait_time = between(1, 3)

    def on_start(self):
        """Prepare the auth headers and the default model when the user starts."""
        # Function-scope import: the snippet's header imports only locust and json.
        import os

        # Locust's Environment object has no ``host_config`` attribute, so the
        # key is taken from the process environment instead.
        api_key = os.environ.get("HOLYSHEEP_API_KEY", "YOUR_HOLYSHEEP_API_KEY")
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }
        self.model = "gpt-4.1"

    @task(3)
    def chat_completion_stream(self):
        """Send one streaming chat completion and read the stream to the end."""
        payload = {
            "model": self.model,
            "messages": [
                {"role": "user", "content": "Explain quantum computing in 50 words"}
            ],
            "max_tokens": 150,
            "temperature": 0.7,
            # Ask the API itself to stream; ``stream=True`` below only tells the
            # HTTP client not to buffer the body.
            "stream": True,
        }
        with self.client.post(
            "/chat/completions",
            headers=self.headers,
            json=payload,
            catch_response=True,
            stream=True,
            name="Chat Completion Streaming",
        ) as response:
            if response.status_code == 200:
                # Drain the stream so the whole generation is exercised and the
                # connection is released. NOTE(review): per Locust's HttpSession
                # docs, with stream=True the reported response time excludes the
                # body download - confirm if total generation time is needed.
                received_data = False
                for line in response.iter_lines():
                    if line:
                        received_data = True
                if received_data:
                    response.success()
                else:
                    response.failure("Empty streaming response")
            elif response.status_code == 429:
                response.failure("Rate limited - backoff triggered")
            else:
                response.failure(f"HTTP {response.status_code}")

    @task(1)
    def embedding_request(self):
        """Request one embedding and fail it on a non-200 status or > 2 s latency."""
        payload = {
            "model": "text-embedding-3-small",
            "input": "Sample text for embedding generation",
        }
        with self.client.post(
            "/embeddings",
            headers=self.headers,
            json=payload,
            catch_response=True,
        ) as response:
            # Check the status first so a slow error is reported as an error,
            # not as a latency breach.
            if response.status_code != 200:
                response.failure(f"HTTP {response.status_code}")
            elif response.elapsed.total_seconds() > 2.0:
                response.failure("Embedding latency > 2s")
            else:
                response.success()
Advanced Locust Script: Concurrent Users + Rate Limiting
from locust import HttpUser, task, between, events
import json
import time
import random
class ProductionLoadUser(HttpUser):
    """Simulated user running multi-turn chats and batch embedding requests.

    Chat turns are weighted 5:2 against batch embeddings. The API key is read
    from the ``HOLYSHEEP_API_KEY`` environment variable.
    """

    wait_time = between(0.5, 2.0)

    # Upper bound on stored history messages; the oldest turns are dropped
    # beyond this so the prompt cannot grow for the whole test run.
    max_history_messages = 20

    def on_start(self):
        """Prepare the auth headers and an empty conversation for this user."""
        # Function-scope import: the snippet's header does not import os.
        import os

        # Locust's Environment object has no ``host_config`` attribute, so the
        # key is taken from the process environment instead.
        api_key = os.environ.get("HOLYSHEEP_API_KEY", "YOUR_HOLYSHEEP_API_KEY")
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }
        self.conversation_history = []
        # Per-user counter; not read by the tasks in this class.
        self.request_count = 0

    @staticmethod
    def _retry_after_seconds(response, default=5):
        """Return the Retry-After delay in seconds, or *default* if unusable.

        The header may be absent or hold an HTTP-date instead of a number of
        seconds; both cases fall back to *default*.
        """
        raw_value = response.headers.get("Retry-After")
        try:
            return max(0, int(raw_value))
        except (TypeError, ValueError):
            return default

    def _record_chat_reply(self, response, elapsed):
        """Validate a 200 chat response and store the assistant reply.

        Marks the request as success or failure, fires the usage event, and
        returns True only when an assistant message was added to the history.
        """
        try:
            data = response.json()
        except ValueError:
            response.failure("Response body is not valid JSON")
            return False

        choices = data.get("choices") if isinstance(data, dict) else None
        if not choices:
            response.failure("Response contains no choices")
            return False

        self.conversation_history.append(choices[0]["message"])

        # Latency SLA check
        if elapsed < 3.0:
            response.success()
        else:
            response.failure(f"Latency {elapsed:.2f}s exceeds 3s SLA")

        # Report token usage as a separate stats entry. ``exception`` is passed
        # explicitly because request listeners may declare it as a required
        # parameter.
        usage = data.get("usage") or {}
        self.environment.events.request.fire(
            request_type="POST",
            name="chat_completion_with_usage",
            response_time=elapsed * 1000,  # ms
            response_length=len(response.content),
            response=response,
            exception=None,
            context={
                "prompt_tokens": usage.get("prompt_tokens", 0),
                "completion_tokens": usage.get("completion_tokens", 0),
            },
        )
        return True

    @task(5)
    def multi_turn_conversation(self):
        """Send the next user turn of a growing conversation."""
        questions = [
            "What is the capital of Thailand?",
            "How far is it from Bangkok to Chiang Mai?",
            "What is the weather usually like in Chiang Mai?",
            "Suggest a 3-day itinerary for Chiang Mai.",
            "What Thai food should I try there?",
        ]

        # Drop the oldest complete user/assistant pairs so the history stays
        # bounded and still starts with a user message.
        overflow = len(self.conversation_history) - self.max_history_messages
        if overflow > 0:
            del self.conversation_history[: overflow + overflow % 2]

        user_message = random.choice(questions)
        self.conversation_history.append({"role": "user", "content": user_message})
        payload = {
            "model": "claude-sonnet-4.5",
            "messages": self.conversation_history.copy(),
            "max_tokens": 500,
            "temperature": 0.7,
        }

        answered = False
        backoff_seconds = 0
        started = time.perf_counter()
        with self.client.post(
            "/chat/completions",
            headers=self.headers,
            json=payload,
            catch_response=True,
            stream=False,
        ) as response:
            elapsed = time.perf_counter() - started
            if response.status_code == 200:
                answered = self._record_chat_reply(response, elapsed)
            elif response.status_code == 429:
                # Rate limited: honour Retry-After before this user's next task.
                # The request itself is not re-sent.
                backoff_seconds = self._retry_after_seconds(response)
                response.failure("Rate limited - retried")
            elif response.status_code == 401:
                response.failure("Invalid API key")
            else:
                response.failure(f"Error: {response.status_code}")

        if not answered:
            # Remove the unanswered user turn so the next request does not
            # send two consecutive user messages.
            self.conversation_history.pop()
        if backoff_seconds:
            # Sleep outside the response context so the failure is reported
            # before the back-off starts.
            time.sleep(backoff_seconds)

    @task(2)
    def batch_embedding(self):
        """Embed a batch of ten short documents in a single request."""
        texts = [f"Sample document {i} for embedding" for i in range(10)]
        payload = {
            "model": "text-embedding-3-small",
            "input": texts,
        }
        start = time.time()
        with self.client.post(
            "/embeddings",
            headers=self.headers,
            json=payload,
            catch_response=True,
        ) as response:
            if response.status_code == 200:
                elapsed = time.time() - start
                if elapsed < 5.0:
                    response.success()
                else:
                    response.failure(f"Batch embedding took {elapsed:.2f}s")
            else:
                response.failure(f"HTTP {response.status_code}")
@events.test_start.add_listener
def on_test_start(environment, **kwargs):
    """Announce the runner's target user count when the test starts."""
    user_total = environment.runner.target_user_count
    print(f"Starting load test with {user_total} users")
@events.request.add_listener
def on_request(request_type, name, response_time, response_length, exception=None, **kwargs):
    """Print a line for each failed request and each request slower than 3 s.

    ``exception`` defaults to None so that request events fired without that
    argument do not raise a TypeError in this listener.
    """
    if exception:
        print(f"FAILED: {name} - {exception}")
    elif response_time > 3000:  # response_time is compared in milliseconds (> 3 s)
        print(f"SLOW: {name} - {response_time:.0f}ms")
การ Run Locust Test
# Run Locust in headless mode (no web UI)
# Simulates 100 concurrent users, spawning 10 users/second, for 5 minutes
locust \
--host=https://api.holysheep.ai/v1 \
--users=100 \
--spawn-rate=10 \
--run-time=300s \
--headless \
--html=report.html \
--csv=results \
--only-summary

# Run Locust with the web UI for interactive testing
locust \
--host=https://api.holysheep.ai/v1 \
--port=8089

# Run distributed (master + 2 workers)
# Terminal 1: Master
locust -f locustfile.py --master --expect-workers=2

# Terminal 2 & 3: Workers
locust -f locustfile.py --worker --master-host=localhost
k6 Script สำหรับ AI API
import http from 'k6/http';
import { check, sleep } from 'k6';
import { Rate, Trend } from 'k6/metrics';
import { textSummary } from 'https://jslib.k6.io/k6-summary/0.0.2/index.js';
// Custom metrics
const errorRate = new Rate('errors');
const llmLatency = new Trend('llm_latency');

// Configuration
const BASE_URL = 'https://api.holysheep.ai/v1';
const API_KEY = __ENV.HOLYSHEEP_API_KEY || 'YOUR_HOLYSHEEP_API_KEY';

export const options = {
  stages: [
    { duration: '30s', target: 10 },  // Ramp up to 10 users
    { duration: '1m', target: 50 },   // Ramp from 10 to 50 users
    { duration: '30s', target: 100 }, // Spike to 100 users
    { duration: '1m', target: 100 },  // Stay at 100 users
    { duration: '30s', target: 0 },   // Ramp down
  ],
  thresholds: {
    'http_req_duration': ['p(95)<5000'], // 95th percentile < 5s
    'errors': ['rate<0.05'],             // Error rate < 5%
    'llm_latency': ['p(99)<10000'],      // LLM latency p99 < 10s
  },
};

// The Authorization value must be a template literal; without the backticks
// the script does not parse.
const headers = {
  'Authorization': `Bearer ${API_KEY}`,
  'Content-Type': 'application/json',
};
/**
 * One virtual-user iteration: a chat completion, then an embedding request,
 * then a random think time of 0.5-2.5 seconds.
 */
export default function () {
  // Test 1: Chat Completion
  const chatPayload = JSON.stringify({
    model: 'gpt-4.1',
    messages: [
      { role: 'system', content: 'You are a helpful assistant.' },
      { role: 'user', content: 'List 5 programming languages with one line description each.' }
    ],
    max_tokens: 300,
    temperature: 0.7,
  });

  const chatStart = Date.now();
  const chatRes = http.post(`${BASE_URL}/chat/completions`, chatPayload, {
    headers,
    tags: { name: 'chat_completion' },
  });
  const chatDuration = Date.now() - chatStart;
  llmLatency.add(chatDuration);

  let chatSuccess = false;
  if (chatRes.status === 200) {
    try {
      const data = JSON.parse(chatRes.body);
      chatSuccess = Boolean(data.choices && data.choices.length > 0);
    } catch (e) {
      // A 200 with a non-JSON body counts as a failed check, not a script error.
      chatSuccess = false;
    }
  } else if (chatRes.status === 429) {
    // Rate limited: back off before continuing. The request is not re-sent.
    const retryAfter = parseInt(chatRes.headers['Retry-After'], 10) || 5;
    sleep(retryAfter);
  }

  const chatOk = check(chatRes, {
    'chat status 200': (r) => r.status === 200,
    'chat has choices': () => chatSuccess,
    'chat latency < 5s': () => chatDuration < 5000,
  });
  // Record passes as well as failures; a Rate that only ever receives
  // failures reports 100% as soon as one request fails.
  errorRate.add(!chatOk);

  // Test 2: Embedding
  const embedPayload = JSON.stringify({
    model: 'text-embedding-3-small',
    input: `Sample document for embedding generation - iteration ${__ITER}`,
  });

  const embedRes = http.post(`${BASE_URL}/embeddings`, embedPayload, {
    headers,
    tags: { name: 'embeddings' },
  });

  const embedOk = check(embedRes, {
    'embed status 200': (r) => r.status === 200,
    'embed has data': (r) => {
      try {
        const data = JSON.parse(r.body);
        return data.data && data.data.length > 0;
      } catch (e) {
        return false;
      }
    },
  });
  errorRate.add(!embedOk);

  // Random think time between requests
  sleep(Math.random() * 2 + 0.5);
}
/**
 * End-of-test hook: prints the text summary to stdout and writes the full
 * summary data to summary.json.
 *
 * `textSummary` comes from the k6-summary jslib module and must be imported
 * at the top of the script; calling it without that import throws a
 * ReferenceError.
 */
export function handleSummary(data) {
  return {
    'stdout': textSummary(data, { indent: ' ', enableColors: true }),
    'summary.json': JSON.stringify(data, null, 2),
  };
}
การ Run k6 Test
# Basic k6 test
k6 run script.js

# Run with an environment variable
HOLYSHEEP_API_KEY=sk-xxx k6 run script.js

# Run with cloud output (requires a k6 cloud account)
k6 run script.js --out cloud

# Run with InfluxDB output (for Grafana monitoring)
k6 run script.js \
--out influxdb=http://localhost:8086/k6

# Run with JSON output (for CI/CD)
k6 run script.js --out json=results.json

# Run a specific scenario (the script must read __ENV.TEST_SCENARIO itself)
k6 run script.js --env TEST_SCENARIO=spike

# Run with tags for filtering
k6 run script.js --tag model=gpt-4.1 --tag region=ap-southeast
วิเคราะห์ผลลัพธ์และ KPIs ที่ต้องติดตาม
Key Metrics สำหรับ AI API
- p50/p95/p99 Latency — Latency ที่ยอมรับได้คือ p95 < 5s สำหรับ chat, < 2s สำหรับ embedding
- Error Rate — ควร < 1% รวมถึง timeout, 429, 500 errors
- Requests per Second (RPS) — Throughput สูงสุดที่ระบบรองรับ
- Token Throughput — Tokens ที่ประมวลผลได้ต่อวินาที
- Cost per 1K Requests — คำนวณต้นทุนต่อ request
ประมาณค่าใช้จ่ายจาก Load Test
จากการ test ที่ HolySheep AI ซึ่งมีอัตรา ¥1=$1 (ประหยัด 85%+ จากราคา OpenAI) ราคา 2026/MTok:
# สมมติฐานจาก load test results
Average tokens per request:
- Input: 200 tokens
- Output: 150 tokens
Total: 350 tokens/request
10,000 requests ต่อวัน
DAILY_REQUESTS=10000
TOKENS_PER_REQUEST=350
คำนวณค่าใช้จ่ายรายเดือน
python3 << 'EOF'
import math
DAILY_REQUESTS = 10000
TOKENS_PER_REQUEST = 350
DAYS_PER_MONTH = 30
Pricing from HolySheep AI (2026)
MODELS = {
"gpt-4.1": 8.0, # $8/MTok
"claude-sonnet-4.5": 15.0, # $15/MTok
"gemini-2.5-flash": 2.50, # $2.50/MTok
"deepseek-v3.2": 0.42, # $0.42/MTok
}
daily_tokens = DAILY_REQUESTS * TOKENS_PER_REQUEST
monthly_tokens = daily_tokens * DAYS_PER_MONTH / 1_000_000 # in millions
print(f"Monthly tokens: {monthly_tokens:.2f} M")
print("-" * 50)
for model, price_per_mtok in MODELS.items():
cost = monthly_tokens * price_per_mtok
print(f"{model}: ${cost:.2f}/month ({price_per_mtok}/MTok)")
Compare with OpenAI
openai_gpt4 = 15.0 # OpenAI GPT-4: $15/MTok input, $60/MTok output
holysheep_gpt4 = MODELS["gpt-4.1"]
savings = ((openai_gpt4 - holysheep_gpt4) / openai_gpt4) * 100
print("-" * 50)
print(f"GPT-4.1 savings vs