Trong bối cảnh chi phí AI API ngày càng được tối ưu hóa, việc kiểm soát traffic không chỉ là vấn đề bảo mật mà còn ảnh hưởng trực tiếp đến ngân sách vận hành. Bài viết này sẽ hướng dẫn chi tiết cách implement rate limiting với Nginx Lua cho hệ thống AI gateway, đồng thời so sánh chi phí thực tế giữa các nhà cung cấp năm 2026.
Bảng giá AI API 2026 - Dữ liệu đã xác minh
| Model | Output ($/MTok) | Input ($/MTok) | Chi phí 10 tỷ tokens/tháng ($) |
|---|---|---|---|
| GPT-4.1 | $8.00 | $2.00 | $80,000 |
| Claude Sonnet 4.5 | $15.00 | $3.00 | $150,000 |
| Gemini 2.5 Flash | $2.50 | $0.30 | $25,000 |
| DeepSeek V3.2 | $0.42 | $0.14 | $4,200 |
| HolySheep AI | $0.42 | $0.14 | $4,200 |
Với 10 tỷ tokens mỗi tháng, chênh lệch giữa Claude Sonnet 4.5 và DeepSeek V3.2 lên đến $145,800. Đây là lý do việc implement rate limiting chặt chẽ không chỉ bảo vệ hệ thống mà còn ngăn chặn việc tiêu tốn ngân sách do lỗi hoặc abuse.
Tại sao Rate Limiting quan trọng cho AI Gateway?
- Kiểm soát chi phí: Mỗi request đều tốn tiền thật. Không có rate limit = không có kiểm soát chi phí
- Chống abuse: Ngăn chặn việc một user/endpoint gửi hàng triệu request một cách vô tình hoặc cố ý
- Đảm bảo QoS: Đảm bảo tất cả users đều có trải nghiệm ổn định
- Bảo vệ backend: Tránh overload khi upstream API có vấn đề
Kiến trúc Rate Limiting với Nginx Lua
Cài đặt OpenResty (Nginx + LuaJIT)
# Ubuntu 22.04
sudo apt update
sudo apt install -y wget gnupg2 ca-certificates lsb-release
Thêm OpenResty repository
wget -qO - https://openresty.org/package/pubkey.gpg | sudo apt-key add -
echo "deb http://openresty.org/package/ubuntu $(lsb_release -sc) main" | \
sudo tee /etc/apt/sources.list.d/openresty.list
sudo apt update
sudo apt install -y openresty
Cài đặt module Redis (cho distributed rate limiting)
sudo apt install -y openresty-opm
sudo opm get openresty/lua-resty-redis
Rate Limiter Module (lua-resty-limit-traffic)
-- /etc/openresty/rate_limiter.lua
-- Rate limiting module for the AI API gateway (per-key, per-IP,
-- per-endpoint and per-minute spend checks backed by Redis).
local resty_lock = require "resty.lock" -- NOTE(review): required but never used below — confirm
local redis = require "resty.redis"
local cjson = require "cjson"
local _M = {}
-- Default limiter configuration; any field may be overridden by the
-- table passed to _M.check().
local DEFAULT_CONFIG = {
-- Per-API-key limits
key_limit_per_key = 100, -- requests/second
key_burst_per_key = 20, -- burst allowance
-- Per-client-IP limits
ip_limit = 50, -- requests/second
ip_burst = 10,
-- Per-endpoint limits, keyed by request URI
endpoint_limits = {
["/v1/chat/completions"] = { limit = 50, burst = 10 },
["/v1/completions"] = { limit = 30, burst = 5 },
["/v1/embeddings"] = { limit = 100, burst = 50 },
["/v1/images/generations"] = { limit = 5, burst = 2 },
},
-- Spend ceiling in dollars per minute for each key
cost_limit_per_minute = 10, -- $10/minute per key
-- Redis connection settings
-- NOTE(review): redis_db is never SELECTed by get_redis_conn — confirm intent.
redis_host = "127.0.0.1",
redis_port = 6379,
redis_password = "",
redis_db = 1,
}
-- Open a Redis connection using host/port/password from `config`.
-- Returns the connected client, or nil plus an error message.
-- NOTE(review): config.redis_db is never applied here (no SELECT) —
-- confirm whether database 0 is intended.
local function get_redis_conn(config)
    local client = redis:new()
    client:set_timeout(1000)

    local connected, connect_err = client:connect(config.redis_host, config.redis_port)
    if not connected then
        return nil, "Redis connect failed: " .. connect_err
    end

    local password = config.redis_password
    if password and password ~= "" then
        local authed, auth_err = client:auth(password)
        if not authed then
            return nil, "Redis auth failed: " .. auth_err
        end
    end

    return client
end
-- Fixed-window counter with burst headroom. (Despite the original
-- "token bucket" label, this is a plain INCR-per-window counter: the
-- first hit in a window arms the TTL, and requests pass while the
-- counter stays within limit + burst.)
-- @param red    Redis client
-- @param key    logical identifier to rate limit
-- @param limit  base requests allowed per window
-- @param burst  extra headroom on top of `limit`
-- @param window window length in seconds
-- @return table { allowed, remaining, limit, reset } or nil, err
local function check_rate_limit(red, key, limit, burst, window)
    local counter_key = "ratelimit:" .. key

    local hits, incr_err = red:incr(counter_key)
    if not hits then
        return nil, "Redis incr failed: " .. incr_err
    end

    -- First request of this window: start the window clock.
    if hits == 1 then
        red:expire(counter_key, window)
    end

    local capacity = limit + burst
    return {
        allowed = hits <= capacity,
        remaining = math.max(0, capacity - hits),
        limit = capacity,
        reset = window
    }, nil
end
-- Sliding-window rate limit backed by a Redis sorted set: each allowed
-- request is recorded as a member scored with its timestamp (ms), and
-- entries older than the window are pruned on every call.
-- @param red    Redis client
-- @param key    logical identifier (e.g. "key:<api_key>")
-- @param limit  max requests allowed inside the window
-- @param window window length in seconds
-- @return table { allowed, remaining, limit [, retry_after] } or nil, err
local function check_sliding_window(red, key, limit, window)
    local now = ngx.now() * 1000
    local window_start = now - (window * 1000)
    local key_prefix = "sliding:" .. key
    -- Drop entries that fell out of the window
    red:zremrangebyscore(key_prefix, 0, window_start)
    -- Count requests still inside the window
    local count, err = red:zcard(key_prefix)
    if not count then
        return nil, "Redis zcard failed: " .. err
    end
    if count < limit then
        -- FIX: ZADD requires a score AND a member; the original passed
        -- only the score, so no request was ever recorded. Use a unique
        -- member so two requests in the same millisecond both count.
        red:zadd(key_prefix, now, now .. "-" .. math.random(1000000))
        red:expire(key_prefix, window + 1)
        return { allowed = true, remaining = limit - count - 1, limit = limit }, nil
    end
    return {
        allowed = false,
        remaining = 0,
        limit = limit,
        retry_after = window
    }, nil
end
-- Enforce a per-minute spend ceiling for an API key.
-- FIX: the original computed window_start but never used it — it kept a
-- single "cost:<key>" counter whose TTL was refreshed on every write,
-- so the spend never reset per minute. The key is now scoped to the
-- current minute (same layout as cost_tracker.lua's minute_key), so the
-- counter rolls over naturally.
-- @return allowed (boolean), spend-in-window after this request (number)
local function check_cost_limit(red, api_key, cost_per_request, limit_per_minute)
    local now = ngx.now()
    -- Minute-scoped key: resets automatically at each minute boundary.
    local window = math.floor(now / 60)
    local key = "cost:" .. api_key .. ":" .. window
    -- tonumber() maps a missing key (ngx.null) to 0.
    local cost = tonumber(red:get(key)) or 0
    local new_cost = cost + cost_per_request
    if new_cost > limit_per_minute then
        -- Rejected requests are not recorded against the window.
        return false, new_cost
    end
    red:set(key, new_cost)
    red:expire(key, 120) -- keep 2 minutes so the window stays inspectable
    return true, new_cost
end
-- Main entry point: run every limiter check for the current request.
-- Order: per-API-key → per-IP → per-endpoint → per-minute spend ceiling.
-- On rejection a 429 JSON body is emitted and (false, reason) returned;
-- the caller (access_by_lua) must then finalize the phase — e.g. with
-- ngx.exit(ngx.HTTP_OK) — otherwise nginx continues to the proxy pass.
-- Fails open (returns true) when Redis is unreachable.
-- @param config table overriding DEFAULT_CONFIG fields (optional)
-- @param ctx    reserved per-request context (optional, currently unused)
-- @return boolean ok, string|nil reason code
function _M.check(config, ctx)
    config = config or DEFAULT_CONFIG
    ctx = ctx or {}

    local red, conn_err = get_redis_conn(config)
    if not red then
        ngx.log(ngx.ERR, "Rate limiter Redis error: ", conn_err)
        -- Fail open: serving unmetered beats taking the gateway down.
        return true, nil
    end

    local api_key = ngx.var.http_x_api_key or ngx.var.http_authorization or ""
    api_key = api_key:gsub("^Bearer%s+", "")
    local client_ip = ngx.var.remote_addr
    -- FIX: use $uri (no query string). The original matched endpoint
    -- limits against $request_uri, so clients could bypass them by
    -- appending "?x=1".
    local request_uri = ngx.var.uri

    -- Emit the 429 response, return the Redis connection to the
    -- keepalive pool (the original leaked it on every rejection path),
    -- and hand back (false, reason).
    local function reject(reason, message, err_type, err_code, limit, retry_after)
        if limit then
            ngx.header["X-RateLimit-Limit"] = limit
            ngx.header["X-RateLimit-Remaining"] = 0
        end
        if retry_after then
            ngx.header["Retry-After"] = retry_after
        end
        ngx.status = 429
        ngx.say(cjson.encode({
            error = { message = message, type = err_type, code = err_code }
        }))
        red:set_keepalive(1000, 100)
        return false, reason
    end

    -- 1. Per-API-key limit (1 s sliding window)
    local result, err = check_sliding_window(
        red, "key:" .. api_key, config.key_limit_per_key, 1)
    if err then
        ngx.log(ngx.ERR, "Key rate limit check failed: ", err)
    elseif not result.allowed then
        return reject("KEY_LIMIT_EXCEEDED", "Rate limit exceeded for API key",
            "rate_limit_error", "RATE_LIMIT_KEY", result.limit, result.retry_after or 1)
    end

    -- 2. Per-client-IP limit
    result, err = check_sliding_window(red, "ip:" .. client_ip, config.ip_limit, 1)
    if err then
        ngx.log(ngx.ERR, "IP rate limit check failed: ", err)
    elseif not result.allowed then
        return reject("IP_LIMIT_EXCEEDED", "Rate limit exceeded for IP",
            "rate_limit_error", "RATE_LIMIT_IP", result.limit, nil)
    end

    -- 3. Per-endpoint limit (guarded: endpoint_limits may be omitted
    -- from a caller-supplied config)
    local endpoint_config = config.endpoint_limits and config.endpoint_limits[request_uri]
    if endpoint_config then
        result, err = check_sliding_window(
            red, "endpoint:" .. request_uri .. ":" .. api_key, endpoint_config.limit, 1)
        if err then
            ngx.log(ngx.ERR, "Endpoint rate limit check failed: ", err)
        elseif not result.allowed then
            return reject("ENDPOINT_LIMIT_EXCEEDED", "Rate limit exceeded for this endpoint",
                "rate_limit_error", "RATE_LIMIT_ENDPOINT", nil, nil)
        end
    end

    -- 4. Spend ceiling. The real cost is only known after the upstream
    -- response arrives, so a small flat estimate is charged here.
    local estimated_cost = 0.000001 -- one small request
    local cost_ok, current_cost = check_cost_limit(
        red, api_key, estimated_cost, config.cost_limit_per_minute)
    if not cost_ok then
        return reject("COST_LIMIT_EXCEEDED",
            "Cost limit exceeded. Current: $" .. string.format("%.4f", current_cost)
                .. "/min, Limit: $" .. config.cost_limit_per_minute .. "/min",
            "cost_limit_error", "COST_LIMIT_EXCEEDED", nil, nil)
    end

    -- Informational headers for allowed requests
    ngx.header["X-RateLimit-Limit"] = config.key_limit_per_key
    ngx.header["X-RateLimit-Remaining"] = result and result.remaining or "unlimited"

    ngx.log(ngx.INFO, string.format(
        "[RATE_LIMIT] key=%s ip=%s uri=%s cost=$%.6f",
        api_key:sub(1, 8) .. "...", client_ip, request_uri, current_cost))

    red:set_keepalive(1000, 100)
    return true, nil
end
return _M
OpenResty Nginx Configuration
# /etc/openresty/nginx.conf
# Main OpenResty configuration for the AI gateway.
worker_processes auto;
error_log /var/log/openresty/error.log warn;
pid /var/run/openresty.pid;
events {
worker_connections 10240;
use epoll;
}
http {
include /etc/openresty/mime.types;
default_type application/json;
# Access-log format including request and upstream timing fields
log_format main '$remote_addr - $remote_user [$time_local] "$request" '
'$status $body_bytes_sent "$http_referer" '
'"$http_user_agent" "$http_x_forwarded_for" '
'rt=$request_time uct=$upstream_connect_time '
'uht=$upstream_header_time urt=$upstream_response_time';
access_log /var/log/openresty/access.log main;
# Buffer settings
client_body_buffer_size 16k;
client_header_buffer_size 1k;
large_client_header_buffers 4 16k;
# Client-side timeouts
client_body_timeout 30s;
client_header_timeout 30s;
send_timeout 60s;
# Upstream proxy timeouts
proxy_connect_timeout 60s;
proxy_send_timeout 60s;
proxy_read_timeout 120s;
# Shared memory zones for local (fallback) rate limiting
lua_shared_dict ratelimit 10m;
lua_shared_dict costs 10m;
# Load the rate limiter once at worker init and cache the module
init_by_lua_block {
package.path = "/etc/openresty/?.lua;/usr/local/openresty/lua-resty-redis/lib/?.lua;" .. package.path
local rate_limiter = require "rate_limiter"
package.loaded.rate_limiter = rate_limiter
}
server {
listen 80;
listen [::]:80;
server_name _;
# Default catch-all route
location / {
default_type text/plain;
return 200 "AI Gateway Running\n";
}
# Proxy endpoint - AI API Gateway
location ~ ^/v1/(chat/completions|completions|embeddings|images/generations) {
    # Rate limiting check (access phase)
    access_by_lua_block {
        local rate_limiter = package.loaded.rate_limiter
        local config = {
            key_limit_per_key = 100,
            ip_limit = 50,
            cost_limit_per_minute = 10,
            redis_host = "127.0.0.1",
            redis_port = 6379,
            redis_password = "",
            redis_db = 1,
            endpoint_limits = {
                ["/v1/chat/completions"] = { limit = 50, burst = 10 },
                ["/v1/embeddings"] = { limit = 100, burst = 50 },
            }
        }
        local ok, err = rate_limiter.check(config)
        if not ok then
            ngx.log(ngx.WARN, "Rate limit exceeded: ", err)
            -- FIX: the original did a bare `return`, which ends the Lua
            -- handler but NOT the access phase — rejected requests were
            -- still proxied upstream. ngx.exit finalizes the request
            -- with the 429 response rate_limiter.check already emitted.
            return ngx.exit(ngx.HTTP_OK)
        end
    }
    # Request logging (log phase)
    log_by_lua_block {
        local api_key = ngx.var.http_x_api_key or ""
        -- NOTE(review): $upstream_response_time is a latency in seconds,
        -- not a dollar cost — confirm what this field should capture.
        local cost = ngx.var.upstream_response_time or 0
        ngx.ctx.request_cost = cost
    }
    # Proxy to HolySheep AI
    set $upstream_host "api.holysheep.ai";
    set $upstream_url "https://api.holysheep.ai$request_uri";
    proxy_pass $upstream_url;
    proxy_http_version 1.1;
    proxy_set_header Host $upstream_host;
    proxy_set_header X-Real-IP $remote_addr;
    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    proxy_set_header X-Forwarded-Proto $scheme;
    # Preserve API key headers end-to-end
    proxy_set_header Authorization $http_authorization;
    proxy_set_header X-API-Key $http_x_api_key;
    # Force JSON content type toward upstream
    proxy_set_header Content-Type "application/json";
    # Response buffering
    proxy_buffering on;
    proxy_buffer_size 4k;
    proxy_buffers 8 4k;
    # Long read timeout for slow AI generations
    proxy_read_timeout 180s;
    proxy_send_timeout 30s;
}
# Health check endpoint: reports Redis reachability and worker stats
location /health {
    access_log off;
    default_type application/json;
    set $redis_status "unknown";
    set $upstream_status "unknown";
    # Probe Redis with a short timeout
    access_by_lua_block {
        local redis = require "resty.redis"
        local red = redis:new()
        red:set_timeout(500)
        local ok, err = red:connect("127.0.0.1", 6379)
        if ok then
            ngx.var.redis_status = "healthy"
            red:set_keepalive(100, 10)
        else
            ngx.var.redis_status = "unhealthy: " .. (err or "unknown")
        end
    }
    content_by_lua_block {
        local status = {
            status = "ok",
            timestamp = ngx.now(),
            redis = ngx.var.redis_status,
            -- FIX: ngx.version does not exist; the core variable is
            -- $nginx_version.
            nginx_version = ngx.var.nginx_version,
            connections = {
                -- FIX: the stub_status variables are plural
                -- ($connections_*); the singular names used by the
                -- original always evaluate to nil.
                active = ngx.var.connections_active,
                reading = ngx.var.connections_reading,
                writing = ngx.var.connections_writing,
                waiting = ngx.var.connections_waiting
            }
        }
        ngx.say(require("cjson").encode(status))
    }
}
# Prometheus metrics endpoint
location /metrics {
    access_log off;
    content_by_lua_block {
        local redis = require "resty.redis"
        local red = redis:new()
        red:set_timeout(500)
        local total_requests = 0
        -- NOTE(review): KEYS is O(N) and blocks Redis; acceptable for
        -- small deployments, use SCAN for production-sized keyspaces.
        if red:connect("127.0.0.1", 6379) then
            local keys = red:keys("ratelimit:*")
            if type(keys) == "table" then
                for _, key in ipairs(keys) do
                    -- A key may expire between KEYS and GET; tolerate nil.
                    total_requests = total_requests + (tonumber(red:get(key)) or 0)
                end
            end
            red:set_keepalive(100, 10)
        end
        -- FIX: ngx.say already appends "\n"; the original embedded an
        -- extra "\n" per line, producing blank lines in the output.
        ngx.say("# HELP ai_gateway_requests_total Total requests")
        ngx.say("# TYPE ai_gateway_requests_total counter")
        ngx.say("ai_gateway_requests_total " .. total_requests)
        ngx.say("# HELP ai_gateway_up Up status")
        ngx.say("# TYPE ai_gateway_up gauge")
        ngx.say("ai_gateway_up 1")
    }
}
# Error handling: map upstream/server failures to a JSON error body
error_page 500 502 503 504 /50x.html;
location = /50x.html {
default_type application/json;
content_by_lua_block {
ngx.say(require("cjson").encode({
error = {
message = "Internal server error",
type = "server_error"
}
}))
}
}
}
}
Cost Tracking Module - Theo dõi chi phí thực tế
-- /etc/openresty/cost_tracker.lua
-- Tracks the real token cost of each proxied AI request.
local redis = require "resty.redis"
local cjson = require "cjson"
local _M = {}
-- Sample pricing table (2026), in USD per million tokens ($/MTok).
local PRICING = {
["gpt-4.1"] = { input = 2.00, output = 8.00 }, -- $/MTok
["claude-sonnet-4.5"] = { input = 3.00, output = 15.00 },
["gemini-2.5-flash"] = { input = 0.30, output = 2.50 },
["deepseek-v3.2"] = { input = 0.14, output = 0.42 },
}
-- Connect to the local Redis instance with a 1 s timeout.
-- @return client or nil, err
local function get_redis()
    local client = redis:new()
    client:set_timeout(1000)
    local connected, err = client:connect("127.0.0.1", 6379)
    if connected then
        return client
    end
    return nil, err
end
-- Extract token usage from an upstream JSON response body.
-- FIX: always returns three numbers (prompt, completion, total). The
-- original returned only two on the error paths, leaving `total` nil
-- and crashing the caller's `total_tokens > 0` comparison. A type()
-- guard also protects against bodies that decode to non-table values.
local function parse_tokens(response_body)
    if not response_body then
        return 0, 0, 0
    end
    local ok, result = pcall(cjson.decode, response_body)
    if not ok or type(result) ~= "table" then
        return 0, 0, 0
    end
    local usage = type(result.usage) == "table" and result.usage or {}
    local prompt_tokens = usage.prompt_tokens or 0
    local completion_tokens = usage.completion_tokens or 0
    local total_tokens = usage.total_tokens or (prompt_tokens + completion_tokens)
    return prompt_tokens, completion_tokens, total_tokens
end
-- Pull the "model" field from a JSON request body.
-- Returns "unknown" when the body is absent, unparsable, not an object,
-- or has no truthy "model" field. (The type() guard fixes a crash in
-- the original when the body decodes to a non-table JSON value, e.g.
-- a bare number or string.)
local function extract_model(request_body)
    if not request_body then
        return "unknown"
    end
    local ok, body = pcall(cjson.decode, request_body)
    if ok and type(body) == "table" and body.model then
        return body.model
    end
    return "unknown"
end
-- Price one request: tokens are billed per million ($/MTok).
-- Unknown models fall back to the deepseek-v3.2 (cheapest) tier.
local function calculate_cost(model, prompt_tokens, completion_tokens)
    local TOKENS_PER_MTOK = 1000000
    local rates = PRICING[model] or PRICING["deepseek-v3.2"]
    local input_cost = (prompt_tokens / TOKENS_PER_MTOK) * rates.input
    local output_cost = (completion_tokens / TOKENS_PER_MTOK) * rates.output
    return input_cost + output_cost
end
-- Record the cost of one request in Redis: per-minute and per-day
-- rolling totals plus a per-minute detail hash, and emit a WARN log
-- when a key's spend exceeds $5 in the current minute.
-- NOTE(review): thresholds and TTLs are hard-coded here — confirm they
-- should not come from configuration.
function _M.log_cost(api_key, model, prompt_tokens, completion_tokens, cost)
local red, err = get_redis()
if not red then
ngx.log(ngx.ERR, "Cost tracker Redis error: ", err)
return
end
local now = ngx.now()
local minute_key = "cost:" .. api_key .. ":" .. math.floor(now / 60)
local day_key = "cost:" .. api_key .. ":" .. os.date("%Y-%m-%d")
-- Rolling counters: minute key kept 1 h, day key kept 30 days
red:incrbyfloat(minute_key, cost)
red:expire(minute_key, 3600)
red:incrbyfloat(day_key, cost)
red:expire(day_key, 86400 * 30)
-- Per-minute detail hash: request count, token counts, and cost
local hash_key = "cost_detail:" .. api_key .. ":" .. math.floor(now / 60)
red:hincrby(hash_key, "requests", 1)
red:hincrby(hash_key, "prompt_tokens", prompt_tokens)
red:hincrby(hash_key, "completion_tokens", completion_tokens)
red:hincrbyfloat(hash_key, "cost", cost)
red:expire(hash_key, 3600)
-- Alert when this key's spend in the current minute crosses $5
local current_minute_cost = tonumber(red:get(minute_key)) or 0
if current_minute_cost > 5 then -- $5/minute threshold
ngx.log(ngx.WARN, string.format(
"[COST_ALERT] API Key %s exceeded $%.2f/min (threshold: $5.00)",
api_key:sub(1, 8) .. "...",
current_minute_cost
))
end
red:set_keepalive(1000, 100)
end
-- body_filter handler: runs on each response chunk from the upstream.
-- Parses token usage from the chunk, prices it, and records the cost.
-- NOTE(review): ngx.arg[1] is a single body chunk, not necessarily the
-- whole response — large or streamed responses should be accumulated in
-- ngx.ctx before parsing. Verify against the deployment's buffer sizes.
function _M.filter()
    -- Only AI endpoints are metered
    local uri = ngx.var.uri
    if not uri:match("^/v1/") then
        return
    end

    local resp_body = ngx.arg[1]
    local cost = 0
    if resp_body and resp_body ~= "" then
        local api_key = ngx.var.http_x_api_key or ""
        api_key = api_key:gsub("^Bearer%s+", "")
        -- ngx.var.request_body is only populated when the request body
        -- was read into memory; fall back to an empty JSON object.
        local request_body = ngx.var.request_body or "{}"
        local model = extract_model(request_body)
        local prompt_tokens, completion_tokens, total_tokens = parse_tokens(resp_body)
        if total_tokens and total_tokens > 0 then
            cost = calculate_cost(model, prompt_tokens, completion_tokens)
            -- FIX: the original called the undefined global `log_cost`
            -- (a nil call error); the function lives on the module table.
            _M.log_cost(api_key, model, prompt_tokens, completion_tokens, cost)
            ngx.log(ngx.INFO, string.format(
                "[COST] key=%s model=%s tokens=%d cost=$%.6f",
                api_key:sub(1, 8) .. "...",
                model,
                total_tokens,
                cost
            ))
        end
    end
    -- FIX: do NOT force ngx.arg[2] (eof) here — the original set it on
    -- every chunk, which truncates multi-chunk responses.
end
return _M
Script Test Rate Limiting
#!/bin/bash
# test_rate_limit.sh - smoke tests for the rate limiter.
# FIX: several comment lines in the original were missing the leading
# '#' and would have been executed as shell commands.

GATEWAY_URL="http://localhost"
API_KEY="test-api-key-12345"

echo "=== Testing Rate Limiting ==="
echo ""

# Test 1: a single request should pass through
echo "[Test 1] Single request:"
curl -s -X POST "${GATEWAY_URL}/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -H "X-API-Key: ${API_KEY}" \
  -d '{"model":"deepseek-v3.2","messages":[{"role":"user","content":"Hello"}]}' \
  | jq -r '.error.message // .choices[0].message.content // "OK"' 2>/dev/null || echo "Response received"
echo ""

# Test 2: a burst of requests should trip the limiter
echo "[Test 2] Rapid requests (should trigger rate limit):"
success=0
rate_limited=0
for i in {1..150}; do
  response=$(curl -s -w "\n%{http_code}" -X POST "${GATEWAY_URL}/v1/chat/completions" \
    -H "Content-Type: application/json" \
    -H "X-API-Key: ${API_KEY}" \
    -d '{"model":"deepseek-v3.2","messages":[{"role":"user","content":"Test"}]}' 2>/dev/null)
  http_code=$(echo "$response" | tail -n1)
  if [ "$http_code" = "200" ]; then
    ((success++))
  elif [ "$http_code" = "429" ]; then
    ((rate_limited++))
    echo "  Request $i: Rate limited (429)"
  fi
  # Small delay to avoid overwhelming the gateway
  sleep 0.01
done
echo ""
echo "=== Results ==="
echo "Success: $success"
echo "Rate Limited: $rate_limited"
echo ""

# Test 3: the 429/200 responses should carry rate-limit headers
echo "[Test 3] Check rate limit headers:"
curl -I -X POST "${GATEWAY_URL}/v1/chat/completions" \
  -H "X-API-Key: ${API_KEY}" \
  -H "Content-Type: application/json" \
  -d '{"model":"deepseek-v3.2","messages":[]}' 2>/dev/null | grep -i "ratelimit\|x-rate"
echo ""

# Test 4: health endpoint
echo "[Test 4] Health check:"
curl -s "${GATEWAY_URL}/health" | jq '.'
echo ""

# Test 5: Prometheus metrics
echo "[Test 5] Metrics:"
curl -s "${GATEWAY_URL}/metrics" | head -20
Docker Compose - Triển khai nhanh
version: '3.8'
services:
  openresty:
    image: openresty/openresty:alpine
    container_name: ai-gateway
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx.conf:/etc/openresty/nginx.conf:ro
      - ./rate_limiter.lua:/etc/openresty/rate_limiter.lua:ro
      - ./cost_tracker.lua:/etc/openresty/cost_tracker.lua:ro
      - ./lua-resty-redis:/usr/local/openresty/lua-resty-redis
    environment:
      # FIX: no quotes around the value — in list-form environment
      # entries the quotes become part of LUA_PATH itself.
      - LUA_PATH=/etc/openresty/?.lua;/usr/local/openresty/lua-resty-redis/lib/?.lua;;
    depends_on:
      - redis
    networks:
      - ai-network
    restart: unless-stopped
    healthcheck:
      # FIX: the exec-form marker must be upper-case "CMD"; lower-case
      # "cmd" is rejected by docker compose.
      test: ["CMD", "curl", "-f", "http://localhost/health"]
      interval: 30s
      timeout: 10s
      retries: 3
  redis:
    image: redis:7-alpine
    container_name: ai-redis
    ports:
      - "6379:6379"
    volumes:
      - redis-data:/data
    command: redis-server --appendonly yes --maxmemory 512mb --maxmemory-policy allkeys-lru
    networks:
      - ai-network
    restart: unless-stopped
  prometheus:
    image: prom/prometheus:latest
    container_name: ai-prometheus
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
    networks:
      - ai-network
    restart: unless-stopped
networks:
  ai-network:
    driver: bridge
volumes:
  redis-data:
Lỗi thường gặp và cách khắc phục
| Lỗi | Nguyên nhân | Giải pháp |
|---|---|---|
| `lua-resty-redis: failed to load` / `module 'resty.redis' not found` | Module Redis chưa được cài đặt hoặc `LUA_PATH` chưa đúng | Cài module bằng `opm get openresty/lua-resty-redis` và kiểm tra lại `lua_package_path`/`LUA_PATH` |
| `Redis connection refused` / `connect() failed` | Redis chưa chạy hoặc port không đúng | Khởi động Redis (`systemctl start redis` hoặc container `ai-redis`) và kiểm tra host/port trong config |
| Rate limit không hoạt động, tất cả requests đều pass | `access_by_lua_block` return sai hoặc logic ngược | Gọi `ngx.exit()` sau khi từ chối request để kết thúc access phase thay vì chỉ `return` |
| CORS Error khi test (`No 'Access-Control-Allow-Origin'`) | Thiếu CORS headers | Thêm `add_header Access-Control-Allow-Origin` và xử lý preflight request `OPTIONS` |
| `502 Bad Gateway` / `Upstream prematurely closed connection` | Timeout quá ngắn hoặc upstream quá tải | Tăng `proxy_read_timeout` và kiểm tra tình trạng upstream |
| Cost tracking không chính xác, chi phí không khớp | Response chưa được parse đúng | Gom đủ toàn bộ body response trước khi parse trường `usage` (chú ý response trả về theo nhiều chunk) |
So sánh giải pháp AI Gateway
| Tiêu chí | Nginx + Lua (Self-hosted) | Cloudflare AI Gateway | API7/APISIX | HolySheep AI |
|---|---|---|---|---|
| Chi phí hàng tháng | $50-200 (server + Redis) | Miễn phí tier | $500+/tháng | Trả theo mức sử dụng |
Tài nguyên liên quan

🔥 Thử HolySheep AI — Cổng AI API trực tiếp. Hỗ trợ Claude, GPT-5, Gemini, DeepSeek — một khóa, không cần VPN.