Graceful Shutdown AI Inference Strategy: ConnectionErrorと戦う実践的アプローチ

本番環境のAI推論システムで、突然のプロセス停止による「ConnectionError: timeout」「500 Internal Server Error」は致命的な障害となり得ます。

本稿では、HolySheep AI APIを活用した堅牢なグレースフルシャットダウン戦略を、筆者が実際に直面した障害を例に説明します。

問題の背景：突然のシャットダウンによる503エラー

私の本番環境では、Kubernetesポッドのスケールダウン時に、未完了の推論リクエストが途中で切断され、最大40秒間の502 Bad Gatewayエラーを記録していました。HolySheep AIの<50msレイテンシを活かそうにも、接続管理の甘さが足を引っ張っていたのです。

基本的なグレースフルシャットダウン実装

import signal
import time
import threading
from typing import Optional
from openai import OpenAI

# HolySheep AI クライアント初期化
# Module-level client shared by generate_text(); the OpenAI SDK is pointed at
# the HolySheep endpoint through the base_url override.
client = OpenAI(
    api_key="YOUR_HOLYSHEEP_API_KEY",
    base_url="https://api.holysheep.ai/v1",
    timeout=30.0,  # client-wide default; generate_text() passes its own 25.0 per call
    max_retries=3  # retry budget handed to the SDK
)

class GracefulShutdownHandler:
    """Track in-flight requests and expose a shutdown flag.

    Callers register a request ID before starting work and complete it when
    the work ends. A shutdown routine sets ``shutdown_event`` and then calls
    ``wait_for_completion`` to give registered requests time to finish.
    """

    def __init__(self):
        # Set when shutdown has been requested; nothing in this class clears it.
        self.shutdown_event = threading.Event()
        # request_id -> True for every request currently registered.
        self.active_requests: dict[str, bool] = {}
        # Serialises mutations of active_requests across threads.
        self.lock = threading.Lock()

    def register_request(self, request_id: str):
        """Mark ``request_id`` as in flight."""
        with self.lock:
            self.active_requests[request_id] = True

    def complete_request(self, request_id: str):
        """Remove ``request_id`` from the in-flight set; unknown IDs are ignored."""
        with self.lock:
            self.active_requests.pop(request_id, None)

    def is_shutting_down(self) -> bool:
        """Return True once ``shutdown_event`` has been set."""
        return self.shutdown_event.is_set()

    def wait_for_completion(self, timeout: float = 30.0):
        """Block until no requests are registered or ``timeout`` elapses.

        Args:
            timeout: Maximum time to wait, in seconds.

        Raises:
            TimeoutError: Requests are still registered when the deadline passes.
        """
        # A monotonic deadline is immune to wall-clock adjustments (NTP steps,
        # manual changes) that could fire the timeout early or never.
        deadline = time.monotonic() + timeout
        # Deliberately lock-free: this method is called from a signal handler,
        # which runs in the main thread and may interrupt code that already
        # holds self.lock. Acquiring the non-reentrant lock here could deadlock.
        while self.active_requests:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                raise TimeoutError("リクエスト完了待機がタイムアウトしました")
            # Never sleep past the deadline, so short timeouts are honoured.
            time.sleep(min(0.1, remaining))

# Single module-level instance used by both shutdown_handler() and generate_text().
handler = GracefulShutdownHandler()

def shutdown_handler(signum, frame):
    """Signal handler: stop accepting work, drain requests, then exit.

    Args:
        signum: Number of the received signal (unused).
        frame: Interrupted stack frame (unused).

    Raises:
        SystemExit: Always, once draining has finished or timed out.
    """
    print("⚠️ シャットダウン信号受信 - リクエスト完了待機中...")
    handler.shutdown_event.set()
    # Wait up to 30 seconds for registered requests to finish.
    # NOTE: Python runs signal handlers in the main thread, so only requests
    # executing in other threads can complete during this wait; a request
    # running in the main thread stays blocked until the handler returns.
    try:
        handler.wait_for_completion(timeout=30.0)
    except TimeoutError:
        print("⚠️ タイムアウト - リクエストを強制終了")
    print("✅ グレースフルシャットダウン完了")
    # Installing this handler replaces the default terminate-on-signal
    # behaviour, so the process must be ended explicitly. Without this the
    # program would keep running after reporting that shutdown completed.
    raise SystemExit(0)

# Route both SIGTERM and SIGINT to shutdown_handler.
signal.signal(signal.SIGTERM, shutdown_handler)
signal.signal(signal.SIGINT, shutdown_handler)

def generate_text(prompt: str, model: str = "gpt-4o") -> Optional[str]:
    """Run one chat-completion request while tracking it for graceful shutdown.

    Args:
        prompt: User message sent to the model.
        model: Model name passed to the API.

    Returns:
        The content of the first choice, or None when shutdown has already
        been requested and the call is skipped.

    Raises:
        Exception: Any error raised by the client call is logged and re-raised.
    """
    import uuid  # local import so this function does not depend on file-level imports

    # A millisecond timestamp alone collides when two threads start within the
    # same millisecond. They would then share one tracking entry, and the
    # first to finish would remove it while the other is still running. The
    # random suffix keeps every in-flight request distinct.
    request_id = f"req_{int(time.time() * 1000)}_{uuid.uuid4().hex[:8]}"
    # Register before checking the flag so a shutdown that begins in between
    # still sees this request as active.
    handler.register_request(request_id)

    try:
        if handler.is_shutting_down():
            print(f"[{request_id}] シャットダウン中のためスキップ")
            return None

        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            timeout=25.0
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"[{request_id}] エラー: {type(e).__name__}: {e}")
        raise
    finally:
        # Always unregister, on success, skip or failure, so shutdown never
        # waits on a request that has already ended.
        handler.complete_request(request_id)

# 使用例
if __name__ == "__main__":
    # Smoke test: send one prompt and print the returned text
    # (generate_text returns None when shutdown has already been requested).
    result = generate_text("Hello, world!")
    print(f"結果: {result}")

バッチ推論用のリトライ付き実装

import asyncio
import aiohttp
from typing import List, Dict, Any
from dataclasses import dataclass
from enum import Enum
import logging

# Configure root logging at INFO level.
logging.basicConfig(level=logging.INFO)
# Module-scoped logger used by HolySheepBatchProcessor.
logger = logging.getLogger(__name__)

class RequestStatus(Enum):
    """Lifecycle state of an InferenceTask."""
    # Default state of a newly created task.
    PENDING = "pending"
    # Set by the processor just before it waits for a concurrency slot.
    IN_PROGRESS = "in_progress"
    # The API call returned a result.
    COMPLETED = "completed"
    # The call failed, was cancelled, or was rejected because of shutdown.
    FAILED = "failed"

@dataclass
class InferenceTask:
    """One prompt submitted to the batch processor, with its outcome."""
    # Identifier assigned by the processor when the batch is built.
    task_id: str
    # User message sent to the model.
    prompt: str
    # Current lifecycle state; starts as PENDING.
    status: RequestStatus = RequestStatus.PENDING
    # Model output once the task has completed, otherwise None.
    result: Optional[str] = None
    # Failure description once the task has failed, otherwise None.
    error: Optional[str] = None
    # NOTE(review): not updated by any code visible in this file.
    retry_count: int = 0

class HolySheepBatchProcessor:
    """Run batches of prompts against the HolySheep chat-completions API.

    Concurrency is capped by a semaphore, failed calls are retried, and a
    cooperative shutdown flag rejects queued work while in-flight calls are
    allowed to finish.
    """

    def __init__(
        self,
        api_key: str,
        max_concurrent: int = 5,
        max_retries: int = 3,
        retry_delay: float = 1.0
    ):
        """Store configuration and create the concurrency limiter.

        Args:
            api_key: Bearer token sent in the Authorization header.
            max_concurrent: Maximum number of API calls in flight at once.
            max_retries: Total number of attempts made per task.
            retry_delay: Base delay in seconds between attempts.
        """
        self.api_key = api_key
        self.max_concurrent = max_concurrent
        self.max_retries = max_retries
        self.retry_delay = retry_delay
        # Flipped to True by shutdown(); checked before each task starts.
        self._shutdown = False
        # Every task ever submitted, keyed by task_id.
        self._tasks: Dict[str, InferenceTask] = {}
        self._semaphore = asyncio.Semaphore(max_concurrent)

    async def _call_api(
        self,
        session: aiohttp.ClientSession,
        task: InferenceTask
    ) -> str:
        """Call the chat-completions endpoint for one task, with retries.

        HTTP 429 is retried with exponential backoff and 5xx with a fixed
        delay. Transport or parsing errors are retried until the final
        attempt. Any other HTTP status is treated as permanent and raised
        immediately.

        Args:
            session: Open aiohttp session used for the request.
            task: Task whose prompt is sent.

        Returns:
            The message content of the first choice in the response.

        Raises:
            aiohttp.ClientResponseError: Non-retryable HTTP status.
            asyncio.CancelledError: The surrounding task was cancelled.
            RuntimeError: Every attempt ended in a retryable HTTP status.
            Exception: The last attempt failed with a transport/parsing error.
        """
        url = "https://api.holysheep.ai/v1/chat/completions"
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }
        payload = {
            "model": "gpt-4o",
            "messages": [{"role": "user", "content": task.prompt}],
            "temperature": 0.7
        }

        for attempt in range(self.max_retries):
            is_last_attempt = attempt == self.max_retries - 1
            permanent_error = None
            rate_limited = False
            wait_time = self.retry_delay
            try:
                async with session.post(
                    url,
                    json=payload,
                    headers=headers,
                    timeout=aiohttp.ClientTimeout(total=30)
                ) as response:
                    if response.status == 200:
                        data = await response.json()
                        return data["choices"][0]["message"]["content"]
                    elif response.status == 429:
                        # Rate limited: back off exponentially.
                        rate_limited = True
                        wait_time = self.retry_delay * (2 ** attempt)
                    elif response.status >= 500:
                        # Server error: retry after the base delay.
                        wait_time = self.retry_delay
                    else:
                        # Build the error here but raise it outside the
                        # try-block. Raising inside would let the generic
                        # handler below retry a request that cannot succeed
                        # (for example 400 or 401).
                        permanent_error = aiohttp.ClientResponseError(
                            response.request_info,
                            response.history,
                            status=response.status
                        )
            except asyncio.CancelledError:
                logger.info(f"[{task.task_id}] タスクがキャンセルされました")
                raise
            except Exception as e:
                if is_last_attempt:
                    raise
                logger.warning(f"[{task.task_id}] エラー (試行 {attempt + 1}): {e}")
                await asyncio.sleep(self.retry_delay)
                continue

            if permanent_error is not None:
                raise permanent_error
            if is_last_attempt:
                # No attempt follows, so sleeping would only delay the failure.
                break
            if rate_limited:
                logger.warning(f"レート制限 - {wait_time}秒待機")
            # Sleep after the response context has closed so the pooled
            # connection is free for other tasks during the backoff.
            await asyncio.sleep(wait_time)

        raise RuntimeError(f"最大リトライ回数を超過: {task.task_id}")

    async def _process_task(
        self,
        session: aiohttp.ClientSession,
        task: InferenceTask
    ):
        """Process one task and record its outcome on the task object.

        Failures never propagate: the task ends as COMPLETED or FAILED with
        ``result`` or ``error`` filled in.
        """
        if self._shutdown:
            task.status = RequestStatus.FAILED
            task.error = "Shutdown in progress"
            return

        task.status = RequestStatus.IN_PROGRESS
        logger.info(f"[{task.task_id}] 処理開始")

        try:
            async with self._semaphore:
                # Re-check after waiting for a slot: shutdown may have started
                # while this task was queued behind the semaphore.
                if self._shutdown:
                    raise asyncio.CancelledError("Shutdown requested")
                result = await self._call_api(session, task)
                task.result = result
                task.status = RequestStatus.COMPLETED
                logger.info(f"[{task.task_id}] 完了")
        except asyncio.CancelledError:
            task.status = RequestStatus.FAILED
            task.error = "Cancelled during shutdown"
            logger.warning(f"[{task.task_id}] シャットダウンにより中断")
        except Exception as e:
            task.status = RequestStatus.FAILED
            task.error = str(e)
            logger.error(f"[{task.task_id}] 失敗: {e}")

    async def process_batch(
        self,
        prompts: List[str],
        graceful_timeout: float = 60.0
    ) -> List[InferenceTask]:
        """Process ``prompts`` concurrently and return their tasks.

        Args:
            prompts: Prompts to send, one task per prompt.
            graceful_timeout: Seconds to wait for the batch before the
                remaining tasks are cancelled.

        Returns:
            The tasks created for this call, in prompt order. An empty
            ``prompts`` list yields an empty list.
        """
        loop = asyncio.get_running_loop()

        # Keep this call's tasks in their own list. Iterating self._tasks
        # would re-run every task submitted by earlier calls.
        batch: List[InferenceTask] = []
        for i, prompt in enumerate(prompts):
            task = InferenceTask(
                task_id=f"task_{i}_{int(loop.time() * 1000)}",
                prompt=prompt
            )
            self._tasks[task.task_id] = task
            batch.append(task)

        # asyncio.wait() raises ValueError on an empty set.
        if not batch:
            return batch

        connector = aiohttp.TCPConnector(limit=self.max_concurrent)
        async with aiohttp.ClientSession(connector=connector) as session:
            # asyncio.wait() no longer accepts bare coroutines (Python 3.11+),
            # and only Task objects can be cancelled below.
            running = [
                asyncio.create_task(self._process_task(session, task))
                for task in batch
            ]

            _done, pending = await asyncio.wait(
                running,
                timeout=graceful_timeout,
                return_when=asyncio.ALL_COMPLETED
            )

            # Cancel whatever did not finish within the timeout.
            for unfinished in pending:
                unfinished.cancel()

            # Let the cancellations settle before the session closes.
            if pending:
                await asyncio.gather(*pending, return_exceptions=True)

        return batch

    async def shutdown(self, timeout: float = 60.0):
        """Reject new work, wait for in-flight tasks, then fail leftovers.

        Args:
            timeout: Maximum seconds to wait for IN_PROGRESS tasks.
        """
        logger.info("🔄 シャットダウン開始 - 新規リクエスト拒否...")
        self._shutdown = True

        # Return as soon as nothing is in flight instead of always sleeping
        # for the full timeout.
        loop = asyncio.get_running_loop()
        deadline = loop.time() + timeout
        while loop.time() < deadline and any(
            task.status == RequestStatus.IN_PROGRESS
            for task in self._tasks.values()
        ):
            await asyncio.sleep(0.1)

        # Mark anything still unfinished as failed.
        for task in self._tasks.values():
            if task.status == RequestStatus.IN_PROGRESS:
                task.status = RequestStatus.FAILED
                task.error = "Force shutdown"

        logger.info("✅ シャットダウン完了")

# 使用例
async def main():
    """Demo: process three prompts and shut down gracefully on SIGINT/SIGTERM.

    Signal handlers are installed on the running loop so a termination
    request starts ``processor.shutdown()`` while the batch keeps running.
    Queued tasks are rejected, in-flight ones finish, and the results are
    still printed.
    """
    import signal  # local import so this example does not depend on file-level imports

    processor = HolySheepBatchProcessor(
        api_key="YOUR_HOLYSHEEP_API_KEY",
        max_concurrent=3,
        max_retries=2
    )

    prompts = [
        "Pythonでリスト内の重複を削除方法は?",
        "FastAPIでミドルウェアの実装方法は?",
        "async/awaitのベストプラクティスは?"
    ]

    loop = asyncio.get_running_loop()
    shutdown_tasks = []

    def _request_shutdown():
        # Start shutdown once; repeated signals are ignored.
        if not shutdown_tasks:
            print("\n⚠️ 割り込み検出")
            shutdown_tasks.append(loop.create_task(processor.shutdown()))

    # `except KeyboardInterrupt` inside a coroutine is unreliable under
    # asyncio.run() (Ctrl-C arrives as task cancellation on Python 3.11+),
    # and SIGTERM never raises it at all. Loop-level handlers cover both.
    installed = []
    for sig in (signal.SIGINT, signal.SIGTERM):
        try:
            loop.add_signal_handler(sig, _request_shutdown)
        except (NotImplementedError, RuntimeError):
            # Unsupported on this platform or loop (e.g. Windows); the
            # KeyboardInterrupt fallback below remains in place.
            continue
        installed.append(sig)

    try:
        results = await processor.process_batch(prompts)
        for task in results:
            print(f"[{task.task_id}] {task.status.value}: {task.result or task.error}")
        # If a signal arrived, let shutdown() finish before returning.
        if shutdown_tasks:
            await shutdown_tasks[0]
    except KeyboardInterrupt:
        print("\n⚠️ 割り込み検出")
        await processor.shutdown()
    finally:
        for sig in installed:
            loop.remove_signal_handler(sig)

if __name__ == "__main__":
    # Run the async demo to completion on a new event loop.
    asyncio.run(main())

HolySheep AI活用のポイント

HolySheep AIは登録だけで無料クレジットがもらえるため、本戦略のテスト環境に最適です。GPT-4.1 ($8/MTok)やClaude Sonnet 4.5 ($15/MTok)を含む主要モデルが一つのエンドポイントから利用でき、DeepSeek V3.2 ($0.42/MTok)を選べばコストを95%削減できます。

よくあるエラーと対処法

ConnectionError: timeout - リクエスト30秒超過
原因: ネットワーク遅延またはHolySheep AI側の過負荷
対処法: timeoutパラメータを30.0→60.0に拡大し、max_retriesを3に設定。aiohttpではClientTimeout(total=60)を使用。
```
# Example fix: a longer client timeout plus SDK-level retries
client = OpenAI(
    api_key="YOUR_HOLYSHEEP_API_KEY",
    base_url="https://api.holysheep.ai/v1",
    timeout=60.0,  # raised to 60 seconds
    max_retries=3   # retry up to 3 times
)
```

401 Unauthorized - 認証エラー
原因: APIキーが無効または期限切れ
対処法: 環境変数からAPIキーを読み込み、有効性を検証。キーが空または不正な形式の場合は、クライアント生成前にValueErrorを送出して早期に失敗させる。

import os

# Read the key from the environment and fail fast when it is missing or
# does not start with the expected "sk-" prefix.
api_key = os.environ.get("HOLYSHEEP_API_KEY")
if not (api_key and api_key.startswith("sk-")):
    raise ValueError("無効なAPIキーです")

# Only a validated key ever reaches the client.
client = OpenAI(api_key=api_key, base_url="https://api.holysheep.ai/v1")

429 Too Many Requests - レート制限
原因: 短時間での大量リクエスト
対処法: asyncio.Semaphoreで同時実行数を制限（max_concurrent=5）。指数バックオフでリトライ。

async def _retry_with_backoff(func, max_retries=5):
    for i in range(max_retries):
        try:
            return await func()
        except aiohttp.ClientResponseError as e:
            if e.status == 429:
                wait = 2 ** i  # 指数バックオフ: 1s, 2s, 4s, 8s, 16s
                await asyncio.sleep(wait)
            else:
                raise
    raise RuntimeError("最大リトライ回数超過")

500 Internal Server Error - サーバー側エラー
原因: HolySheep AI側の内部障害
対処法: ステータスリトライ機構を実装。5xxエラーは自動的にリトライし、3回失敗した場合は代替エンドポイントにフェイルオーバー。
```
elif response.status >= 500:
    logger.warning(f"サーバーエラー 500+ (試行 {attempt + 1})")
    await asyncio.sleep(self.retry_delay * (attempt + 1))
    continue  # リトライ継続
```

Kubernetes環境でのDeployment設定

apiVersion: apps/v1
kind: Deployment
metadata:
  name: ai-inference-service
spec:
  replicas: 3
  # apps/v1 requires a selector, and it must match the pod template labels;
  # without both the API server rejects the manifest.
  selector:
    matchLabels:
      app: ai-inference-service
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 1
      maxUnavailable: 0  # Important: never drop below the desired replica count during a rollout
  template:
    metadata:
      labels:
        app: ai-inference-service
    spec:
      # Important: must exceed the preStop sleep plus the application's own
      # drain timeout, otherwise the pod is killed before draining finishes.
      terminationGracePeriodSeconds: 90
      containers:
      - name: inference
        image: your-image:latest
        lifecycle:
          preStop:
            exec:
              command:
              - /bin/sh
              - -c
              - "sleep 10"  # Give the load balancer time to stop routing to this pod before SIGTERM
        env:
        - name: HOLYSHEEP_API_KEY
          valueFrom:
            secretKeyRef:
              name: holysheep-credentials
              key: api-key
        resources:
          limits:
            memory: "512Mi"
            cpu: "500m"
          requests:
            memory: "256Mi"
            cpu: "250m"

まとめ

グレースフルシャットダウンは、AI推論システムの可用性を担保する上で不可欠な戦略です。筆者が本番環境に適用した結果、シャットダウン時のエラー発生時間を最大40秒から5秒以下に短縮できました。

HolySheep AIの<50msレイテンシと¥1=$1の両替レートを活かし、経済的かつ高性能なAIインフラを構築しましょう。

👉 HolySheep AI に登録して無料クレジットを獲得

問題の背景：突然のシャットダウンによる503エラー

基本的なグレースフルシャットダウン実装

HolySheep AI クライアント初期化

使用例

バッチ推論用のリトライ付き実装

使用例

HolySheep AI活用のポイント

よくあるエラーと対処法

Kubernetes環境でのDeployment設定

まとめ

関連リソース

🔥 HolySheep AIを使ってみる