AI API วิเคราะห์บันทึกการเรียกใช้: วิธีปรับปรุงการใช้ Token ลดค่าใช้จ่าย

ในโลกของการพัฒนาแอปพลิเคชันที่ใช้ AI API นั้น ค่าใช้จ่ายจาก Token เป็นสิ่งที่นักพัฒนาทุกคนต้องจัดการอย่างมีประสิทธิภาพ บทความนี้จะแบ่งปันประสบการณ์ตรงจากการใช้งานจริงในการวิเคราะห์ Log การเรียก API เพื่อหาจุดที่ควรปรับปรุง พร้อมโค้ดตัวอย่างที่สามารถนำไปใช้งานได้ทันที

ทำไมต้องวิเคราะห์ API Call Log

ในช่วงแรกที่เริ่มใช้ AI API หลายคนอาจไม่ได้สังเกตว่าค่าใช้จ่ายบานปลายได้อย่างไร โดยเฉพาะเมื่อระบบมีผู้ใช้งานหลายร้อยหรือหลายพันคน การติดตามและวิเคราะห์ Log การเรียกใช้จะช่วยให้เห็นพฤติกรรมการใช้งานที่แท้จริง และสามารถระบุจุดที่ใช้ Token เกินความจำเป็นได้

จากการทดสอบกับระบบ HolySheep AI ซึ่งมีอัตราค่าบริการที่ประหยัดกว่าผู้ให้บริการรายอื่นมากกว่า 85% (อัตรา ¥1=$1) เราพบว่าการวิเคราะห์ Log แล้วนำผลไปปรับปรุงการเรียกใช้ ช่วยลดค่าใช้จ่ายได้อย่างมีนัยสำคัญ

การติดตาม Token Consumption แบบเรียลไทม์

ขั้นตอนแรกคือการสร้างระบบบันทึก Log ที่ครอบคลุมทุกการเรียก API โดยต้องเก็บข้อมูลสำคัญ ได้แก่ timestamp, model ที่ใช้, จำนวน prompt tokens, completion tokens และ response time

import requests
import json
from datetime import datetime
from typing import Dict, List, Optional
import sqlite3

class AILogAnalyzer:
    """Call an AI chat-completions API and log every call to SQLite.

    Each call made through :meth:`call_api` is recorded in the ``api_calls``
    table (timestamp, model, token counts, latency, estimated cost, status and
    error message). :meth:`get_usage_summary` aggregates those rows per model.

    Timestamps are stored as naive local-time ISO-8601 strings produced by
    ``datetime.now().isoformat()``.
    """

    # Price in USD per 1,000,000 tokens for each known model. The same rate
    # is applied to prompt and completion tokens.
    _RATES_USD_PER_MILLION = {
        "gpt-4.1": 8.0,
        "claude-sonnet-4.5": 15.0,
        "gemini-2.5-flash": 2.5,
        "deepseek-v3.2": 0.42,
    }
    # Models missing from the table are priced at the gpt-4.1 rate.
    _DEFAULT_RATE_USD_PER_MILLION = 8.0

    def __init__(self, db_path: str = "api_logs.db",
                 api_key: str = "YOUR_HOLYSHEEP_API_KEY"):
        """Store connection settings and make sure the log table exists.

        Args:
            db_path: Path of the SQLite database file used for the log.
            api_key: Bearer token sent with every request. Defaults to the
                placeholder used throughout the article.
        """
        self.base_url = "https://api.holysheep.ai/v1"
        self.api_key = api_key
        self.db_path = db_path
        self._init_database()

    def _init_database(self):
        """Create the ``api_calls`` log table if it does not exist yet."""
        conn = sqlite3.connect(self.db_path)
        try:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS api_calls (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    timestamp TEXT,
                    model TEXT,
                    prompt_tokens INTEGER,
                    completion_tokens INTEGER,
                    total_tokens INTEGER,
                    latency_ms REAL,
                    cost_usd REAL,
                    status TEXT,
                    error_message TEXT
                )
            ''')
            conn.commit()
        finally:
            # Close even when the statement fails so the handle is not leaked.
            conn.close()

    def call_api(self, messages: List[Dict], model: str = "gpt-4.1") -> Dict:
        """Send a chat-completions request and log the outcome.

        Args:
            messages: Chat messages forwarded unchanged in the request body.
            model: Model name sent to the API and used for cost estimation.

        Returns:
            On HTTP 200: ``{"success": True, "data": <response JSON>,
            "log": <log entry>}``. On any other status or on an exception:
            ``{"success": False, "error": <text>, "log": <log entry>}``.
        """
        start_time = datetime.now()

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }

        payload = {
            "model": model,
            "messages": messages,
            "max_tokens": 1000
        }

        try:
            response = requests.post(
                f"{self.base_url}/chat/completions",
                headers=headers,
                json=payload,
                timeout=30
            )

            latency_ms = (datetime.now() - start_time).total_seconds() * 1000

            if response.status_code != 200:
                return self._handle_error(response, start_time, model)

            data = response.json()
            # "usage" may be absent or null; treat both as "no usage reported".
            usage = data.get("usage") or {}

            log_entry = {
                "timestamp": datetime.now().isoformat(),
                "model": model,
                "prompt_tokens": usage.get("prompt_tokens", 0),
                "completion_tokens": usage.get("completion_tokens", 0),
                "total_tokens": usage.get("total_tokens", 0),
                "latency_ms": latency_ms,
                "cost_usd": self._calculate_cost(model, usage),
                "status": "success",
                "error_message": None
            }

            self._save_log(log_entry)
            return {"success": True, "data": data, "log": log_entry}

        except Exception as e:
            # Boundary handler: every failure is logged and reported to the
            # caller as a result dict instead of propagating.
            return self._handle_exception(e, start_time, model)

    def _calculate_cost(self, model: str, usage: Dict) -> float:
        """Estimate the cost in USD of one call.

        Args:
            model: Model name; unknown models use the default rate.
            usage: Usage dict; only ``"total_tokens"`` is read (0 if missing).

        Returns:
            ``total_tokens / 1_000_000`` multiplied by the model's price per
            one million tokens.
        """
        rate = self._RATES_USD_PER_MILLION.get(
            model, self._DEFAULT_RATE_USD_PER_MILLION
        )
        total = usage.get("total_tokens", 0)
        return (total / 1_000_000) * rate

    def _save_log(self, log_entry: Dict):
        """Insert one log entry (a dict with the table's column names)."""
        conn = sqlite3.connect(self.db_path)
        try:
            conn.execute('''
                INSERT INTO api_calls
                (timestamp, model, prompt_tokens, completion_tokens, total_tokens,
                 latency_ms, cost_usd, status, error_message)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                log_entry["timestamp"], log_entry["model"],
                log_entry["prompt_tokens"], log_entry["completion_tokens"],
                log_entry["total_tokens"], log_entry["latency_ms"],
                log_entry["cost_usd"], log_entry["status"], log_entry["error_message"]
            ))
            conn.commit()
        finally:
            conn.close()

    def _failure_entry(self, start_time, model, status, error_message):
        """Build a zero-token, zero-cost log entry for a failed call."""
        latency_ms = (datetime.now() - start_time).total_seconds() * 1000
        return {
            "timestamp": datetime.now().isoformat(),
            "model": model,
            "prompt_tokens": 0, "completion_tokens": 0,
            "total_tokens": 0, "latency_ms": latency_ms,
            "cost_usd": 0, "status": status,
            "error_message": error_message
        }

    def _handle_error(self, response, start_time, model):
        """Log a non-200 HTTP response and return a failure result."""
        log_entry = self._failure_entry(
            start_time, model, "error",
            f"HTTP {response.status_code}: {response.text}"
        )
        self._save_log(log_entry)
        return {"success": False, "error": response.text, "log": log_entry}

    def _handle_exception(self, exception, start_time, model):
        """Log an exception raised during the call and return a failure result."""
        log_entry = self._failure_entry(
            start_time, model, "exception", str(exception)
        )
        self._save_log(log_entry)
        return {"success": False, "error": str(exception), "log": log_entry}

    def get_usage_summary(self, days: int = 7) -> Dict:
        """Summarize logged usage per model over the last ``days`` days.

        Args:
            days: Size of the look-back window, counted back from now.

        Returns:
            A dict keyed by model name; each value holds ``call_count``,
            ``total_prompt_tokens``, ``total_completion_tokens``,
            ``total_tokens``, ``total_cost_usd`` (rounded to 4 places) and
            ``avg_latency_ms`` (rounded to 2 places).
        """
        from datetime import timedelta

        # Build the cutoff the same way the stored timestamps are built
        # (naive local time, ISO format with a "T" separator) so the string
        # comparison in SQL is a valid chronological comparison. SQLite's
        # datetime('now', ...) would yield UTC with a space separator.
        cutoff = (datetime.now() - timedelta(days=days)).isoformat()

        conn = sqlite3.connect(self.db_path)
        try:
            rows = conn.execute('''
                SELECT
                    model,
                    COUNT(*) as call_count,
                    SUM(prompt_tokens) as total_prompt,
                    SUM(completion_tokens) as total_completion,
                    SUM(total_tokens) as total_tokens,
                    SUM(cost_usd) as total_cost,
                    AVG(latency_ms) as avg_latency
                FROM api_calls
                WHERE timestamp >= ?
                GROUP BY model
            ''', (cutoff,)).fetchall()
        finally:
            conn.close()

        summary = {}
        for model, calls, prompt, completion, total, cost, latency in rows:
            summary[model] = {
                "call_count": calls,
                "total_prompt_tokens": prompt,
                "total_completion_tokens": completion,
                "total_tokens": total,
                # SUM/AVG return NULL when every value in the group is NULL.
                "total_cost_usd": round(cost or 0, 4),
                "avg_latency_ms": round(latency or 0, 2)
            }

        return summary

ตัวอย่างการใช้งาน
# Example usage: create an analyzer with its default settings, send one chat
# message through it with the "gpt-4.1" model, and print the result dict
# returned by call_api.
analyzer = AILogAnalyzer()
messages = [{"role": "user", "content": "ทดสอบการวิเคราะห์ Log"}]
result = analyzer.call_api(messages, model="gpt-4.1")
print(result)

การตรวจจับ Token Waste Patterns

หลังจากเก็บ Log ได้ระยะหนึ่ง ขั้นตอนถัดไปคือการวิเคราะห์ Patterns ที่ทำให้เสีย Token โดยไม่จำเป็น ซึ่งมักพบในกรณีต่อไปนี้

import matplotlib.pyplot as plt
import pandas as pd
from collections import defaultdict

class TokenWasteDetector:
    """Find rows in the ``api_calls`` log that suggest wasted tokens.

    All queries run against the SQLite file named by ``analyzer.db_path``.
    """

    def __init__(self, analyzer: "AILogAnalyzer"):
        """Keep a reference to the analyzer whose database will be queried.

        Only ``analyzer.db_path`` is read by this class.
        """
        self.analyzer = analyzer

    def _read_frame(self, query: str, params=None) -> pd.DataFrame:
        """Run ``query`` and return the rows as a DataFrame.

        The connection is always closed, even when the query raises.
        """
        conn = sqlite3.connect(self.analyzer.db_path)
        try:
            return pd.read_sql_query(query, conn, params=params)
        finally:
            conn.close()

    def detect_large_prompts(self, threshold: int = 5000) -> pd.DataFrame:
        """Return up to 20 calls whose ``prompt_tokens`` exceed ``threshold``.

        Rows are ordered from the largest prompt downwards.
        """
        return self._read_frame('''
            SELECT * FROM api_calls 
            WHERE prompt_tokens > ?
            ORDER BY prompt_tokens DESC
            LIMIT 20
        ''', params=(threshold,))

    def detect_high_token_ratio(self, threshold: float = 0.3) -> pd.DataFrame:
        """Return calls with a high completion-to-prompt token ratio.

        The 20 rows with the highest ``completion_tokens / prompt_tokens``
        ratio are fetched (rows with zero prompt tokens are skipped), and
        those whose ratio is above ``threshold`` are returned with an extra
        ``ratio`` column.

        A high ratio may mean the model answers at excessive length, or that
        the prompt is too short to provide enough context.
        """
        df = self._read_frame('''
            SELECT *, 
                   CAST(completion_tokens AS FLOAT) / 
                   NULLIF(prompt_tokens, 0) as ratio
            FROM api_calls
            WHERE prompt_tokens > 0
            ORDER BY ratio DESC
            LIMIT 20
        ''')
        return df[df['ratio'] > threshold]

    def detect_slow_requests(self, threshold_ms: float = 5000) -> pd.DataFrame:
        """Return calls slower than ``threshold_ms``, slowest first."""
        return self._read_frame('''
            SELECT * FROM api_calls 
            WHERE latency_ms > ?
            ORDER BY latency_ms DESC
        ''', params=(threshold_ms,))

    def detect_failed_calls(self) -> pd.DataFrame:
        """Return every call whose status is not ``'success'``, newest first."""
        return self._read_frame('''
            SELECT * FROM api_calls 
            WHERE status != 'success'
            ORDER BY timestamp DESC
        ''')

    def detect_duplicate_requests(self) -> Dict:
        """Flag bursts of successful calls to the same model.

        This is a heuristic: successful calls are grouped by model and by
        calendar minute of their timestamp, and every group with more than
        one call is reported. Request content is not compared, and the whole
        cost of a flagged group (including its first call) is counted as
        "wasted".

        Returns:
            ``{"duplicate_groups": <number of groups>,
            "total_wasted_usd": <summed cost rounded to 4 places>,
            "details": <list of (model, count, avg_latency, wasted_cost)>}``.
        """
        conn = sqlite3.connect(self.analyzer.db_path)
        try:
            groups = conn.execute('''
                SELECT 
                    model,
                    COUNT(*) as count,
                    AVG(latency_ms) as avg_latency,
                    SUM(cost_usd) as wasted_cost
                FROM api_calls
                WHERE status = 'success'
                GROUP BY model, 
                         strftime('%Y-%m-%d %H:%M', timestamp)
                HAVING COUNT(*) > 1
            ''').fetchall()
        finally:
            conn.close()

        # SUM(cost_usd) is NULL when every cost in the group is NULL.
        total_wasted = sum(group[3] or 0 for group in groups)
        return {
            "duplicate_groups": len(groups),
            "total_wasted_usd": round(total_wasted, 4),
            "details": groups
        }

    def generate_optimization_report(self) -> str:
        """Run every detector and return the findings as a text report.

        Sections for large prompts, high ratios and failed calls are only
        included when the corresponding detector returns rows; the duplicate
        section is always included.
        """
        report = []
        report.append("=" * 50)
        report.append("รายงานการวิเคราะห์ Token Waste")
        report.append("=" * 50)

        # Oversized prompts
        large = self.detect_large_prompts()
        if not large.empty:
            report.append(f"\n⚠️ Prompt ใหญ่ผิดปกติ: {len(large)} รายการ")
            report.append(f"   รวม Token: {large['prompt_tokens'].sum():,}")

        # High completion/prompt ratio
        high_ratio = self.detect_high_token_ratio()
        if not high_ratio.empty:
            report.append(f"\n⚠️ Completion ยาวผิดปกติ: {len(high_ratio)} รายการ")

        # Failed calls
        failed = self.detect_failed_calls()
        if not failed.empty:
            total_cost_failed = failed['cost_usd'].sum()
            report.append(f"\n❌ การเรียกล้มเหลว: {len(failed)} รายการ")
            report.append(f"   ค่าใช้จ่ายที่เสียไป: ${total_cost_failed:.4f}")

        # Bursts of calls to the same model within one minute
        dup = self.detect_duplicate_requests()
        report.append(f"\n🔄 Request ซ้ำ: {dup['duplicate_groups']} กลุ่ม")
        report.append(f"   ค่าใช้จ่ายที่เสียไป: ${dup['total_wasted_usd']:.4f}")

        return "\n".join(report)

ตัวอย่างการใช้งาน
# Example usage: build a detector that reads the database of the `analyzer`
# object created earlier, then print its token-waste report.
detector = TokenWasteDetector(analyzer)
print(detector.generate_optimization_report())

กลยุทธ์การลด Token Consumption

จากการวิเคราะห์ Log หลายเดือน เราได้รวบรวมกลยุทธ์ที่ได้ผลดีในการลดการใช้ Token โดยยังคงคุณภาพของผลลัพธ์

ใช้ System Prompt ที่กระชับ: รวมคำสั่งที่จำเป็นให้เหลือน้อยที่สุด และใช้เทคนิค Chain of Thought เฉพาะจุดที่ต้องการ
บีบอัด Context: ใช้ฟอร์แมตที่กระชับ เช่น JSON Schema แทน Natural Language ในการส่งข้อมูลโครงสร้าง
ใช้ Model ที่เหมาะสม: งานง่ายใช้ DeepSeek V3.2 ($0.42/MTok) งานซับซ้อนค่อยใช้ GPT-4.1 ($8/MTok)
Implement Caching: เก็บ Response ที่เคยถามแล้ว ไม่ต้องเรียก API ซ้ำ
ใช้ Streaming Response: ช่วยให้ User เห็นผลลัพธ์เร็ว แม้จะไม่ลด Token แต่ช่วยลดการเรียกซ้ำ

ระบบ Smart Routing อัตโนมัติ

เมื่อเข้าใจพฤติกรรมการใช้งานแล้ว ขั้นตอนถัดไปคือการสร้างระบบ Routing อัตโนมัติที่เลือก Model ที่เหมาะสมกับประเภทคำถาม

import hashlib
import json
from functools import lru_cache

class SmartModelRouter:
    """Pick a model per request from keyword matches and cache responses.

    The last message of a conversation is classified into one of the task
    types in ``model_config``; that task's model, ``max_tokens`` and
    ``temperature`` are used for the request. Successful responses are kept
    in an unbounded in-memory dict keyed by model and full conversation.
    """

    # max_tokens used when a forced model is not listed in model_config.
    _DEFAULT_MAX_TOKENS = 1000

    def __init__(self, api_key: str):
        """Store credentials and set up the cache and routing table.

        Args:
            api_key: Bearer token sent with every request.
        """
        self.api_key = api_key
        self.base_url = "https://api.holysheep.ai/v1"
        self.cache = {}  # Simple in-memory cache: cache key -> response JSON
        self.cache_hits = 0  # Number of calls answered from the cache

        # Model and generation settings for each task type.
        self.model_config = {
            "simple_classification": {
                "model": "deepseek-v3.2",
                "max_tokens": 50,
                "temperature": 0.1,
                "keywords": ["จัดหมวดหมู่", "ถามตอบสั้น", "ใช่ไหม", "กี่โมง"]
            },
            "code_generation": {
                "model": "gemini-2.5-flash",
                "max_tokens": 2000,
                "temperature": 0.2,
                "keywords": ["เขียนโค้ด", "function", "class", "def "]
            },
            "complex_reasoning": {
                "model": "gpt-4.1",
                "max_tokens": 3000,
                "temperature": 0.3,
                "keywords": ["วิเคราะห์", "เปรียบเทียบ", "อธิบาย", "สรุป"]
            },
            "creative": {
                "model": "claude-sonnet-4.5",
                "max_tokens": 2500,
                "temperature": 0.7,
                "keywords": ["เขียนเรื่อง", "กลอน", "สร้างสรรค์", "บทกวี"]
            }
        }

    def _classify_task(self, message: str) -> str:
        """Return the task type whose keywords occur most often in ``message``.

        Matching is done on the lower-cased message. When no keyword matches
        at all, ``"complex_reasoning"`` is returned as the default.
        """
        message_lower = message.lower()

        scores = {
            task_type: sum(1 for kw in config["keywords"] if kw in message_lower)
            for task_type, config in self.model_config.items()
        }

        if max(scores.values()) == 0:
            return "complex_reasoning"  # Default to the most capable route

        return max(scores, key=scores.get)

    def _config_for_model(self, model: str) -> dict:
        """Return the task config that uses ``model``, or a default config."""
        for config in self.model_config.values():
            if config["model"] == model:
                return config
        return {"max_tokens": self._DEFAULT_MAX_TOKENS}

    def _get_cache_key(self, model: str, messages: list) -> str:
        """Build a cache key from the model and the whole conversation.

        Every message is hashed, not just the last one, so two different
        conversations that end with the same message get different keys.
        """
        content = json.dumps({
            "model": model,
            "messages": messages
        }, sort_keys=True)
        return hashlib.sha256(content.encode()).hexdigest()

    def call(self, messages: list, force_model: str = None) -> dict:
        """Send ``messages`` to the selected model, using the cache if possible.

        Args:
            messages: Chat messages; the ``"content"`` of the last one is
                used for task classification.
            force_model: When given, skip classification and use this model.

        Returns:
            ``{"cached": True, "data": ...}`` on a cache hit,
            ``{"cached": False, "data": ..., "model_used": ...}`` on HTTP 200,
            otherwise ``{"error": <response text>}``.
        """
        if force_model:
            selected_model = force_model
            config = self._config_for_model(force_model)
        else:
            # Classify the task from the most recent message.
            last_message = messages[-1]["content"] if messages else ""
            task_type = self._classify_task(last_message)
            config = self.model_config[task_type]
            selected_model = config["model"]

        # Serve from the cache when this exact conversation was seen before.
        cache_key = self._get_cache_key(selected_model, messages)
        if cache_key in self.cache:
            self.cache_hits += 1
            return {"cached": True, "data": self.cache[cache_key]}

        # Apply the generation settings configured for the chosen route.
        payload = {
            "model": selected_model,
            "messages": messages,
            "max_tokens": config["max_tokens"]
        }
        if "temperature" in config:
            payload["temperature"] = config["temperature"]

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }

        response = requests.post(
            f"{self.base_url}/chat/completions",
            headers=headers,
            json=payload,
            timeout=30  # without a timeout a stalled connection blocks forever
        )

        if response.status_code == 200:
            data = response.json()
            self.cache[cache_key] = data  # Remember the response
            return {"cached": False, "data": data, "model_used": selected_model}

        return {"error": response.text}

    def get_cost_savings_report(self) -> str:
        """Return a text report with a rough estimate of cache savings.

        The estimate assumes a flat $0.005 saved per cache hit; the 60-70%
        figure in the text is a fixed statement, not computed from data.
        """
        cache_hits = self.cache_hits
        # Rough estimate only.
        savings = cache_hits * 0.005

        return f"""
        รายงานการประหยัดค่าใช้จ่าย
        ========================
        Cache Hits: {cache_hits}
        ค่าประหยัดโดยประมาณ: ${savings:.4f}
        การใช้ Smart Routing ช่วยลดค่าใช้จ่ายได้ประมาณ 60-70%
        เมื่อเทียบกับการใช้ Model แพงสำหรับทุกงาน
        """

ตัวอย่างการใช้งาน
router = SmartModelRouter("YOUR_HOLYSHEEP_API_KEY")

# Simple task - should be routed to DeepSeek. The prompt contains the
# "กี่โมง" keyword, so it is classified as "simple_classification"; a prompt
# with no matching keyword would fall back to the "complex_reasoning" route.
result1 = router.call([
    {"role": "user", "content": "ตอนนี้กี่โมง?"}
])

# Code-generation task - should be routed to Gemini (matches the "function"
# keyword of the "code_generation" route).
result2 = router.call([
    {"role": "user", "content": "เขียน function คำนวณ BMI ให้หน่อย"}
])

print(router.get_cost_savings_report())

ข้อผิดพลาดที่พบบ่อยและวิธีแก้ไข

กรณีที่ 1: Response Timeout และการ Retry ซ้ำ

ปัญหา: เมื่อเกิด Timeout แล้วระบบ Retry ทันทีโดยไม่มีการตรวจสอบ จะเกิดการเรียกซ้ำหลายครั้ง และหาก Request แรกถูกประมวลผลสำเร็จที่ฝั่ง Server แล้ว แต่ Client Timeout ไปก่อนที่จะได้รับ Response ก็จะเสีย Token ของ Request นั้นไปโดยเปล่าประโยชน์

# ❌ Anti-pattern - retrying without any checks
def call_api_bad(messages):
    """Deliberately flawed example: retry up to three times with no safeguards.

    Posts ``payload`` to ``url`` with a 5-second timeout and returns the
    decoded JSON of the first response received, or ``None`` when all three
    attempts end in ``TimeoutError``. There is no delay between attempts and
    the HTTP status code is never inspected.

    NOTE(review): ``url`` and ``payload`` are not defined in this snippet and
    ``messages`` is never used — presumably they are meant to come from the
    surrounding context.
    NOTE(review): this catches the builtin ``TimeoutError``; ``requests``
    reports timeouts as ``requests.exceptions.Timeout`` — confirm whether
    this handler is ever reached.
    """
    for i in range(3):
        try:
            response = requests.post(url, json=payload, timeout=5)
            return response.json()
        except TimeoutError:
            continue  # retries immediately; wastes tokens if the first request had actually succeeded
    return None

✅ วิธีที่ถูก - ใช้ Exponential Backoff และตรวจสอบ
def call_api_with_retry(messages, max_retries=3, *, model="gpt-4.1",
                        api_key="YOUR_HOLYSHEEP_API_KEY",
                        base_url="https://api.holysheep.ai/v1",
                        max_tokens=1000, timeout=30):
    """Call the chat-completions endpoint, retrying only transient failures.

    Server errors (500/502/503/504) and client-side timeouts are retried with
    exponential backoff; any other HTTP status or exception is returned
    immediately without a retry.

    Args:
        messages: Chat messages sent in the request body.
        max_retries: Maximum number of attempts.
        model: Model name sent in the request body.
        api_key: Bearer token for the ``Authorization`` header.
        base_url: API base URL; ``/chat/completions`` is appended.
        max_tokens: Completion token limit sent in the request body.
        timeout: Per-request timeout in seconds.

    Returns:
        ``{"success": True, "data": <response JSON>}`` on HTTP 200, otherwise
        ``{"success": False, "error": <text>}``. When every attempt fails
        with a retryable error the error text is ``"Max retries exceeded"``.
    """
    # Imported here because the snippet uses them without a file-level import.
    import random
    import time

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": model,
        "messages": messages,
        "max_tokens": max_tokens
    }

    def _make_request():
        return requests.post(
            f"{base_url}/chat/completions",
            headers=headers,
            json=payload,
            timeout=timeout
        )

    def _wait_before_retry(attempt, jitter=0.0):
        # Sleeping after the final attempt would only delay the failure result.
        if attempt < max_retries - 1:
            time.sleep(2 ** attempt + jitter)  # 1, 2, 4 ... seconds

    for attempt in range(max_retries):
        try:
            response = _make_request()

            # Success
            if response.status_code == 200:
                return {"success": True, "data": response.json()}

            # Server error - safe to retry
            if response.status_code in (500, 502, 503, 504):
                _wait_before_retry(attempt)
                continue

            # Client error - retrying will not help
            return {"success": False, "error": response.text}

        except requests.exceptions.Timeout:
            # NOTE(review): the server may still have processed (and billed)
            # the timed-out request; a retry can therefore pay twice. Jitter
            # spreads out retries from concurrent clients.
            _wait_before_retry(attempt, jitter=random.uniform(0, 1))
            continue

        except Exception as e:
            return {"success": False, "error": str(e)}

    return {"success": False, "error": "Max retries exceeded"}

กรณีที่ 2: Context ซ้ำซ้อนใน Multi-turn Conversation

ปัญหา: ในระบบ Chat ที่มีหลาย Turn มักเกิดการส่ง History ทั้งหมดไปทุกครั้ง ทำให้จำนวน Prompt Token เพิ่มขึ้นเรื่อย ๆ ตามจำนวน Turn

AI API วิเคราะห์บันทึกการเรียกใช้: วิธีปรับปรุงการใช้ Token ลดค่าใช้จ่าย

ทำไมต้องวิเคราะห์ API Call Log

การติดตาม Token Consumption แบบเรียลไทม์

ตัวอย่างการใช้งาน

การตรวจจับ Token Waste Patterns

ตัวอย่างการใช้งาน

กลยุทธ์การลด Token Consumption

ระบบ Smart Routing อัตโนมัติ

ตัวอย่างการใช้งาน

งานง่าย - ควรใช้ DeepSeek

งานเขียนโค้ด - ควรใช้ Gemini

ข้อผิดพลาดที่พบบ่อยและวิธีแก้ไข

กรณีที่ 1: Response Timeout และการ Retry ซ้ำ

✅ วิธีที่ถูก - ใช้ Exponential Backoff และตรวจสอบ

กรณีที่ 2: Context ซ้ำซ้อนใน Multi-turn Conversation

แหล่งข้อมูลที่เกี่ยวข้อง

บทความที่เกี่ยวข้อง

ทำไมต้องวิเคราะห์ API Call Log

การติดตาม Token Consumption แบบเรียลไทม์

ตัวอย่างการใช้งาน

การตรวจจับ Token Waste Patterns

ตัวอย่างการใช้งาน

กลยุทธ์การลด Token Consumption

ระบบ Smart Routing อัตโนมัติ

ตัวอย่างการใช้งาน

งานง่าย - ควรใช้ DeepSeek

งานเขียนโค้ด - ควรใช้ Gemini

ข้อผิดพลาดที่พบบ่อยและวิธีแก้ไข

กรณีที่ 1: Response Timeout และการ Retry ซ้ำ

✅ วิธีที่ถูก - ใช้ Exponential Backoff และตรวจสอบ

กรณีที่ 2: Context ซ้ำซ้อนใน Multi-turn Conversation

แหล่งข้อมูลที่เกี่ยวข้อง

บทความที่เกี่ยวข้อง

🔥 ลอง HolySheep AI