ReAct Agent ใน Production: 4 บทเรียนสำคัญจาก Demo สู่บริการที่เสถียร

ในฐานะที่ผมเคยพัฒนา AI Agent มาหลายตัว ต้องบอกว่า ReAct (Reasoning + Acting) เป็น pattern ที่ดูน่าทึ่งใน demo แต่พอเอาเข้าจริง... บทความนี้จะเล่าประสบการณ์ตรงจากการ deploy ReAct Agent ให้กับระบบ E-commerce ที่รับ traffic หลักหมื่นคนต่อวัน พร้อมวิธีแก้ปัญหาที่ได้ลองผิดลองถูกมาแล้ว

ทำไม ReAct Agent ถึง "พัง" ใน Production

ReAct Agent ทำงานโดยการ loop ระหว่าง reasoning และ acting ซึ่งในสภาพแวดล้อมจริงมีปัญหาหลายอย่างที่ demo ไม่เคยเจอ:

Latency สะสม: แต่ละ step ต้อง call LLM 1 ครั้ง ถ้า 10 steps = 10 calls = รอนานเกินไป
Token explosion: history ของ reasoning ถูกส่งไปทุกครั้ง ทำให้ cost พุ่งกระฉูด
Tool failure cascade: tool ตัวหนึ่งพัง = ทั้ง chain พัง
Non-deterministic output: LLM output ไม่ตรง format = crash ทันที

กรณีศึกษา: ระบบ Customer Service AI ของ E-commerce

เราเคย deploy ReAct Agent สำหรับตอบคำถามลูกค้าเกี่ยวกับ:

สถานะออเดอร์
นโยบายการคืนสินค้า
โปรโมชั่นปัจจุบัน
คำแนะนำสินค้า

ใน demo ระบบทำงานได้ดีมาก แต่พอเปิดให้ลูกค้าจริง 200+ concurrent users... เซิร์ฟเวอร์ล่ม ค่าใช้จ่ายพุ่ง 300% ในเดือนแรก และมี incident ที่ agent "หลุดโลก" ตอบคำถามลูกค้าผิดหลายสิบครั้ง

บทเรียนที่ 1: ต้องมี Max Steps และ Timeout

ปัญหาแรกที่เจอคือ agent ติด loop ไม่รู้จบ บางทีถามเรื่องสถานะออเดอร์แล้ว agent พยายาม verify ซ้ำแล้วซ้ำเล่า ใช้เวลา 45 วินาทีต่อ 1 คำถาม

import time
from typing import Literal

class ReActAgentWithGuardrails:
    """ReAct loop bounded by a step budget and wall-clock limits.

    Each iteration asks ``self._call_llm`` for the next decision and, unless
    that decision is a final answer, runs the requested tool through
    ``self._execute_tool``. Neither helper is defined in this class body; they
    must be supplied elsewhere.
    """

    def __init__(
        self,
        client,
        model: str = "gpt-4.1",
        max_steps: int = 5,
        step_timeout: float = 10.0,
        total_timeout: float = 30.0
    ):
        # Limits that bound a single run().
        self.max_steps = max_steps
        self.step_timeout = step_timeout
        self.total_timeout = total_timeout
        # LLM access.
        self.client = client
        self.model = model
        # Set at the beginning of every run(); None until the first run.
        self.start_time = None

    @staticmethod
    def _outcome(status: str, answer, steps_used: int) -> dict:
        """Build the result dict returned by run()."""
        return {"status": status, "answer": answer, "steps_used": steps_used}

    def run(self, query: str, tools: list) -> dict:
        """Answer ``query`` within the configured step and time limits.

        Returns:
            A dict with ``status`` (``"success"``, ``"timeout"``,
            ``"step_timeout"`` or ``"max_steps_reached"``), ``answer`` and
            ``steps_used`` (number of completed tool steps).
        """
        self.start_time = time.time()
        messages = [{"role": "user", "content": query}]
        steps_taken = 0
        answer = None

        while steps_taken < self.max_steps:
            # The overall deadline is checked once per iteration, before the LLM call.
            elapsed = time.time() - self.start_time
            if elapsed > self.total_timeout:
                return self._outcome(
                    "timeout",
                    "ขออภัย คำถามของคุณใช้เวลานานเกินไป กรุณาถามใหม่อย่างกระชับขึ้น",
                    steps_taken,
                )

            try:
                decision = self._call_llm(messages, tools, steps_taken)
            except TimeoutError:
                return self._outcome(
                    "step_timeout",
                    "ระบบกำลังพยายามประมวลผล กรุณารอสักครู่",
                    steps_taken,
                )

            if decision["type"] == "answer":
                answer = decision["content"]
                break

            # Not an answer: run the requested tool with the per-step timeout.
            observation = self._execute_tool(
                decision["tool"],
                decision["args"],
                timeout=self.step_timeout,
            )

            # Record the assistant turn, then the tool result that answers it.
            assistant_message = decision["message"]
            messages.append(assistant_message)
            messages.append({
                "role": "tool",
                "tool_call_id": assistant_message["tool_calls"][0]["id"],
                "content": observation,
            })
            steps_taken += 1

        # A falsy answer (None or empty) is reported the same way as an
        # exhausted step budget.
        if answer:
            return self._outcome("success", answer, steps_taken)
        return self._outcome("max_steps_reached", "ไม่สามารถตอบคำถามนี้ได้", steps_taken)

ผลลัพธ์: เวลาตอบสนองเฉลี่ยลดจาก 45 วินาทีเหลือ 3.2 วินาที, ไม่มี request ที่ hang อีกเลย

บทเรียนที่ 2: Tool Output Parsing ต้อง Robust

ReAct Agent พึ่งพา LLM output เป็นหลัก ปัญหาคือ LLM บางครั้ง output JSON ผิด format, ลืม quote, หรือหลุด newline ตรงกลาง

import json
import re
from typing import Optional

class ToolOutputParser:
    """Best-effort parser for JSON objects produced by an LLM or a tool.

    Strategies are tried from least to most invasive so that input which is
    already valid JSON is never rewritten by the heuristic repairs:

    1. parse the text exactly as given;
    2. parse the contents of a fenced Markdown code block;
    3. parse after regex-based repairs (single quotes, trailing commas,
       unquoted keys);
    4. scrape flat ``"key": value`` pairs out of the outermost ``{...}``.
    """

    def __init__(self, required_fields: list[str]):
        # Keys that must be present for a parsed object to be accepted.
        self.required_fields = required_fields

    def parse(self, llm_output: str) -> Optional[dict]:
        """Return the parsed object, or ``None`` if nothing usable is found.

        Args:
            llm_output: Raw text to parse. A ``dict`` is validated as-is and
                any other non-string value yields ``None``.

        Returns:
            A dict containing every required field, or ``None``.
        """
        if isinstance(llm_output, dict):
            return self._validate(llm_output)
        if not isinstance(llm_output, str):
            return None

        # Strategy 1: untouched input. The regex repairs below can corrupt
        # valid JSON (e.g. a string value containing ", word:"), so they must
        # only run after a plain parse has failed.
        try:
            return self._validate(json.loads(llm_output))
        except json.JSONDecodeError:
            pass

        # Strategy 2: fenced code block, searched in the RAW text because
        # _preprocess() strips the fences.
        fenced = re.search(r'```(?:json)?\s*(\{[\s\S]*?\})\s*```', llm_output)
        if fenced:
            try:
                return self._validate(json.loads(fenced.group(1)))
            except json.JSONDecodeError:
                pass

        # Strategy 3: repair common mistakes, then parse.
        cleaned = self._preprocess(llm_output)
        try:
            return self._validate(json.loads(cleaned))
        except json.JSONDecodeError:
            pass

        # Strategy 4: last resort.
        try:
            return self._validate(self._aggressive_parse(cleaned))
        except ValueError:
            return None

    def _preprocess(self, text: str) -> str:
        """Apply heuristic repairs for common LLM JSON mistakes.

        These substitutions are not string-aware, which is why parse() only
        uses them after a strict parse has failed.
        """
        # Remove markdown code fences
        text = re.sub(r'```json\s*', '', text)
        text = re.sub(r'```\s*', '', text)

        # Single-quoted keys and values -> double-quoted
        text = re.sub(r"(\{|,)\s*'([^']+)'\s*:", r'\1 "\2":', text)
        text = re.sub(r":\s*'([^']*)'\s*(,|\})", r': "\1"\2', text)

        # Trailing commas before a closing brace/bracket
        text = re.sub(r',(\s*[}\]])', r'\1', text)

        # Unquoted keys
        text = re.sub(r'([{,])\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1 "\2":', text)

        return text.strip()

    def _aggressive_parse(self, text: str) -> dict:
        """Extract what can be salvaged from the outermost ``{...}`` span.

        Raises:
            ValueError: if the text contains no ``{...}`` span.
        """
        start = text.find('{')
        end = text.rfind('}') + 1
        if start == -1 or end <= start:
            raise ValueError("No JSON structure found")

        json_str = text[start:end]

        # The span may be valid JSON surrounded by chatter; a real parse keeps
        # nested structures that the regex scrape below would flatten.
        try:
            candidate = json.loads(json_str)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate

        # Scrape flat key/value pairs: strings, numbers, true/false/null.
        literals = {'true': True, 'false': False, 'null': None}
        result = {}
        pairs = re.findall(
            r'"([^"]+)"\s*:\s*("([^"]*)"|(-?\d+\.?\d*)|(\w+))', json_str
        )
        for key, full_val, str_val, num_val, word in pairs:
            if full_val.startswith('"'):
                # Test the quoted form, not str_val, so "" is kept.
                result[key] = str_val
            elif num_val:
                result[key] = float(num_val) if '.' in num_val else int(num_val)
            elif word in literals:
                result[key] = literals[word]

        return result

    def _validate(self, result: dict) -> Optional[dict]:
        """Return ``result`` if it is a dict with all required fields, else ``None``."""
        if not isinstance(result, dict):
            return None
        if any(field not in result for field in self.required_fields):
            return None
        return result


# Usage with error handling
def execute_tool_safely(tool_name: str, args: dict, parser: ToolOutputParser):
    """Run a tool and parse its output, reporting failures as data.

    Calls ``call_tool(tool_name, args)`` and feeds the raw output to
    ``parser.parse``. No exception derived from ``Exception`` escapes; every
    outcome is returned as a dict whose ``success`` key tells the caller
    whether ``data`` is available.

    Returns:
        ``{"success": True, "data": ...}`` on success, otherwise a dict with
        ``success`` False and an ``error`` code of ``"parse_failed"``,
        ``"json_decode_error"`` or ``"unknown"``.
    """
    try:
        tool_output = call_tool(tool_name, args)
        payload = parser.parse(tool_output)

        if payload is not None:
            return {"success": True, "data": payload}

        # The parser gave up: keep only a bounded prefix of the raw output.
        return {
            "success": False,
            "error": "parse_failed",
            "raw_output": tool_output[:500],
        }

    except json.JSONDecodeError as decode_error:
        return {
            "success": False,
            "error": "json_decode_error",
            "detail": str(decode_error),
            "position": decode_error.pos,
        }
    except Exception as unexpected:
        return {
            "success": False,
            "error": "unknown",
            "detail": str(unexpected),
        }

ผลลัพธ์: Parse error rate ลดจาก 8% เหลือ 0.3%, ไม่มี crash จาก malformed JSON อีกเลย

บทเรียนที่ 3: Cost Optimization ด้วย Strategic Context

ปัญหาใหญ่ที่สุดคือ cost และ latency เพราะ ReAct ส่ง full conversation history ไปทุก step

import tiktoken
from dataclasses import dataclass
from typing import Optional

@dataclass
class ContextWindow:
    """Token-budgeted message builder for a ReAct agent.

    Builds a compact message list (system prompt, a few recent history
    messages, a short summary of recent tool results, the current query)
    instead of resending the full conversation on every step.
    """

    model: str  # model name used to select the tokenizer
    max_tokens: int  # total token budget for the request
    reserved_tokens: int = 500  # For response

    def __post_init__(self):
        try:
            self.encoding = tiktoken.encoding_for_model(self.model)
        except Exception:
            # Tokenizer lookup failed for this model name: fall back to a
            # generic encoding. Only Exception is caught, so KeyboardInterrupt
            # and SystemExit are no longer swallowed.
            self.encoding = tiktoken.get_encoding("cl100k_base")

    def count_tokens(self, text: str) -> int:
        """Return the number of tokens ``text`` encodes to."""
        return len(self.encoding.encode(text))

    def build_messages(
        self,
        system_prompt: str,
        conversation_history: list,
        recent_tool_results: list,
        current_query: str,
        max_history_turns: int = 3
    ) -> list:
        """Assemble the message list for one LLM call.

        Args:
            system_prompt: Full system prompt; compacted to at most a third of
                the usable budget.
            conversation_history: Prior messages, oldest first.
            recent_tool_results: Tool results to summarize (last three used).
            current_query: The user's current question.
            max_history_turns: Maximum number of history messages to include.

        Returns:
            Messages in order: system prompt, selected history (chronological),
            optional tool-context system message, current user query.
        """
        budget = self.max_tokens - self.reserved_tokens
        system_compact = self._compact_prompt(system_prompt, budget // 3)
        messages = [{"role": "system", "content": system_compact}]

        # Charge the parts that are always sent before admitting history, so
        # the history/tool budget reflects what is actually left.
        available = (
            budget
            - self.count_tokens(system_compact)
            - self.count_tokens(current_query)
        )

        # Walk history newest-first; inserting at index 1 each time restores
        # chronological order in the final list.
        turns_used = 0
        for msg in reversed(conversation_history):
            if turns_used >= max_history_turns:
                break
            msg_tokens = self.count_tokens(str(msg))
            # Keep at least 200 tokens of headroom after adding this message.
            if available - msg_tokens < 200:
                break
            messages.insert(1, msg)
            available -= msg_tokens
            turns_used += 1

        # Recent tool results, summarized within a third of what remains.
        tool_context = self._summarize_tool_results(recent_tool_results, available // 3)
        if tool_context:
            messages.append({
                "role": "system",
                "content": f"[Context from recent actions]\n{tool_context}"
            })

        messages.append({"role": "user", "content": current_query})

        return messages

    def _compact_prompt(self, prompt: str, max_tokens: int) -> str:
        """Return ``prompt`` cut to whole leading lines that fit ``max_tokens``.

        The prompt is returned unchanged when it already fits.
        """
        if self.count_tokens(prompt) <= max_tokens:
            return prompt

        kept_lines = []
        tokens_so_far = 0

        for line in prompt.split('\n'):
            line_tokens = self.count_tokens(line)
            if tokens_so_far + line_tokens > max_tokens:
                break
            kept_lines.append(line)
            tokens_so_far += line_tokens

        return '\n'.join(kept_lines)

    def _summarize_tool_results(
        self,
        results: list,
        max_tokens: int
    ) -> str:
        """Render up to the last three results as short bullet lines.

        Dicts with a ``status`` key become ``- <type>: <status>``; dicts with a
        non-None ``content`` become a 200-character preview; other dicts are
        skipped; non-dict values are stringified and truncated. Stops once
        ``max_tokens`` would be exceeded.
        """
        if not results:
            return ""

        summary_parts = []
        tokens_used = 0

        for result in results[-3:]:  # Max 3 recent results
            if isinstance(result, dict):
                if "status" in result:
                    key_info = f"- {result.get('type', 'unknown')}: {result.get('status')}"
                elif result.get("content") is not None:
                    # content may be a non-string value; stringify before
                    # slicing instead of raising TypeError.
                    content = str(result["content"])[:200]
                    key_info = f"- Result: {content}..."
                else:
                    continue
            else:
                key_info = f"- {str(result)[:200]}"

            part_tokens = self.count_tokens(key_info)
            if tokens_used + part_tokens > max_tokens:
                break

            summary_parts.append(key_info)
            tokens_used += part_tokens

        return '\n'.join(summary_parts)


# Real implementation with HolySheep API
def create_optimized_react_agent(api_key: str):
    """Build an agent callable that sends a token-budgeted context per request.

    An OpenAI-compatible client is pointed at the HolySheep endpoint and a
    ``ContextWindow`` is created once; both are captured by the returned
    closure.

    Returns:
        A function ``agent(query, tools, history)`` that builds the messages
        and returns the result of ``client.chat.completions.create``.
    """
    from openai import OpenAI

    # Single source for the model name used by both tokenizer and request.
    model_name = "deepseek-v3.2"

    llm = OpenAI(
        api_key=api_key,
        base_url="https://api.holysheep.ai/v1",  # HolySheep endpoint
    )
    window = ContextWindow(
        model=model_name,
        max_tokens=32000,
        reserved_tokens=1000,
    )

    def agent(query: str, tools: list, history: list) -> dict:
        # Only the tail of the history is considered: the last six entries as
        # conversation, the last two as tool results.
        prompt_messages = window.build_messages(
            system_prompt=get_system_prompt(),
            conversation_history=history[-6:],
            recent_tool_results=history[-2:],
            current_query=query,
        )

        # Low temperature for consistent tool-calling output.
        return llm.chat.completions.create(
            model=model_name,
            messages=prompt_messages,
            tools=tools,
            temperature=0.1,
        )

    return agent

ผลลัพธ์: Token usage ลดลง 60%, Cost ลดจาก $2.40/1000 requests เหลือ $0.35/1000 requests

บทเรียนที่ 4: Circuit Breaker และ Fallback Strategy

ไม่มีระบบ AI ไหน perfect ดังนั้นต้องเตรียม fallback ไว้เสมอ

import time
from functools import wraps
from typing import Callable, Any
import logging

logger = logging.getLogger(__name__)

class CircuitBreaker:
    """Circuit breaker guarding calls to an unreliable dependency.

    States:
        * ``"closed"``: calls pass through; consecutive failures are counted.
        * ``"open"``: calls are rejected with ``CircuitBreakerOpenError``
          until ``recovery_timeout`` seconds have passed since the last
          failure.
        * ``"half-open"``: one trial call is let through; success closes the
          circuit, failure re-opens it.
    """

    def __init__(
        self,
        failure_threshold: int = 3,
        recovery_timeout: float = 30.0,
        expected_exception: type = Exception
    ):
        """
        Args:
            failure_threshold: Consecutive failures that open the circuit.
            recovery_timeout: Seconds to stay open before a trial call.
            expected_exception: Exception type counted as a failure; other
                exceptions propagate without touching the breaker state.
        """
        self.failure_threshold = failure_threshold
        self.recovery_timeout = recovery_timeout
        self.expected_exception = expected_exception
        self.failures = 0
        self.last_failure_time = None
        self.state = "closed"  # closed, open, half-open

    def call(self, func: Callable, *args, **kwargs) -> Any:
        """Invoke ``func(*args, **kwargs)`` through the breaker.

        Returns:
            Whatever ``func`` returns.

        Raises:
            CircuitBreakerOpenError: if the circuit is open and the recovery
                timeout has not elapsed.
            Exception: any exception raised by ``func`` is re-raised.
        """
        if self.state == "open":
            if time.time() - self.last_failure_time > self.recovery_timeout:
                self.state = "half-open"
                logger.info("Circuit breaker: switching to half-open")
            else:
                raise CircuitBreakerOpenError("Circuit breaker is open")

        try:
            result = func(*args, **kwargs)
        except self.expected_exception:
            self._record_failure()
            raise

        self._record_success()
        return result

    def _record_success(self) -> None:
        """Close a half-open circuit and clear the consecutive-failure count."""
        if self.state == "half-open":
            self.state = "closed"
            logger.info("Circuit breaker: recovered to closed")
        # Reset on every success so that only CONSECUTIVE failures can reach
        # the threshold; otherwise isolated failures spread over a long
        # period would eventually open the circuit.
        self.failures = 0

    def _record_failure(self) -> None:
        """Count a failure and open the circuit when warranted."""
        self.failures += 1
        self.last_failure_time = time.time()

        # A failed trial call in half-open state re-opens immediately.
        if self.state == "half-open" or self.failures >= self.failure_threshold:
            self.state = "open"
            logger.warning(
                "Circuit breaker: opened after %s failures", self.failures
            )

    def reset(self):
        """Force the breaker back to its initial closed state."""
        self.failures = 0
        self.state = "closed"
        self.last_failure_time = None


class CircuitBreakerOpenError(Exception):
    """Signals that a call was rejected because a circuit breaker is open."""


class ReActAgentWithFallback:
    """ReAct agent wrapped with circuit breakers, retries and canned fallbacks.

    The LLM loop runs behind ``llm_circuit`` and each tool execution behind
    ``tool_circuit``. When retries are exhausted or a breaker is open, a
    pre-written answer for the request's intent is returned instead.
    """

    def __init__(self, api_key: str):
        from openai import OpenAI

        self.client = OpenAI(
            api_key=api_key,
            base_url="https://api.holysheep.ai/v1"
        )

        # Separate breakers so a failing tool does not share a failure count
        # with the LLM endpoint.
        self.llm_circuit = CircuitBreaker(failure_threshold=5)
        self.tool_circuit = CircuitBreaker(failure_threshold=3)

        # Canned answers keyed by intent; "general" is the default.
        self.fallback_responses = {
            "order_status": "ขออภัยค่ะ ระบบตรวจสอบออเดอร์ไม่สามารถเข้าถึงได้ในขณะนี้ กรุณาลองใหม่ในอีกสักครู่ หรือติดต่อแผนกบริการลูกค้าโดยตรงที่หมายเลข 02-xxx-xxxx",
            "product_recommendation": "ขออภัยค่ะ ไม่สามารถแนะนำสินค้าได้ในขณะนี้ กรุณาเลือกชมสินค้าในหมวดหมู่ที่สนใจได้เลยค่ะ",
            "general": "ขออภัยค่ะ ระบบ AI กำลังมีปัญหา กรุณาถามใหม่อีกครั้ง หรือติดต่อเจ้าหน้าที่โดยตรง"
        }

    def _fallback_answer(self, intent: str) -> str:
        """Return the canned answer for ``intent`` (default: "general")."""
        return self.fallback_responses.get(
            intent,
            self.fallback_responses["general"]
        )

    def run_with_fallback(
        self,
        query: str,
        intent: str,
        tools: list,
        max_retries: int = 2
    ) -> dict:
        """Run the agent, retrying with backoff and degrading to a fallback.

        Args:
            query: The user's question.
            intent: Key used to select the fallback answer.
            tools: Tool definitions forwarded to the LLM.
            max_retries: Total number of attempts; values below 1 are treated
                as a single attempt so the agent is always tried at least once.

        Returns:
            A dict with ``status`` of ``"success"``, ``"fallback"`` (all
            attempts failed) or ``"circuit_breaker"`` (a breaker rejected the
            call), and the corresponding ``data``.
        """
        attempts = max(1, max_retries)

        for attempt in range(attempts):
            try:
                result = self.llm_circuit.call(
                    self._run_react,
                    query,
                    tools
                )
                return {"status": "success", "data": result}

            except CircuitBreakerOpenError:
                # No point retrying while a breaker is rejecting calls.
                logger.warning("LLM circuit breaker is open")
                break

            except Exception as e:
                logger.error("Attempt %s failed: %s", attempt + 1, e)

                if attempt < attempts - 1:
                    # Exponential backoff: 1s, 2s, 4s, ...
                    time.sleep((2 ** attempt) * 1.0)
                    continue

                logger.warning(
                    "All retries exhausted, using fallback for intent: %s", intent
                )
                return {
                    "status": "fallback",
                    "data": {
                        "answer": self._fallback_answer(intent),
                        "fallback_used": True,
                        "original_error": str(e)
                    }
                }

        # Reached only via the CircuitBreakerOpenError branch above.
        return {
            "status": "circuit_breaker",
            "data": {
                "answer": self._fallback_answer(intent),
                "circuit_breaker_triggered": True
            }
        }

    def _run_react(self, query: str, tools: list) -> dict:
        """Core ReAct loop: at most five LLM calls, executing requested tools.

        Returns:
            ``{"answer": <text>}`` once the model stops requesting tools.

        Raises:
            RuntimeError: if no final answer is produced within five steps.
        """
        messages = [{"role": "user", "content": query}]

        for _ in range(5):
            response = self.client.chat.completions.create(
                model="gpt-4.1",
                messages=messages,
                tools=tools,
                temperature=0.1
            )
            choice = response.choices[0]

            if choice.finish_reason != "tool_calls":
                # Final answer
                return {"answer": choice.message.content}

            # Record the assistant turn exactly ONCE, then one tool message
            # per requested call. Appending it inside the loop duplicated the
            # assistant message whenever several tools were requested.
            messages.append(choice.message)
            for tool_call in choice.message.tool_calls:
                tool_result = self.tool_circuit.call(
                    execute_tool,
                    tool_call.function.name,
                    json.loads(tool_call.function.arguments)
                )
                messages.append({
                    "role": "tool",
                    "tool_call_id": tool_call.id,
                    "content": str(tool_result)
                })

        raise RuntimeError("Max steps reached")

ข้อผิดพลาดที่พบบ่อยและวิธีแก้ไข

1. JSON Output จาก LLM พิมพ์ผิด Format

อาการ: LLM return ข้อความที่ไม่ใช่ JSON สมบูรณ์ เช่น ลืม closing brace หรือใช้ single quote

# ❌ Anti-pattern: parsing the raw LLM reply without any safeguard
response = client.chat.completions.create(...)
result = json.loads(response.choices[0].message.content)  # Crash! json.loads raises JSONDecodeError on malformed JSON

# ✅ วิธีที่ถูกต้อง
def safe_json_parse(text: str) -> dict:
    """Parse JSON from LLM output, repairing it only as much as needed.

    Candidates are tried in order of increasing invasiveness so that valid
    JSON is never damaged by a repair it did not need (a blanket ``'`` to
    ``"`` replacement, for instance, breaks any string containing an
    apostrophe):

    1. the stripped text as-is;
    2. the text with a surrounding Markdown code fence removed;
    3. additionally with trailing commas removed;
    4. additionally with single quotes replaced by double quotes.

    If every candidate fails, the most-repaired text is handed to
    ``extract_with_regex`` (defined elsewhere).

    Returns:
        The decoded JSON value, or whatever ``extract_with_regex`` returns.
    """
    text = text.strip()
    candidates = [text]

    # Strip a leading ```json / ``` fence and any remaining backticks.
    unfenced = text
    if unfenced.startswith("```json"):
        unfenced = unfenced[7:]
    if unfenced.startswith("```"):
        unfenced = unfenced[3:]
    unfenced = unfenced.strip("`").strip()
    candidates.append(unfenced)

    # Remove trailing commas before a closing brace/bracket.
    no_trailing = re.sub(r',(\s*[}\]])', r'\1', unfenced)
    candidates.append(no_trailing)

    # Last repair: assume single quotes were used as JSON string delimiters.
    requoted = no_trailing.replace("'", '"')
    candidates.append(requoted)

    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue

    # Fallback to regex extraction
    return extract_with_regex(requoted)

2. Token Limit เกินเพราะ History สะสม

อาการ: หลังใช้งานไปสักพัก LLM return error "context_length_exceeded"

# ❌ Anti-pattern: makes the token count grow without bound
messages.append({"role": "user", "content": query})
for old_msg in conversation_history:
    messages.append(old_msg)  # every old message is appended, so the context keeps accumulating

# ✅ วิธีที่ถูกต้อง - จำกัด context อย่างเคร่งครัด
# Per-model prompt token limits used by build_context(); models not listed
# here fall back to 30000.
MAX_TOKENS = {
    "gpt-4.1": 120000,  # Reserve 10k for response
    "claude-sonnet-4.5": 180000,
    "deepseek-v3.2": 60000,
}


def build_context(messages: list, model: str, query: str) -> list:
    """Build a bounded message list: system prompt, recent history, query.

    At most the last four entries of ``messages`` are included. While
    ``estimate_tokens`` reports the context as over the model's limit, the
    oldest non-system message is dropped; once only three or fewer messages
    remain, the query text is cut to 500 characters and trimming stops.

    Returns:
        The list of messages to send.
    """
    token_limit = MAX_TOKENS.get(model, 30000)
    current_turn = {"role": "user", "content": query}
    context = [
        {"role": "system", "content": SYSTEM_PROMPT},
        *messages[-4:],
        current_turn,
    ]

    while estimate_tokens(context) > token_limit:
        if len(context) > 3:
            # Index 0 is the system prompt, so index 1 is the oldest message.
            del context[1]
            continue
        # Nothing left to drop: shorten the query itself and give up trimming.
        current_turn["content"] = current_turn["content"][:500]
        break

    return context

3. Tool Call Timeout ทำให้ Request ค้าง

อาการ: Agent รอ tool response ที่ไม่มีวันกลับมา ทำให้ request ค้างนาน

# ❌ Anti-pattern: the request can hang indefinitely
def call_tool(tool_name: str, args: dict):
    """POST ``args`` as JSON to ``{API_URL}/{tool_name}`` and return the decoded body.

    Shown as the example of what NOT to do: no ``timeout`` is passed to
    ``requests.post`` and no error handling is performed.
    """
    result = requests.post(f"{API_URL}/{tool_name}", json=args)
    return result.json()  # waits forever if the API is down

# ✅ วิธีที่ถูกต้อง - พร้อม timeout และ retry
from requests.exceptions import RequestException, Timeout

def call_tool_with_timeout(
    tool_name: str,
    args: dict,
    timeout: float = 5.0,
    max_retries: int = 2
) -> dict:
    """Call a tool endpoint with a timeout and a bounded number of attempts.

    POSTs ``args`` as JSON to ``{API_URL}/{tool_name}``. Failures are reported
    as data rather than raised, so the agent loop can keep going.

    Args:
        tool_name: Path segment identifying the tool.
        args: JSON-serializable request body.
        timeout: Timeout in seconds passed to ``requests.post``.
        max_retries: Total number of attempts; values below 1 are treated as
            one attempt so the function always returns a dict.

    Returns:
        The decoded JSON response on success, otherwise a dict with ``error``
        (``"tool_timeout"`` or ``"tool_unavailable"``), ``tool`` and
        ``message``.
    """
    url = f"{API_URL}/{tool_name}"
    attempts = max(1, max_retries)
    failure = {}

    for attempt in range(attempts):
        try:
            response = requests.post(
                url,
                json=args,
                timeout=timeout
            )
            response.raise_for_status()
            return response.json()

        except Timeout:
            failure = {
                "error": "tool_timeout",
                "tool": tool_name,
                "message": f"Tool took longer than {timeout}s"
            }

        except RequestException as e:
            failure = {
                "error": "tool_unavailable",
                "tool": tool_name,
                "message": str(e)
            }

        if attempt < attempts - 1:
            # Linear backoff between attempts: 1s, 2s, 3s, ...
            time.sleep(1 * (attempt + 1))

    # Every attempt failed; report the last failure.
    return failure

4. LLM API Error ไม่ได้จัดการ

อาการ: API key หมด, rate limit, หรือ server error ทำให้ระบบ crash

# ❌ Anti-pattern: no error handling around the LLM call, so any exception it raises propagates to the caller
response = client.chat.completions.create(
    model="gpt-4.1",
    messages=messages
)
return response.choices[0].message.content

# ✅ วิธีที่ถูกต้อง - จัดการทุก error case
from openai import RateLimitError, AuthenticationError, APIError

def call_llm_with_error_handling(
    client,
    model: str,
    messages: list,
    fallback_model: str = "deepseek-v3.2"
) -> str:
    try:
        response = client.chat.completions.create(
            model=model,
            messages=messages
        )
        return response.choices[0].message.content

    except AuthenticationError:
        logger.error("Invalid API key")
        return "ระบบกำลังมีปัญหา กรุณาติดต่อผู้ดูแลระบบ"

    except RateLimitError:
        logger.warning(f"Rate limit hit for {model}, trying {fallback_model}")
        # Try fallback model
        try:
            response = client.chat.completions.create(
                model=fallback_model,
                messages=messages
            )
            return response.choices[0].message.content
        except Exception:
            return "ระบบมีผู้ใช้งานมาก กรุณารอสักครู่แล้วลองใหม่"

    except APIError as e:
        logger.error(f"API error: {e}")
        if e.status_code >= 500:
            return "เซิร์ฟเวอร์ก
แหล่งข้อมูลที่เกี่ยวข้อง
📚 บทช่วยสอน AI API
💰 ดูราคา
📖 เอกสารสำหรับนักพัฒนา
🚀 สมัครฟรี
บทความที่เกี่ยวข้อง
Claude Opus 4.6 vs GPT-5.2：2026 การทดสอบความสามารถเขียนโค้ดข
Binance vs OKX vs Bybit 2026: API เปรียบเทียบสำหรับ Quant Tr
AI Agent Framework 2026: คู่มือเปรียบเทียบ LangGraph vs Crew

ทำไม ReAct Agent ถึง "พัง" ใน Production

กรณีศึกษา: ระบบ Customer Service AI ของ E-commerce

บทเรียนที่ 1: ต้องมี Max Steps และ Timeout

บทเรียนที่ 2: Tool Output Parsing ต้อง Robust

Usage with error handling

บทเรียนที่ 3: Cost Optimization ด้วย Strategic Context

Real implementation with HolySheep API

บทเรียนที่ 4: Circuit Breaker และ Fallback Strategy

ข้อผิดพลาดที่พบบ่อยและวิธีแก้ไข

1. JSON Output จาก LLM พิมพ์ผิด Format

✅ วิธีที่ถูกต้อง

2. Token Limit เกินเพราะ History สะสม

✅ วิธีที่ถูกต้อง - จำกัด context อย่างเคร่งครัด

3. Tool Call Timeout ทำให้ Request ค้าง

✅ วิธีที่ถูกต้อง - พร้อม timeout และ retry

4. LLM API Error ไม่ได้จัดการ

✅ วิธีที่ถูกต้อง - จัดการทุก error case

แหล่งข้อมูลที่เกี่ยวข้อง

บทความที่เกี่ยวข้อง

🔥 ลอง HolySheep AI