add rag for platform_knowledge_base
This commit is contained in:
parent
5d93ac5a32
commit
2554150b29
@ -80,6 +80,21 @@ class AIChatService:
|
||||
"required": []
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "search_knowledge_base",
|
||||
"description": "从平台知识库中检索相关信息",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"query": {"type": "string", "description": "搜索查询"},
|
||||
"max_results": {"type": "integer", "description": "最大结果数", "default": 3}
|
||||
},
|
||||
"required": ["query"]
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
@ -98,16 +113,23 @@ class AIChatService:
|
||||
4. 品牌数字化服务
|
||||
|
||||
服务原则:
|
||||
1. 专业、友好、准确回答用户问题
|
||||
2. 优先使用官方信息回答
|
||||
3. 对于不确定的信息,明确告知用户
|
||||
4. 引导用户使用相关功能和服务
|
||||
5. 当用户需要验证产品真伪时,主动提供二维码扫描功能
|
||||
1. 严谨谦虚,基于知识库和真实数据回答,绝不捏造事实
|
||||
2. 优先使用官方信息和知识库内容回答
|
||||
3. 对于不确定或超出知识范围的信息,明确告知用户"我不确定"或"需要进一步确认"
|
||||
4. 如果知识库中没有相关信息,诚实告知用户,不要推测或编造
|
||||
5. 引导用户使用相关功能和服务
|
||||
6. 当用户需要验证产品真伪时,主动提供二维码扫描功能
|
||||
|
||||
可用工具:
|
||||
- start_qr_scan: 启动二维码扫描功能,用于产品防伪验证
|
||||
- search_knowledge_base: 从平台知识库中检索相关信息,用于回答专业问题
|
||||
|
||||
请根据用户问题提供准确、有用的回答。如果用户需要验证产品,请使用二维码扫描工具。"""
|
||||
回答要求:
|
||||
- 严格基于知识库检索结果和真实数据回答
|
||||
- 如果知识库中没有相关信息,请明确告知用户
|
||||
- 对于专业问题,优先使用知识库搜索工具获取准确信息
|
||||
- 保持严谨谦虚的态度,不夸大、不推测、不编造
|
||||
- 如果用户需要验证产品,请使用二维码扫描工具"""
|
||||
|
||||
elif self.chat_type == 'product':
|
||||
system_prompt = f"""你是一个专业的徵象防伪验证平台产品客服助手,专门为产品ID {self.product_id} 提供客服服务。
|
||||
@ -120,16 +142,23 @@ class AIChatService:
|
||||
- 服务类型: 产品专属客服
|
||||
|
||||
服务原则:
|
||||
1. 专业、友好、准确回答用户关于该产品的问题
|
||||
2. 优先使用产品相关信息回答
|
||||
3. 对于不确定的信息,明确告知用户
|
||||
4. 引导用户使用相关功能和服务
|
||||
5. 当用户需要验证产品真伪时,主动提供二维码扫描功能
|
||||
1. 严谨谦虚,基于知识库和真实数据回答,绝不捏造事实
|
||||
2. 优先使用产品相关信息和知识库内容回答
|
||||
3. 对于不确定或超出知识范围的信息,明确告知用户"我不确定"或"需要进一步确认"
|
||||
4. 如果知识库中没有相关信息,诚实告知用户,不要推测或编造
|
||||
5. 引导用户使用相关功能和服务
|
||||
6. 当用户需要验证产品真伪时,主动提供二维码扫描功能
|
||||
|
||||
可用工具:
|
||||
- start_qr_scan: 启动二维码扫描功能,用于产品防伪验证
|
||||
- search_knowledge_base: 从平台知识库中检索相关信息,用于回答专业问题
|
||||
|
||||
请根据用户问题提供准确、有用的回答。如果用户需要验证产品,请使用二维码扫描工具。"""
|
||||
回答要求:
|
||||
- 严格基于知识库检索结果和真实数据回答
|
||||
- 如果知识库中没有相关信息,请明确告知用户
|
||||
- 对于专业问题,优先使用知识库搜索工具获取准确信息
|
||||
- 保持严谨谦虚的态度,不夸大、不推测、不编造
|
||||
- 如果用户需要验证产品,请使用二维码扫描工具"""
|
||||
|
||||
self.conversation_history.append({
|
||||
"role": "system",
|
||||
@ -153,6 +182,8 @@ class AIChatService:
|
||||
|
||||
if function_name == "start_qr_scan":
|
||||
return self._execute_qr_scan(tool_call.id)
|
||||
elif function_name == "search_knowledge_base":
|
||||
return self._execute_search_knowledge_base(tool_call.id, tool_call.function)
|
||||
else:
|
||||
return ToolResult(
|
||||
tool_call_id=tool_call.id,
|
||||
@ -192,6 +223,108 @@ class AIChatService:
|
||||
content=json.dumps(result, ensure_ascii=False)
|
||||
)
|
||||
|
||||
def _execute_search_knowledge_base(self, tool_call_id: str, function_data) -> ToolResult:
    """Execute the knowledge-base (RAG) search tool.

    Parses the tool-call arguments, runs a retrieval against the platform
    knowledge base and returns the enhanced context as the tool result.

    Args:
        tool_call_id: ID of the tool call, echoed back in the ToolResult.
        function_data: Tool-call payload. Either an SDK object exposing an
            ``arguments`` attribute or a plain dict with an ``arguments``
            key; the arguments are a JSON string (or pre-parsed dict) with
            ``query`` (required) and ``max_results`` (optional, default 3).

    Returns:
        ToolResult: retrieved context on success, or a human-readable
        error message on any failure — a tool must never raise into the
        chat loop.
    """
    import json
    import traceback

    print(f"\n🔍 开始执行知识库搜索工具...")
    print(f"📊 工具调用ID: {tool_call_id}")
    print(f"📋 函数数据: {function_data}")

    def _error(content: str) -> ToolResult:
        # Single place to build a failure result so every error path
        # returns a consistent ToolResult instead of raising.
        return ToolResult(tool_call_id=tool_call_id, content=content)

    try:
        # The OpenAI SDK hands us an object carrying `.arguments`;
        # plain dicts (e.g. from tests) carry the same data under a key.
        if hasattr(function_data, 'arguments'):
            arguments = function_data.arguments
        else:
            arguments = function_data.get('arguments', '{}')

        # Arguments may arrive as a JSON string or an already-parsed dict.
        if isinstance(arguments, str):
            try:
                args = json.loads(arguments)
            except json.JSONDecodeError as je:
                print(f"❌ JSON解析失败: {je}")
                return _error(f"参数格式错误: {str(je)}")
        else:
            args = arguments

        query = args.get('query', '')
        max_results = args.get('max_results', 3)
        print(f"📝 解析结果: query='{query}', max_results={max_results}")

        if not query:
            print(f"❌ 搜索查询为空")
            return _error("搜索查询不能为空")

        # Import lazily: the RAG service pulls in heavy optional
        # dependencies (langchain, faiss, sentence-transformers).
        try:
            from .rag_service import CachedLangChainRAG
        except ImportError as ie:
            print(f"❌ RAG服务导入失败: {ie}")
            return _error(f"RAG服务不可用: {str(ie)}")

        try:
            rag_service = CachedLangChainRAG()
        except Exception as e:
            print(f"❌ RAG服务实例创建失败: {e}")
            traceback.print_exc()
            return _error(f"RAG服务初始化失败: {str(e)}")

        # Product chats are scoped to that product's tenant; platform
        # chats search the whole knowledge base.
        tenant_id = self.product_id if self.chat_type == 'product' else None
        print(f"🏢 租户ID: {tenant_id}")

        try:
            context = rag_service.get_enhanced_context(query, max_results, tenant_id)
            print(f"✅ 知识库搜索成功,上下文长度: {len(context)}")
            return ToolResult(tool_call_id=tool_call_id, content=context)
        except Exception as e:
            print(f"❌ 知识库搜索失败: {e}")
            traceback.print_exc()
            return _error(f"知识库搜索执行失败: {str(e)}")

    except Exception as e:
        # Last-resort guard: report the failure to the model instead of
        # crashing the surrounding chat loop.
        print(f"❌ 知识库搜索工具异常: {e}")
        traceback.print_exc()
        return _error(f"知识库搜索工具异常: {str(e)}")
|
||||
|
||||
def chat(self, user_message: str) -> str:
|
||||
"""发送消息给AI并获取回复
|
||||
|
||||
|
||||
@ -132,6 +132,7 @@ class Command(BaseCommand):
|
||||
self.stdout.write("输入 'help' 查看帮助信息")
|
||||
self.stdout.write("输入 'history' 查看对话历史")
|
||||
self.stdout.write("输入 'clear' 清空对话历史")
|
||||
self.stdout.write("输入 'test-rag' 测试知识库搜索功能")
|
||||
self.stdout.write("输入 'quit' 或 'exit' 退出系统")
|
||||
self.stdout.write("=" * 50)
|
||||
self.stdout.write("💡 提示: 使用 ↑↓ 键浏览历史,Tab 键补全,Ctrl+A/E 跳转行首/尾")
|
||||
@ -189,6 +190,9 @@ class Command(BaseCommand):
|
||||
elif user_input.lower() == 'stats':
|
||||
self._print_readline_stats(readline_config)
|
||||
continue
|
||||
elif user_input.lower() == 'test-rag':
|
||||
self._test_rag_function(ai_service)
|
||||
continue
|
||||
|
||||
# 发送消息给AI
|
||||
self.stdout.write("🤖 AI助手正在思考...")
|
||||
@ -225,6 +229,7 @@ class Command(BaseCommand):
|
||||
功能特性:
|
||||
- 智能客服问答
|
||||
- 二维码扫描验证(输入 '扫描' 或 '验证' 触发)
|
||||
- 知识库检索(RAG功能)
|
||||
- 防伪平台信息咨询
|
||||
- 多轮对话支持
|
||||
- 持久化输入历史记录
|
||||
@ -301,3 +306,35 @@ Ctrl+D - 退出 (EOF)
|
||||
self.stdout.write("历史文件: 无法获取信息")
|
||||
|
||||
self.stdout.write("=" * 30)
|
||||
|
||||
def _test_rag_function(self, ai_service):
    """Smoke-test the RAG knowledge-base search through the chat service.

    Runs a few canned queries via ``ai_service.chat`` and prints a
    truncated preview of each reply; a failure on one query is reported
    and does not abort the remaining ones.
    """
    out = self.stdout.write
    out("\n🧪 测试RAG知识库搜索功能")
    out("=" * 50)

    sample_queries = (
        "徵象平台有什么功能?",
        "如何验证产品真伪?",
        "防伪技术原理是什么?",
    )

    try:
        for idx, question in enumerate(sample_queries, 1):
            out(f"\n🔍 测试查询 {idx}: {question}")
            out("-" * 30)

            try:
                reply = ai_service.chat(question)
            except Exception as exc:
                out(f"❌ 查询失败: {str(exc)}")
            else:
                # Show only the first 200 characters of the reply.
                out(f"✅ AI回复: {reply[:200]}...")

        out("\n" + "=" * 50)
        out("🎯 RAG功能测试完成!")
        out("💡 提示: 如果AI使用了知识库搜索工具,说明RAG功能正常工作")

    except Exception as exc:
        self.stderr.write(f"❌ RAG测试失败: {str(exc)}")
        import traceback
        traceback.print_exc()
|
||||
|
||||
501
api/products/rag_service.py
Normal file
501
api/products/rag_service.py
Normal file
@ -0,0 +1,501 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
RAG Service Module for 徵象防伪验证平台
|
||||
基于 LangChain 的实时知识库检索服务,支持缓存优化
|
||||
"""
|
||||
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
from langchain.embeddings import HuggingFaceEmbeddings
|
||||
from langchain.vectorstores import FAISS
|
||||
from langchain.schema import Document
|
||||
from .models import Article
|
||||
from django.core.cache import cache
|
||||
from typing import List, Dict, Any
|
||||
import hashlib
|
||||
import pickle
|
||||
import time
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
class CachedLangChainRAG:
    """Cache-backed real-time LangChain RAG service.

    Retrieves platform knowledge-base articles (``Article`` rows flagged
    ``is_platform_knowledge_base``), cleans their HTML, splits them into
    overlapping chunks, embeds them with a Chinese sentence-transformer
    model and serves similarity searches through a FAISS index.

    Three Django-cache layers are used, each keyed on a content signature
    so edits to articles naturally produce fresh keys:
      1. the article list,
      2. the pickled FAISS vector store,
      3. the per-query search results.
    """

    def __init__(self):
        # Chinese text-embedding model; CPU-only. Normalized embeddings
        # make FAISS scores behave like cosine similarity.
        self.embeddings = HuggingFaceEmbeddings(
            model_name="shibing624/text2vec-base-chinese",
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': True}
        )

        # Splitter prefers paragraph and sentence boundaries (both CJK
        # and ASCII punctuation) before falling back to hard cuts.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            separators=["\n\n", "\n", "。", "!", "?", ".", "!", "?", ";", ";"]
        )

        # Cache configuration
        self.vector_cache_timeout = 86400 * 30  # 30 days
        self.search_cache_timeout = 86400  # 1 day

    def _get_article_content_hash(self, article: Article) -> str:
        """Return an MD5 hex digest over one article's title, body and options."""
        content = f"{article.title or ''}{article.body}{article.options or ''}"
        return hashlib.md5(content.encode('utf-8')).hexdigest()

    def _get_articles_signature(self, articles: List[Article]) -> str:
        """Return a stable MD5 signature for a collection of articles.

        Combines each article's id and content hash; any content edit or
        membership change yields a different signature.
        """
        signatures = []
        for article in articles:
            content_hash = self._get_article_content_hash(article)
            signatures.append(f"{article.id}_{content_hash}")

        # Sort so the signature does not depend on query ordering.
        signatures.sort()
        combined = '|'.join(signatures)
        return hashlib.md5(combined.encode('utf-8')).hexdigest()

    def search(self, query: str, max_results: int = 3, tenant_id: int = None) -> List[Dict[str, Any]]:
        """Similarity search with a per-query result cache.

        Args:
            query: free-text search query.
            max_results: maximum number of chunks to return.
            tenant_id: optional tenant scope; ``None`` searches platform-wide.

        Returns:
            A list of result dicts (id, title, content, score, tenant_id,
            url); empty list on failure.
        """
        start_time = time.time()
        print(f"\n🔍 RAG搜索开始: {query}")
        print(f"📊 参数: max_results={max_results}, tenant_id={tenant_id}")

        # Search-cache key covers query text, result count and tenant scope.
        # NOTE(review): it does NOT include the knowledge-base content
        # signature, so cached results can be up to search_cache_timeout
        # stale after an article edit — confirm this is acceptable.
        query_hash = hashlib.md5(query.encode('utf-8')).hexdigest()
        cache_key = f"rag_search_{query_hash}_{max_results}_{tenant_id}"

        # Try the cached result first.
        cache_check_start = time.time()
        cached_result = cache.get(cache_key)
        cache_check_time = time.time() - cache_check_start

        if cached_result:
            total_time = time.time() - start_time
            print(f"✅ 缓存命中: {cache_key}")
            print(f"📚 返回缓存结果: {len(cached_result)} 条")
            print(f"⏱️ 缓存检查耗时: {cache_check_time:.3f}秒")
            print(f"⏱️ 总耗时: {total_time:.3f}秒")
            return cached_result

        print(f"❌ 缓存未命中: {cache_key}")
        print(f"⏱️ 缓存检查耗时: {cache_check_time:.3f}秒")
        print("🔄 执行实时搜索...")

        # Cache miss: run the real search.
        search_start_time = time.time()
        results = self._perform_search(query, max_results, tenant_id)
        search_time = time.time() - search_start_time

        # Cache the search results.
        cache_start_time = time.time()
        cache.set(cache_key, results, self.search_cache_timeout)
        cache_time = time.time() - cache_start_time

        total_time = time.time() - start_time
        print(f"💾 搜索结果已缓存,过期时间: {self.search_cache_timeout}秒")
        print(f"⏱️ 搜索耗时: {search_time:.3f}秒")
        print(f"⏱️ 缓存写入耗时: {cache_time:.3f}秒")
        print(f"⏱️ 总耗时: {total_time:.3f}秒")

        return results

    def _perform_search(self, query: str, max_results: int, tenant_id: int = None) -> List[Dict[str, Any]]:
        """Run the actual FAISS similarity search (no result cache).

        Returns formatted result dicts; swallows all exceptions and
        returns an empty list so callers never see a raised error.
        """
        search_start_time = time.time()
        try:
            print(f"🔧 获取向量存储...")
            vector_start_time = time.time()
            # Get (or build) the cached vector store for this tenant scope.
            vectorstore = self._get_cached_vectorstore(tenant_id)
            vector_time = time.time() - vector_start_time
            print(f"⏱️ 向量存储获取耗时: {vector_time:.3f}秒")

            if not vectorstore:
                print("❌ 无法获取向量存储")
                return []

            print(f"🎯 执行相似度搜索,查询: '{query}'")
            similarity_start_time = time.time()
            # Similarity search with scores attached.
            docs = vectorstore.similarity_search_with_score(query, k=max_results)
            similarity_time = time.time() - similarity_start_time
            print(f"⏱️ 相似度搜索耗时: {similarity_time:.3f}秒")
            print(f"📊 找到 {len(docs)} 个候选文档")

            # Format the results from (Document, score) pairs into plain dicts.
            format_start_time = time.time()
            results = []
            for i, (doc, score) in enumerate(docs, 1):
                result = {
                    'id': doc.metadata.get('article_id'),
                    'title': doc.metadata.get('title'),
                    'content': doc.page_content,
                    'score': float(score),
                    'tenant_id': doc.metadata.get('tenant_id'),
                    'url': doc.metadata.get('url')
                }
                results.append(result)

                print(f"  📄 结果 {i}:")
                print(f"     ID: {result['id']}")
                print(f"     标题: {result['title']}")
                print(f"     相关度: {result['score']:.4f}")
                print(f"     内容预览: {result['content'][:100]}...")

            format_time = time.time() - format_start_time
            total_search_time = time.time() - search_start_time
            print(f"⏱️ 结果格式化耗时: {format_time:.3f}秒")
            print(f"✅ 搜索完成,返回 {len(results)} 个结果")
            print(f"⏱️ 搜索总耗时: {total_search_time:.3f}秒")
            return results

        except Exception as e:
            total_search_time = time.time() - search_start_time
            print(f"❌ RAG搜索失败: {str(e)}")
            print(f"⏱️ 搜索耗时: {total_search_time:.3f}秒")
            import traceback
            traceback.print_exc()
            return []

    def _get_cached_vectorstore(self, tenant_id: int = None):
        """Return the FAISS vector store for this tenant scope.

        Restores a pickled store from the Django cache when the article
        content signature matches; otherwise builds a fresh store and
        caches it. Returns ``None`` when there are no articles.
        """
        vector_start_time = time.time()
        print(f"🔍 获取知识库文章...")

        articles_start_time = time.time()
        # Fetch the knowledge-base article list (itself cached).
        articles = self._get_knowledge_base_articles(tenant_id)
        articles_time = time.time() - articles_start_time
        print(f"⏱️ 文章获取耗时: {articles_time:.3f}秒")

        if not articles:
            print("❌ 没有找到知识库文章")
            return None

        print(f"📚 找到 {len(articles)} 篇知识库文章")

        # Check the vector cache (key embeds the article content signature).
        cache_check_start = time.time()
        cache_key = self._get_vector_cache_key(articles, tenant_id)
        print(f"🔑 向量缓存key: {cache_key}")

        cached_vectors = cache.get(cache_key)
        cache_check_time = time.time() - cache_check_start
        print(f"⏱️ 向量缓存检查耗时: {cache_check_time:.3f}秒")

        if cached_vectors:
            restore_start_time = time.time()
            print("✅ 向量缓存命中,恢复向量存储")
            # Restore the vector store from its pickled bytes.
            result = self._restore_vectorstore(cached_vectors)
            restore_time = time.time() - restore_start_time
            print(f"⏱️ 向量存储恢复耗时: {restore_time:.3f}秒")

            total_time = time.time() - vector_start_time
            print(f"⏱️ 向量存储获取总耗时: {total_time:.3f}秒")
            return result

        print("❌ 向量缓存未命中,创建新的向量存储")
        create_start_time = time.time()
        # Cache miss: build a new vector store from scratch.
        vectorstore = self._create_vectorstore(articles)
        create_time = time.time() - create_start_time
        print(f"⏱️ 向量存储创建耗时: {create_time:.3f}秒")

        # Cache the freshly built vector store.
        cache_start_time = time.time()
        self._cache_vectorstore(vectorstore, cache_key)
        cache_time = time.time() - cache_start_time
        print(f"⏱️ 向量存储缓存耗时: {cache_time:.3f}秒")

        total_time = time.time() - vector_start_time
        print(f"⏱️ 向量存储获取总耗时: {total_time:.3f}秒")
        return vectorstore

    def _get_knowledge_base_articles(self, tenant_id: int = None) -> List[Article]:
        """Fetch knowledge-base articles, with a content-keyed cache.

        Queries ``Article`` rows flagged as platform knowledge base,
        optionally restricted to one tenant.
        """
        articles_start_time = time.time()

        # Cache key is derived from the current article content, so stale
        # entries are simply never read again after an edit.
        cache_key = self._get_articles_cache_key(tenant_id)
        print(f"🔍 检查文章缓存: {cache_key}")

        cache_check_start = time.time()
        cached_articles = cache.get(cache_key)
        cache_check_time = time.time() - cache_check_start

        if cached_articles:
            total_time = time.time() - articles_start_time
            print(f"✅ 文章缓存命中,返回 {len(cached_articles)} 篇文章")
            print(f"⏱️ 缓存检查耗时: {cache_check_time:.3f}秒")
            print(f"⏱️ 文章获取总耗时: {total_time:.3f}秒")
            return cached_articles

        print("❌ 文章缓存未命中,查询数据库...")
        print(f"⏱️ 缓存检查耗时: {cache_check_time:.3f}秒")

        # Query the database.
        # NOTE(review): `if tenant_id:` also skips the filter for
        # tenant_id == 0 — confirm tenant ids are always positive.
        db_start_time = time.time()
        filter_kwargs = {'is_platform_knowledge_base': True}
        if tenant_id:
            filter_kwargs['tenant_id'] = tenant_id

        articles = list(Article.objects.filter(**filter_kwargs))
        db_time = time.time() - db_start_time
        print(f"📊 数据库查询结果: {len(articles)} 篇文章")
        print(f"⏱️ 数据库查询耗时: {db_time:.3f}秒")

        # Cache the article list.
        cache_start_time = time.time()
        cache.set(cache_key, articles, self.vector_cache_timeout)
        cache_time = time.time() - cache_start_time
        print(f"💾 文章列表已缓存,过期时间: {self.vector_cache_timeout}秒")
        print(f"⏱️ 文章缓存写入耗时: {cache_time:.3f}秒")

        total_time = time.time() - articles_start_time
        print(f"⏱️ 文章获取总耗时: {total_time:.3f}秒")
        return articles

    def _get_articles_cache_key(self, tenant_id: int = None) -> str:
        """Build the article-list cache key from the current DB content.

        Note: this re-queries the articles to compute their signature, so
        even a cache hit in ``_get_knowledge_base_articles`` costs one
        database round-trip.
        """
        # Query the current article set for this scope.
        filter_kwargs = {'is_platform_knowledge_base': True}
        if tenant_id:
            filter_kwargs['tenant_id'] = tenant_id

        articles = Article.objects.filter(**filter_kwargs)

        # Hash the article content into a signature.
        content_signature = self._get_articles_content_signature(articles)

        # Key embeds both the tenant id and the content signature.
        cache_key = f"rag_kb_articles_{tenant_id}_{content_signature}"
        print(f"🔑 生成文章缓存key: {cache_key}")
        print(f"📊 内容签名: {content_signature}")

        return cache_key

    def _get_articles_content_signature(self, articles) -> str:
        """Return an order-independent MD5 signature of the articles' content."""
        signatures = []

        for article in articles:
            # Sign each article by id, title, full body and options —
            # the complete content is hashed directly.
            title = article.title or ''
            body = article.body or ''
            options = article.options or ''

            article_signature = f"{article.id}_{title}_{body}_{options}"
            signatures.append(hashlib.md5(article_signature.encode('utf-8')).hexdigest())

        # Sort so the combined signature is order-independent.
        signatures.sort()
        combined = '|'.join(signatures)

        # Final signature over the combined per-article hashes.
        final_signature = hashlib.md5(combined.encode('utf-8')).hexdigest()
        return final_signature

    def _clean_html_content(self, html_content: str) -> str:
        """Strip HTML down to whitespace-normalized plain text."""
        if not html_content:
            return ""

        # Parse the HTML.
        soup = BeautifulSoup(html_content, 'html.parser')

        # Drop non-content tags (scripts, styles, metadata, etc.).
        for tag in soup(["script", "style", "head", "title", "meta", "link", "noscript"]):
            tag.decompose()

        # Extract the plain text.
        text = soup.get_text()

        # Collapse whitespace: strip each line, split double-space runs,
        # and rejoin the non-empty pieces with single spaces.
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = ' '.join(chunk for chunk in chunks if chunk)

        return text

    def _get_vector_cache_key(self, articles: List[Article], tenant_id: int = None) -> str:
        """Build the vector-store cache key for an article set + tenant scope."""
        # Key embeds the article-set signature so any content change
        # produces a new key (old entries just expire).
        signature = self._get_articles_signature(articles)
        cache_key = f"rag_vectors_{tenant_id}_{signature}"
        print(f"🔑 生成向量缓存key: {cache_key}")
        return cache_key

    def _create_vectorstore(self, articles: List[Article]):
        """Embed the articles' chunks into a new FAISS vector store.

        Returns ``None`` when no usable document chunks could be produced.
        """
        create_start_time = time.time()
        print(f"🔧 创建文档对象...")

        docs_start_time = time.time()
        documents = self._create_documents(articles)
        docs_time = time.time() - docs_start_time
        print(f"⏱️ 文档创建耗时: {docs_time:.3f}秒")

        if not documents:
            print("❌ 无法创建文档对象")
            return None

        print(f"📄 创建了 {len(documents)} 个文档片段")
        print(f"🎯 开始向量化...")

        # Embedding every chunk is the expensive step here.
        embedding_start_time = time.time()
        vectorstore = FAISS.from_documents(
            documents=documents,
            embedding=self.embeddings
        )
        embedding_time = time.time() - embedding_start_time
        print(f"⏱️ 向量化耗时: {embedding_time:.3f}秒")

        total_time = time.time() - create_start_time
        print(f"✅ 向量存储创建完成,包含 {len(documents)} 个向量")
        print(f"⏱️ 向量存储创建总耗时: {total_time:.3f}秒")
        return vectorstore

    def _create_documents(self, articles: List[Article]) -> List[Document]:
        """Turn articles into chunked LangChain ``Document`` objects.

        Each article's HTML is cleaned, split by the configured text
        splitter, and chunks shorter than 50 characters are discarded.
        A failing article is logged and skipped, never fatal.
        """
        docs_start_time = time.time()
        documents = []

        for article in articles:
            try:
                article_start_time = time.time()
                print(f"  📝 处理文章: {article.title or '无标题'} (ID: {article.id})")

                # Strip the HTML body to plain text.
                clean_start_time = time.time()
                clean_body = self._clean_html_content(article.body)
                clean_time = time.time() - clean_start_time
                print(f"    🧹 HTML清理耗时: {clean_time:.3f}秒")

                split_start_time = time.time()
                chunks = self.text_splitter.split_text(clean_body)
                split_time = time.time() - split_start_time
                print(f"    ✂️ 分割为 {len(chunks)} 个片段,耗时: {split_time:.3f}秒")

                valid_chunks = 0
                for i, chunk in enumerate(chunks):
                    # Too-short fragments carry no useful signal.
                    if len(chunk.strip()) < 50:
                        continue

                    doc = Document(
                        page_content=chunk,
                        metadata={
                            'article_id': article.id,
                            'title': article.title or '无标题',
                            'tenant_id': article.tenant_id,
                            'chunk_index': i,
                            'url': article.url,
                            'source': 'platform_knowledge_base',
                            'html_cleaned': True
                        }
                    )
                    documents.append(doc)
                    valid_chunks += 1

                article_time = time.time() - article_start_time
                print(f"    ✅ 有效片段: {valid_chunks}/{len(chunks)},文章处理耗时: {article_time:.3f}秒")

            except Exception as e:
                print(f"❌ 处理文章 {article.id} 失败: {str(e)}")
                continue

        total_time = time.time() - docs_start_time
        print(f"📊 总共创建了 {len(documents)} 个有效文档片段")
        print(f"⏱️ 文档创建总耗时: {total_time:.3f}秒")
        return documents

    def _cache_vectorstore(self, vectorstore, cache_key: str):
        """Pickle the vector store into the Django cache (best-effort).

        Failures are logged and swallowed — a missing cache entry only
        costs a rebuild on the next request.
        """
        try:
            print(f"💾 序列化向量存储...")
            serialize_start_time = time.time()
            # Serialize the whole FAISS store. NOTE(review): pickled data
            # is only ever read back from our own cache backend; never
            # unpickle untrusted input.
            serialized_data = pickle.dumps(vectorstore)
            serialize_time = time.time() - serialize_start_time
            print(f"📦 序列化完成,大小: {len(serialized_data)} 字节,耗时: {serialize_time:.3f}秒")

            cache_start_time = time.time()
            cache.set(cache_key, serialized_data, self.vector_cache_timeout)
            cache_time = time.time() - cache_start_time
            print(f"✅ 向量存储已缓存,过期时间: {self.vector_cache_timeout}秒,缓存写入耗时: {cache_time:.3f}秒")
        except Exception as e:
            print(f"❌ 缓存向量存储失败: {str(e)}")
            import traceback
            traceback.print_exc()

    def _restore_vectorstore(self, serialized_data: bytes):
        """Unpickle a vector store cached by ``_cache_vectorstore``.

        Returns ``None`` on failure so callers fall back to a rebuild.
        """
        try:
            restore_start_time = time.time()
            result = pickle.loads(serialized_data)
            restore_time = time.time() - restore_start_time
            print(f"⏱️ 向量存储反序列化耗时: {restore_time:.3f}秒")
            return result
        except Exception as e:
            print(f"❌ 恢复向量存储失败: {str(e)}")
            import traceback
            traceback.print_exc()
            return None

    def get_enhanced_context(self, query: str, max_results: int = 3, tenant_id: int = None) -> str:
        """Search and format the hits into one context string for the LLM.

        Returns a numbered, score-annotated block of matched chunks, or a
        fixed "nothing found" sentence when the search yields no results.
        """
        context_start_time = time.time()
        print(f"\n📖 开始构建增强上下文...")

        results = self.search(query, max_results, tenant_id)

        if not results:
            total_time = time.time() - context_start_time
            print(f"⏱️ 上下文构建耗时: {total_time:.3f}秒")
            return "未找到相关知识库内容。"

        # Assemble the context block.
        build_start_time = time.time()
        context = "基于知识库检索到的相关信息:\n\n"
        for i, result in enumerate(results, 1):
            context += f"【{i}】{result['title']}\n"
            context += f"相关度:{result['score']:.3f}\n"
            context += f"内容:{result['content']}\n\n"

        build_time = time.time() - build_start_time
        total_time = time.time() - context_start_time
        print(f"⏱️ 上下文构建耗时: {build_time:.3f}秒")
        print(f"⏱️ 上下文构建总耗时: {total_time:.3f}秒")

        return context

    def clear_old_cache(self, tenant_id: int = None):
        """Mark a tenant's stale search/vector caches for invalidation.

        See ``_clear_cache_by_pattern`` — Django's cache API cannot delete
        by pattern, so this only sets invalidation markers.
        """
        try:
            # Mark old search-result caches.
            old_search_pattern = f"rag_search_*_{tenant_id}_*"
            self._clear_cache_by_pattern(old_search_pattern)

            # Mark old vector-store caches.
            old_vector_pattern = f"rag_vectors_{tenant_id}_*"
            self._clear_cache_by_pattern(old_vector_pattern)

            print(f"🧹 已清理租户 {tenant_id} 的旧缓存")
        except Exception as e:
            print(f"⚠️ 清理旧缓存失败: {e}")

    def _clear_cache_by_pattern(self, pattern: str):
        """Record an invalidation marker for a cache-key pattern.

        Django's default cache backend has no pattern delete, so this
        only writes a marker; actual cleanup is deferred to later access.
        """
        cache.set(f"rag_cache_invalidated_{pattern}", time.time(), 300)  # 5-minute marker

    def _is_cache_invalidated(self, cache_key: str) -> bool:
        """Check whether a cache key is covered by an invalidation marker.

        NOTE(review): the lookup key built here
        (``rag_cache_invalidated_<prefix>*`` with a literal ``*``) does
        not match the keys written by ``_clear_cache_by_pattern``, which
        embed the full pattern — as written this always returns False.
        Needs a shared key scheme to actually work.
        """
        for pattern in ["rag_search_", "rag_vectors_", "rag_kb_articles_"]:
            if pattern in cache_key:
                invalidated_key = f"rag_cache_invalidated_{pattern}*"
                if cache.get(invalidated_key):
                    return True
        return False
|
||||
@ -8,3 +8,10 @@ requests
|
||||
oss2
|
||||
aliyun-python-sdk-core-v3
|
||||
pillow
|
||||
|
||||
# RAG dependencies
|
||||
langchain>=0.1.0
|
||||
langchain-community>=0.1.0
|
||||
sentence-transformers>=2.2.0
|
||||
faiss-cpu>=1.7.4
|
||||
beautifulsoup4>=4.12.0
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user