[{"data":1,"prerenderedAt":798},["ShallowReactive",2],{"header-counts":3,"footer-counts":6,"wiki-rag":9},{"tools":4,"reviews":5},65,7,{"tools":4,"reviews":5,"playbooks":7,"news":8},10,8,{"id":10,"title":11,"body":12,"category":778,"description":80,"extension":779,"meta":780,"navigation":781,"path":782,"published":783,"relatedModels":784,"relatedTools":785,"seo":789,"slug":790,"stem":791,"summary":792,"tags":793,"updated":783,"__hash__":797},"wiki\u002Fwiki\u002Frag.md","RAG (检索增强生成)",{"type":13,"value":14,"toc":752},"minimark",[15,20,24,32,35,38,64,67,70,81,84,88,91,180,184,192,250,253,256,277,280,284,288,291,295,298,324,328,331,342,346,353,359,362,366,369,375,378,382,385,389,392,475,478,498,504,508,511,557,561,564,606,610,642,646,716,726,729],[16,17,19],"h2",{"id":18},"什么是-rag","什么是 RAG",[21,22,23],"p",{},"RAG（Retrieval-Augmented Generation，检索增强生成）是一种 AI 架构模式：在让大模型生成回答之前，先从外部知识库中检索相关信息，把检索到的内容作为上下文喂给模型。",[21,25,26,27,31],{},"简单说就是：",[28,29,30],"strong",{},"先查资料，再回答","。",[16,33,34],{"id":34},"解决什么问题",[21,36,37],{},"大模型有三个固有缺陷，RAG 可以缓解：",[39,40,41,48,58],"ol",{},[42,43,44,47],"li",{},[28,45,46],{},"知识截止"," — 模型训练数据有截止日期，不知道最新信息",[42,49,50,57],{},[28,51,52],{},[53,54,56],"a",{"href":55},"\u002Fwiki\u002Fhallucination.html","幻觉"," — 模型会一本正经地编造不存在的信息",[42,59,60,63],{},[28,61,62],{},"领域知识不足"," — 通用模型不了解企业内部文档、私有数据",[21,65,66],{},"RAG 通过引入外部知识库，让模型基于真实文档回答，大幅降低幻觉率。",[16,68,69],{"id":69},"工作流程",[71,72,77],"pre",{"className":73,"code":75,"language":76},[74],"language-text","用户提问\n  ↓\n① 查询向量化 — 把问题转成向量\n  ↓\n② 向量检索 — 从知识库找最相关的文档片段\n  ↓\n③ 组装上下文 — 问题 + 检索到的文档片段 → 组合 prompt\n  ↓\n④ LLM 生成 — 大模型基于上下文生成回答\n  ↓\n⑤ 引用标注 — 标注回答来源（哪个文档的哪一段）\n","text",[78,79,75],"code",{"__ignoreMap":80},"",[16,82,83],{"id":83},"关键组件",[85,86,87],"h3",{"id":87},"向量数据库",[21,89,90],{},"存储文档的向量表示，支持相似度检索：",[92,93,94,110],"table",{},[95,96,97],"thead",{},[98,99,100,104,107],"tr",{},[101,102,103],"th",{},"向量库",[101,105,106],{},"特点",[101,108,109],{},"适用场景",[111,112,113,125,136,147,158,169],"tbody",{},[98,114,115,119,122],{},[116,117,118],"td",{},"Chroma",[116,120,121],{},"轻量、Python 原生",[116,123,124],{},"原型开发",[98,126,127,130,133],{},[116,128,129],{},"Qdrant",[116,131,132],{},"Rust 高性能、支持过滤",[116,134,135],{},"生产环境",[98,137,138,141,144],{},[116,139,140],{},"Milvus",[116,142,143],{},"分布式、大规模",[116,145,146],{},"企业级",[98,148,149,152,155],{},[116,150,151],{},"Pinecone",[116,153,154],{},"托管 SaaS",[116,156,157],{},"免运维",[98,159,160,163,166],{},[116,161,162],{},"pgvector",[116,164,165],{},"PostgreSQL 扩展",[116,167,168],{},"已有 PG 的项目",[98,170,171,174,177],{},[116,172,173],{},"SQLite-VSS \u002F libsql",[116,175,176],{},"SQLite 扩展",[116,178,179],{},"轻量部署",[85,181,183],{"id":182},"embedding-模型","Embedding 模型",[21,185,186,187,191],{},"把文本转成向量，详见 ",[53,188,190],{"href":189},"\u002Fwiki\u002Fembedding.html","Embedding","：",[92,193,194,206],{},[95,195,196],{},[98,197,198,201,204],{},[101,199,200],{},"模型",[101,202,203],{},"维度",[101,205,106],{},[111,207,208,219,230,240],{},[98,209,210,213,216],{},[116,211,212],{},"OpenAI text-embedding-3",[116,214,215],{},"1536\u002F3072",[116,217,218],{},"通用、稳定",[98,220,221,224,227],{},[116,222,223],{},"BGE-large-zh",[116,225,226],{},"1024",[116,228,229],{},"中文效果好",[98,231,232,235,237],{},[116,233,234],{},"Jina Embeddings v3",[116,236,226],{},[116,238,239],{},"多语言",[98,241,242,245,247],{},[116,243,244],{},"Cohere embed v4",[116,246,226],{},[116,248,249],{},"多语言+多模态",[85,251,252],{"id":252},"文档分块",[21,254,255],{},"把长文档切成小块（chunk）是 RAG 的关键步骤：",[257,258,259,265,271],"ul",{},[42,260,261,264],{},[28,262,263],{},"固定大小分块"," — 每 500-1000 token 一块，简单粗暴",[42,266,267,270],{},[28,268,269],{},"语义分块"," — 按段落\u002F章节自然边界切分",[42,272,273,276],{},[28,274,275],{},"滑动窗口"," — 相邻块有重叠（如 200 token），避免切断上下文",[21,278,279],{},"分块太短 → 检索准但上下文不足\n分块太长 → 上下文全但检索精度低",[16,281,283],{"id":282},"rag-进阶模式","RAG 进阶模式",[85,285,287],{"id":286},"朴素-rag","朴素 RAG",[21,289,290],{},"基础流程：query → embed → search → stuff → generate。简单但不够精准。",[85,292,294],{"id":293},"高级-rag","高级 RAG",[21,296,297],{},"在基础流程上增加优化：",[257,299,300,306,312,318],{},[42,301,302,305],{},[28,303,304],{},"查询改写（Query Rewriting）"," — LLM 先把用户问题改写成更适合检索的形式",[42,307,308,311],{},[28,309,310],{},"重排序（Reranking）"," — 向量检索后用 Cross-Encoder 重排序",[42,313,314,317],{},[28,315,316],{},"多路召回（Hybrid Search）"," — 同时用向量检索 + BM25 关键词检索",[42,319,320,323],{},[28,321,322],{},"上下文压缩"," — 检索后先摘要再喂给 LLM",[85,325,327],{"id":326},"late-chunking","Late Chunking",[21,329,330],{},"传统做法：先切块再 embedding。问题是每个 chunk 失去全文上下文。",[21,332,333,334,337,338,341],{},"Late Chunking：先 embedding 整篇文档，",[28,335,336],{},"在 token-level embedding 输出后再做 pooling 切块","。每个 chunk 的向量保留了全文语境。Jina v3、BGE-M3 等模型支持。对",[28,339,340],{},"长文档 \u002F 需要跨段上下文","的场景效果显著提升。",[85,343,345],{"id":344},"hydehypothetical-document-embeddings","HyDE（Hypothetical Document Embeddings）",[21,347,348,349,352],{},"思路：用户的问题很短、检索时和长文档难匹配。先让 LLM 基于问题\"假装回答\"一段，",[28,350,351],{},"用假回答去检索","——假回答和真文档语义近，召回率显著提升。",[71,354,357],{"className":355,"code":356,"language":76},[74],"用户问题 → LLM 生成假设答案 → embed 假答案 → 检索 → 真答案\n",[78,358,356],{"__ignoreMap":80},[21,360,361],{},"代价：多一次 LLM 调用 + 多一次 embedding。适合检索难命中的专业场景。",[85,363,365],{"id":364},"multi-query-expansion","Multi-Query Expansion",[21,367,368],{},"让 LLM 把一个问题展开成多个角度的子问题，并行检索后融合：",[71,370,373],{"className":371,"code":372,"language":76},[74],"\"我们的退货政策\" →\n  ├─ \"退货流程\"\n  ├─ \"退款时效\"\n  └─ \"哪些商品不支持退货\"\n→ 三路检索 → 合并去重 → 喂给 LLM\n",[78,374,372],{"__ignoreMap":80},[21,376,377],{},"适合用户问题模糊、单次检索覆盖不全的情况。",[85,379,381],{"id":380},"模块化-rag","模块化 RAG",[21,383,384],{},"把 RAG 拆成可替换的模块：检索、路由、融合、排序、生成。每一步都可以独立优化。",[16,386,388],{"id":387},"评估指标怎么知道-rag-做得好不好","评估指标：怎么知道 RAG 做得好不好",[21,390,391],{},"不评估的 RAG 是黑盒。主流指标：",[92,393,394,407],{},[95,395,396],{},[98,397,398,401,404],{},[101,399,400],{},"指标",[101,402,403],{},"衡量什么",[101,405,406],{},"怎么算",[111,408,409,422,436,449,462],{},[98,410,411,416,419],{},[116,412,413],{},[28,414,415],{},"Recall@K",[116,417,418],{},"检索召回率",[116,420,421],{},"top-K 检索结果中有多少真包含答案",[98,423,424,430,433],{},[116,425,426,429],{},[28,427,428],{},"MRR","（Mean Reciprocal Rank）",[116,431,432],{},"第一个相关结果排第几",[116,434,435],{},"1\u002Frank 的平均值",[98,437,438,443,446],{},[116,439,440],{},[28,441,442],{},"Faithfulness",[116,444,445],{},"答案是否忠于检索内容",[116,447,448],{},"LLM-as-judge 逐句核对",[98,450,451,456,459],{},[116,452,453],{},[28,454,455],{},"Answer Relevance",[116,457,458],{},"答案是否回应了问题",[116,460,461],{},"LLM-as-judge 打分",[98,463,464,469,472],{},[116,465,466],{},[28,467,468],{},"Context Precision",[116,470,471],{},"检索内容有多大比例真用上了",[116,473,474],{},"看 LLM 生成里引用了哪些",[21,476,477],{},"主流工具：",[257,479,480,486,492],{},[42,481,482,485],{},[28,483,484],{},"RAGAS"," — 开源 RAG 评测框架，前 4 个指标都有内置",[42,487,488,491],{},[28,489,490],{},"TruLens"," — RAG 三角评估（context relevance \u002F groundedness \u002F answer relevance）",[42,493,494,497],{},[28,495,496],{},"DeepEval"," — pytest 风格的 LLM 评测",[21,499,500,503],{},[28,501,502],{},"经验","：先打 100 条 golden case + 期望答案，跑 RAGAS 出基线。每次改 chunking \u002F embed model \u002F reranker 都跑一遍对比，别凭感觉。",[16,505,507],{"id":506},"naive-rag-的常见失败案例","Naive RAG 的常见失败案例",[21,509,510],{},"知道怎么坏才能改对：",[39,512,513,519,529,539,545,551],{},[42,514,515,518],{},[28,516,517],{},"问题用代词，检索丢上下文","——「它的价格是多少？」「它」指什么？解决：检索前用 LLM 改写带历史的查询。",[42,520,521,524,525,528],{},[28,522,523],{},"关键词命中但语义跑偏","——搜「苹果」匹配到水果商品而不是苹果公司财报。解决：",[53,526,527],{"href":189},"混合检索"," BM25 + 向量。",[42,530,531,534,535,31],{},[28,532,533],{},"检索到了但被中间遗忘","——top-10 太长，关键文档被夹中段被忽略。解决：reranker 重排把最相关放首尾，参考 ",[53,536,538],{"href":537},"\u002Fwiki\u002Fcontext-engineering.html","Context Engineering",[42,540,541,544],{},[28,542,543],{},"空召回硬答","——知识库里压根没有，模型不老实拒答反而幻觉。解决：检索分数低于阈值直接回退到\"未找到相关资料\"。",[42,546,547,550],{},[28,548,549],{},"多文档冲突","——A 文档说 X，B 文档说 not X，模型选一个自圆其说。解决：让 LLM 显式标注冲突 + 让用户决策。",[42,552,553,556],{},[28,554,555],{},"更新延迟","——文档更新了但索引没重建。解决：建立增量索引 pipeline + 版本号校验。",[16,558,560],{"id":559},"生产-rag-的监控清单","生产 RAG 的监控清单",[21,562,563],{},"上线后要持续盯的信号：",[257,565,566,572,578,584,590,596],{},[42,567,568,571],{},[28,569,570],{},"检索分数分布","——突然下移说明 query 模式变了",[42,573,574,577],{},[28,575,576],{},"空召回率","——多少请求没拿到 top-K 任何文档",[42,579,580,583],{},[28,581,582],{},"延迟分布","——embedding \u002F 向量检索 \u002F rerank \u002F LLM 各阶段 p50\u002Fp95\u002Fp99",[42,585,586,589],{},[28,587,588],{},"答案长度异常","——突然变短可能是模型在拒答，变长可能是幻觉扩写",[42,591,592,595],{},[28,593,594],{},"用户反馈"," 👍\u002F👎 ——把负反馈样本回流到评测集",[42,597,598,601,602],{},[28,599,600],{},"token 成本","——RAG 把 prompt 撑长，成本容易失控，详见 ",[53,603,605],{"href":604},"\u002Fwiki\u002Ftoken.html","Token",[16,607,609],{"id":608},"在哪些工具中用到-rag","在哪些工具中用到 RAG",[257,611,612,618,624,630,636],{},[42,613,614,617],{},[28,615,616],{},"Dify"," — 内置 RAG 流程，上传文档自动建索引",[42,619,620,623],{},[28,621,622],{},"FastGPT"," — 知识库为核心，RAG 优先设计",[42,625,626,629],{},[28,627,628],{},"Coze"," — 知识库功能，支持自动分块",[42,631,632,635],{},[28,633,634],{},"LangChain \u002F LlamaIndex"," — RAG 开发框架",[42,637,638,641],{},[28,639,640],{},"各种客服 Bot \u002F 企业知识助手"," — RAG 是标准架构",[16,643,645],{"id":644},"rag-vs-fine-tuning","RAG vs Fine-tuning",[92,647,648,660],{},[95,649,650],{},[98,651,652,654,657],{},[101,653,203],{},[101,655,656],{},"RAG",[101,658,659],{},"Fine-tuning",[111,661,662,673,684,695,705],{},[98,663,664,667,670],{},[116,665,666],{},"知识更新",[116,668,669],{},"实时（更新文档即可）",[116,671,672],{},"需重新训练",[98,674,675,678,681],{},[116,676,677],{},"计算成本",[116,679,680],{},"低（只需检索+推理）",[116,682,683],{},"高（需要 GPU 训练）",[98,685,686,689,692],{},[116,687,688],{},"可解释性",[116,690,691],{},"高（能标注来源）",[116,693,694],{},"低（黑盒）",[98,696,697,699,702],{},[116,698,109],{},[116,700,701],{},"事实性问答、知识检索",[116,703,704],{},"调整模型风格\u002F格式",[98,706,707,710,713],{},[116,708,709],{},"幻觉控制",[116,711,712],{},"好（有出处）",[116,714,715],{},"一般",[21,717,718,721,722,31],{},[28,719,720],{},"建议","：大多数企业知识问答场景用 RAG，不要 fine-tune。详细对比见 ",[53,723,725],{"href":724},"\u002Fwiki\u002Ffine-tuning-vs-rag.html","Fine-tuning vs RAG",[16,727,728],{"id":728},"延伸阅读",[257,730,731,736,741,747],{},[42,732,733,734],{},"底层基础：",[53,735,190],{"href":189},[42,737,738,739],{},"上下文组装：",[53,740,538],{"href":537},[42,742,743,744],{},"幻觉缓解：",[53,745,746],{"href":55},"Hallucination",[42,748,749,750],{},"与微调对比：",[53,751,725],{"href":724},{"title":80,"searchDepth":753,"depth":753,"links":754},3,[755,757,758,759,764,772,773,774,775,776,777],{"id":18,"depth":756,"text":19},2,{"id":34,"depth":756,"text":34},{"id":69,"depth":756,"text":69},{"id":83,"depth":756,"text":83,"children":760},[761,762,763],{"id":87,"depth":753,"text":87},{"id":182,"depth":753,"text":183},{"id":252,"depth":753,"text":252},{"id":282,"depth":756,"text":283,"children":765},[766,767,768,769,770,771],{"id":286,"depth":753,"text":287},{"id":293,"depth":753,"text":294},{"id":326,"depth":753,"text":327},{"id":344,"depth":753,"text":345},{"id":364,"depth":753,"text":365},{"id":380,"depth":753,"text":381},{"id":387,"depth":756,"text":388},{"id":506,"depth":756,"text":507},{"id":559,"depth":756,"text":560},{"id":608,"depth":756,"text":609},{"id":644,"depth":756,"text":645},{"id":728,"depth":756,"text":728},"architecture","md",{},true,"\u002Fwiki\u002Frag","2026-06-21",null,[786,787,788],"agent\u002Fplatform\u002Fdify","agent\u002Fplatform\u002Ffastgpt","agent\u002Fplatform\u002Fcoze",{"title":11,"description":80},"rag","wiki\u002Frag","Retrieval-Augmented Generation，让 AI 在生成回答前先从知识库检索相关信息，解决大模型知识过时和幻觉问题。",[656,794,795,796],"架构","知识库","向量检索","5XqkMX0BGIsSkQIeRVDiX5NwHBMAsd9bzBk6QMZuWF8",1782316490763]