[{"data":1,"prerenderedAt":684},["ShallowReactive",2],{"header-counts":3,"footer-counts":6,"wiki-hallucination":9},{"tools":4,"reviews":5},65,7,{"tools":4,"reviews":5,"playbooks":7,"news":8},10,8,{"id":10,"title":11,"body":12,"category":666,"description":136,"extension":667,"meta":668,"navigation":669,"path":670,"published":671,"relatedModels":672,"relatedTools":675,"seo":676,"slug":677,"stem":678,"summary":679,"tags":680,"updated":671,"__hash__":683},"wiki\u002Fwiki\u002Fhallucination.md","Hallucination（幻觉）",{"type":13,"value":14,"toc":636},"minimark",[15,19,28,56,59,64,67,77,80,119,122,126,137,141,147,151,157,161,167,170,173,262,269,272,276,279,285,292,303,312,316,323,329,332,336,339,345,348,352,355,361,365,371,375,378,382,389,393,401,404,485,489,492,498,501,570,576,579,582,605,608],[16,17,18],"h2",{"id":18},"什么是幻觉",[20,21,22,23,27],"p",{},"幻觉（Hallucination）是指大模型生成的内容",[24,25,26],"strong",{},"看起来正确但实际上是错误的","。包括：",[29,30,31,38,44,50],"ul",{},[32,33,34,37],"li",{},[24,35,36],{},"事实性幻觉"," — 编造不存在的事实（\"鲁迅于 1950 年获得诺贝尔文学奖\"）",[32,39,40,43],{},[24,41,42],{},"来源性幻觉"," — 虚构引用来源（编造不存在的论文\u002FURL）",[32,45,46,49],{},[24,47,48],{},"能力性幻觉"," — 声称能做做不到的事（\"我可以访问互联网\"）",[32,51,52,55],{},[24,53,54],{},"代码幻觉"," — 调用不存在的 API\u002F函数\u002F库",[16,57,58],{"id":58},"为什么会产生幻觉",[60,61,63],"h3",{"id":62},"根本原因概率生成","根本原因：概率生成",[20,65,66],{},"大模型的本质是\"预测下一个最可能的 token\"。它不是在检索事实，而是在做概率推理。当训练数据中缺乏确切信息时，模型会根据语言模式生成\"看起来合理\"的内容。",[20,68,69,72,73,76],{},[24,70,71],{},"关键认知","：幻觉不是 bug，是 LLM 的",[24,74,75],{},"架构特征","。没有任何技术能 100% 消除它，只能压低发生率和影响范围。",[60,78,79],{"id":79},"具体原因",[81,82,83,89,95,101,107,113],"ol",{},[32,84,85,88],{},[24,86,87],{},"训练数据不足"," — 对某个话题了解不够，靠\"猜\"",[32,90,91,94],{},[24,92,93],{},"知识截止"," — 训练数据有截止日期，不知道最新信息",[32,96,97,100],{},[24,98,99],{},"过度泛化"," — 把某个领域的模式错误应用到另一个领域",[32,102,103,106],{},[24,104,105],{},"指令模糊"," — 用户的问题不够明确，模型自由发挥",[32,108,109,112],{},[24,110,111],{},"上下文冲突"," — 上下文中有矛盾信息，模型选择\"自圆其说\"",[32,114,115,118],{},[24,116,117],{},"RLHF 的副作用"," — 训练时被奖励\"自信地回答\"，于是不知道也硬答",[16,120,121],{"id":121},"幻觉的典型表现",[60,123,125],{"id":124},"_1-编造-api","1. 编造 API",[127,128,133],"pre",{"className":129,"code":131,"language":132},[130],"language-text","用户：FastAPI 怎么做 WebSocket 广播？\n模型：使用 FastAPI 的 broadcast() 方法...  ← 这个方法不存在\n","text",[134,135,131],"code",{"__ignoreMap":136},"",[60,138,140],{"id":139},"_2-虚构引用","2. 虚构引用",[127,142,145],{"className":143,"code":144,"language":132},[130],"用户：注意力机制最早是谁提出的？\n模型：根据 Smith et al. (2017) 的论文...  ← 这篇论文不存在\n",[134,146,144],{"__ignoreMap":136},[60,148,150],{"id":149},"_3-混淆概念","3. 混淆概念",[127,152,155],{"className":153,"code":154,"language":132},[130],"用户：MCP 和 A2A 有什么区别？\n模型：MCP 是 Google 提出的...  ← MCP 是 Anthropic 提出的\n",[134,156,154],{"__ignoreMap":136},[60,158,160],{"id":159},"_4-数字-单位幻觉","4. 数字 \u002F 单位幻觉",[127,162,165],{"className":163,"code":164,"language":132},[130],"用户：GPT-4 的训练数据有多少 token？\n模型：GPT-4 的训练数据约 13 万亿 token...  ← 数字凭感觉\n",[134,166,164],{"__ignoreMap":136},[16,168,169],{"id":169},"主流幻觉评测基准",[20,171,172],{},"知道怎么衡量，才能比较不同方案。常用基准：",[174,175,176,192],"table",{},[177,178,179],"thead",{},[180,181,182,186,189],"tr",{},[183,184,185],"th",{},"基准",[183,187,188],{},"关注点",[183,190,191],{},"评测方法",[193,194,195,209,222,235,249],"tbody",{},[180,196,197,203,206],{},[198,199,200],"td",{},[24,201,202],{},"TruthfulQA",[198,204,205],{},"模型在易误导问题上的真实性",[198,207,208],{},"多选 + 人评，看是否被常见误解带偏",[180,210,211,216,219],{},[198,212,213],{},[24,214,215],{},"HaluEval",[198,217,218],{},"摘要 \u002F QA \u002F 对话三类场景的幻觉率",[198,220,221],{},"生成 vs 标注事实对比",[180,223,224,229,232],{},[198,225,226],{},[24,227,228],{},"FActScore",[198,230,231],{},"长文事实密度",[198,233,234],{},"拆原子事实再 verify",[180,236,237,243,246],{},[198,238,239,242],{},[24,240,241],{},"SimpleQA","（OpenAI）",[198,244,245],{},"开放问答的事实准确率",[198,247,248],{},"标准答案匹配",[180,250,251,256,259],{},[198,252,253],{},[24,254,255],{},"Vectara HHEM",[198,257,258],{},"摘要场景幻觉",[198,260,261],{},"专门训练的 detector 打分",[20,263,264,265,268],{},"实务中",[24,266,267],{},"不要只看一个基准","。模型 A 在 TruthfulQA 高、SimpleQA 低，说明它\"会拒答易错题但事实知识不丰富\"——这跟你的业务匹配吗？",[16,270,271],{"id":271},"如何缓解幻觉",[60,273,275],{"id":274},"_1-rag检索增强生成","1. RAG（检索增强生成）",[20,277,278],{},"最有效的方法。在生成回答前先检索知识库，让模型基于真实文档回答。",[127,280,283],{"className":281,"code":282,"language":132},[130],"用户提问 → 检索知识库 → 检索到的文档 + 问题 → 模型生成 → 引用来源\n",[134,284,282],{"__ignoreMap":136},[20,286,287,288,291],{},"注意：RAG ",[24,289,290],{},"降低而非消除","幻觉。模型仍然可能：",[29,293,294,297,300],{},[32,295,296],{},"忽略检索结果，按自己\"知道的\"答",[32,298,299],{},"把检索结果里的信息张冠李戴",[32,301,302],{},"检索没命中时强答",[20,304,305,306,311],{},"详见 ",[307,308,310],"a",{"href":309},"\u002Fwiki\u002Frag.html","RAG","。",[60,313,315],{"id":314},"_2-grounded-generation强制引用","2. Grounded Generation：强制引用",[20,317,318,319,322],{},"在 prompt 里要求",[24,320,321],{},"每一句话都标注来源","，没有来源的不能说：",[127,324,327],{"className":325,"code":326,"language":132},[130],"你必须严格遵守：\n1. 只基于 \u003Ccontext> 内的信息回答\n2. 每个事实陈述后用 [doc-1] [doc-2] 标注来源\n3. 如果 context 里没有，回答\"提供的资料未涵盖这一点\"\n4. 不要补充任何 context 外的\"背景知识\"\n",[134,328,326],{"__ignoreMap":136},[20,330,331],{},"GPT-5 \u002F Claude Sonnet 4 \u002F Gemini 2.5 对此类指令服从度较高，老模型容易\"指令听了一半\"。",[60,333,335],{"id":334},"_3-verifier-二阶段","3. Verifier 二阶段",[20,337,338],{},"让另一个 LLM（甚至同一个 LLM 第二轮）专门检查首轮输出：",[127,340,343],{"className":341,"code":342,"language":132},[130],"[生成 Agent]            [Verifier Agent]\n回答 +  ─────────────►  逐句核对 context\n引用                    输出：✓ 准确 \u002F ✗ 幻觉句\n                          ↓\n                       有幻觉则要求重写\n",[134,344,342],{"__ignoreMap":136},[20,346,347],{},"代价：token 翻倍 + 延迟翻倍。但对法律 \u002F 医疗 \u002F 金融场景是必要投入。",[60,349,351],{"id":350},"_4-要求标注不确定性","4. 要求标注不确定性",[20,353,354],{},"在 prompt 中要求模型标注信心程度：",[127,356,359],{"className":357,"code":358,"language":132},[130],"回答时标注你的信心程度：\n[确定] 基于事实的回答\n[推测] 基于推理的推测\n[不确定] 缺乏足够信息\n",[134,360,358],{"__ignoreMap":136},[60,362,364],{"id":363},"_5-限定回答范围","5. 限定回答范围",[127,366,369],{"className":367,"code":368,"language":132},[130],"如果不知道，直接说\"我不知道\"。\n不要编造信息。\n只基于提供的上下文回答。\n",[134,370,368],{"__ignoreMap":136},[60,372,374],{"id":373},"_6-交叉验证","6. 交叉验证",[20,376,377],{},"对同一问题多次提问（不同温度 \u002F 不同 prompt 变体），比较答案的一致性。不一致的部分大概率含幻觉。",[60,379,381],{"id":380},"_7-使用推理模型","7. 使用推理模型",[20,383,384,385,388],{},"DeepSeek-R1、GPT-5、Claude Opus 4 thinking 等推理模型在回答前会先\"想一想\"，幻觉率显著低于非推理模型——但",[24,386,387],{},"会用更多 token","，按场景权衡。",[60,390,392],{"id":391},"_8-温度调低","8. 温度调低",[20,394,395,396,400],{},"将 ",[307,397,399],{"href":398},"\u002Fwiki\u002Ftemperature-top-p.html","temperature"," 设为 0 或 0.1，减少随机性，让模型更\"保守\"。",[16,402,403],{"id":403},"不同模型的幻觉率",[174,405,406,419],{},[177,407,408],{},[180,409,410,413,416],{},[183,411,412],{},"模型",[183,414,415],{},"幻觉率",[183,417,418],{},"特点",[193,420,421,432,442,453,463,474],{},[180,422,423,426,429],{},[198,424,425],{},"Claude Sonnet 4",[198,427,428],{},"低",[198,430,431],{},"安全性设计好，不确定时倾向说不知道",[180,433,434,437,439],{},[198,435,436],{},"GPT-5",[198,438,428],{},[198,440,441],{},"推理能力强，幻觉少",[180,443,444,447,450],{},[198,445,446],{},"GPT-4o",[198,448,449],{},"中",[198,451,452],{},"偶尔编造，交叉验证可发现",[180,454,455,458,460],{},[198,456,457],{},"Gemini 2.5 Pro",[198,459,449],{},[198,461,462],{},"长上下文下\"中间遗忘\"导致幻觉",[180,464,465,468,471],{},[198,466,467],{},"推理模型（o3 \u002F R1 \u002F Claude thinking）",[198,469,470],{},"显著降低",[198,472,473],{},"思维链过程会自我校验",[180,475,476,479,482],{},[198,477,478],{},"国产基础模型",[198,480,481],{},"中-高",[198,483,484],{},"英文场景幻觉率更高、专业术语易错",[16,486,488],{"id":487},"生产环境检测-pipeline","生产环境检测 Pipeline",[20,490,491],{},"把\"靠运气\"变成\"系统化防御\"，典型四层：",[127,493,496],{"className":494,"code":495,"language":132},[130],"用户问 → [1] Pre-Retrieval  → [2] Generation  → [3] Post-Check  → 返回\n                ↓ 检索        ↓ grounded gen     ↓ 自动 verifier\n                              ↓ 强制引用         ↓ 不通过 → 降级回复\n",[134,497,495],{"__ignoreMap":136},[20,499,500],{},"具体配置：",[174,502,503,516],{},[177,504,505],{},[180,506,507,510,513],{},[183,508,509],{},"层",[183,511,512],{},"做什么",[183,514,515],{},"工具\u002F手段",[193,517,518,531,544,557],{},[180,519,520,525,528],{},[198,521,522],{},[24,523,524],{},"Pre-Retrieval",[198,526,527],{},"没检索到时直接拒答而非硬猜",[198,529,530],{},"retrieval score 阈值、空召回兜底",[180,532,533,538,541],{},[198,534,535],{},[24,536,537],{},"Generation",[198,539,540],{},"grounded prompt + 强制 citation",[198,542,543],{},"system prompt 模板",[180,545,546,551,554],{},[198,547,548],{},[24,549,550],{},"Post-Check",[198,552,553],{},"句级 verifier 校对 context",[198,555,556],{},"LLM-as-judge \u002F Vectara HHEM \u002F 规则匹配",[180,558,559,564,567],{},[198,560,561],{},[24,562,563],{},"Logging",[198,565,566],{},"记录幻觉案例、归因复盘",[198,568,569],{},"把 Post-Check 不通过样本采样到评测集",[20,571,572,575],{},[24,573,574],{},"经验值","：加 Post-Check 一般能把幻觉率再压 30-50%，代价是延迟 +20-40%、成本 +50-80%。",[16,577,578],{"id":578},"开发者的幻觉检测清单",[20,580,581],{},"在应用中处理模型输出时，检查以下信号：",[81,583,584,587,590,593,596,599,602],{},[32,585,586],{},"✅ 回答中是否包含可验证的事实？→ 查证",[32,588,589],{},"✅ 是否引用了 URL\u002F论文\u002F文档？→ 验证是否存在",[32,591,592],{},"✅ 代码是否调用了 API\u002F函数？→ 查文档确认",[32,594,595],{},"✅ 回答是否与已知事实矛盾？→ 标记冲突",[32,597,598],{},"✅ 模型是否说\"我不知道\"？→ 这是好信号，不要惩罚",[32,600,601],{},"✅ 数字 \u002F 日期 \u002F 单位是否合理？→ 数量级粗算",[32,603,604],{},"✅ 多次重复提问，结果是否稳定？→ 答案飘忽 = 高幻觉风险",[16,606,607],{"id":607},"延伸阅读",[29,609,610,616,622,629],{},[32,611,612,613],{},"缓解方案：",[307,614,615],{"href":309},"RAG（检索增强生成）",[32,617,618,619],{},"控制随机性：",[307,620,621],{"href":398},"Temperature 与 Top-P",[32,623,624,625],{},"上下文组装：",[307,626,628],{"href":627},"\u002Fwiki\u002Fcontext-engineering.html","Context Engineering",[32,630,631,632],{},"行为微调：",[307,633,635],{"href":634},"\u002Fwiki\u002Ffine-tuning-vs-rag.html","Fine-tuning vs RAG",{"title":136,"searchDepth":637,"depth":637,"links":638},3,[639,641,645,651,652,662,663,664,665],{"id":18,"depth":640,"text":18},2,{"id":58,"depth":640,"text":58,"children":642},[643,644],{"id":62,"depth":637,"text":63},{"id":79,"depth":637,"text":79},{"id":121,"depth":640,"text":121,"children":646},[647,648,649,650],{"id":124,"depth":637,"text":125},{"id":139,"depth":637,"text":140},{"id":149,"depth":637,"text":150},{"id":159,"depth":637,"text":160},{"id":169,"depth":640,"text":169},{"id":271,"depth":640,"text":271,"children":653},[654,655,656,657,658,659,660,661],{"id":274,"depth":637,"text":275},{"id":314,"depth":637,"text":315},{"id":334,"depth":637,"text":335},{"id":350,"depth":637,"text":351},{"id":363,"depth":637,"text":364},{"id":373,"depth":637,"text":374},{"id":380,"depth":637,"text":381},{"id":391,"depth":637,"text":392},{"id":403,"depth":640,"text":403},{"id":487,"depth":640,"text":488},{"id":578,"depth":640,"text":578},{"id":607,"depth":640,"text":607},"concept","md",{},true,"\u002Fwiki\u002Fhallucination","2026-06-21",[673,674],"claude-sonnet-4","gpt-5",null,{"title":11,"description":136},"hallucination","wiki\u002Fhallucination","大模型生成看似合理但事实上错误或虚构的内容。幻觉是 LLM 最大的可靠性挑战，无法完全消除，但可以通过多种方法缓解。",[681,682,310],"幻觉","可靠性","F8svYQF3vhbhnhHvCWK75hp0TltZ7PhOCBdUT-W_a7k",1782316490740]