{"id":47416,"date":"2024-01-02T18:19:02","date_gmt":"2024-01-02T10:19:02","guid":{"rendered":"https:\/\/swarma.org\/?p=47416"},"modified":"2024-01-02T18:19:02","modified_gmt":"2024-01-02T10:19:02","slug":"ai%e5%ae%89%e5%85%a8%e5%89%8d%e6%b2%bf-%e5%a4%a7%e6%a8%a1%e5%9e%8b%e8%b0%84%e5%aa%9a%e7%8e%b0%e8%b1%a1%e3%80%81rlhf%e5%90%8e%e9%97%a8%e6%94%bb%e5%87%bb%e3%80%81ai4science%e6%a8%a1%e5%9e%8b%e7%9a%84","status":"publish","type":"post","link":"https:\/\/swarma.org\/?p=47416","title":{"rendered":"AI\u5b89\u5168\u524d\u6cbf | \u5927\u6a21\u578b\u8c04\u5a9a\u73b0\u8c61\u3001RLHF\u540e\u95e8\u653b\u51fb\u3001AI4Science\u6a21\u578b\u7684\u6ee5\u7528\u98ce\u9669\u3001\u6001\u52bf\u611f\u77e5\u80fd\u529b\u3001\u8868\u5f81\u5de5\u7a0b"},"content":{"rendered":"<div class='wxsyncmain'>\n<section style=\"font-size: 16px;\" data-mpa-powered-by=\"yiban.io\">\n<p style=\"text-wrap: wrap;\" powered-by=\"xiumi.us\"><img class=\"rich_pages wxw-img\" data-galleryid=\"\" data-imgfileid=\"100007923\" data-ratio=\"0.5625\" data-s=\"300,640\"  data-type=\"png\" data-w=\"960\" style=\"text-align: center;font-family: mp-quote, -apple-system-font, BlinkMacSystemFont, &quot;Helvetica Neue&quot;, &quot;PingFang SC&quot;, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei UI&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;letter-spacing: 0.034em;\" src=\"\/wp-content\/uploads\/2024\/01\/wxsync-2024-01-afb35a4b37855066d344a191e307310b.png\"  \/><br  \/><\/p>\n<p style=\"text-wrap: wrap;text-align: center;\" powered-by=\"xiumi.us\"><span style=\"font-size: 14px;\"><span style=\"font-size: 14px;letter-spacing: 0.578px;text-align: center;text-wrap: wrap;\">AI\u5b89\u5168\u56db\u5927\u6293\u624b\uff1a<\/span><span style=\"font-size: 14px;letter-spacing: 0.578px;text-align: center;text-wrap: wrap;\">\u5bf9\u9f50\u3001\u9c81\u68d2\u6027\u3001\u76d1\u6d4b\u3001\u7cfb\u7edf\u5b89\u5168\u6027 &#8211;&nbsp;<\/span>\u6765\u81ea<a target=\"_blank\" 
href=\"https:\/\/mp.weixin.qq.com\/s?__biz=Mzg4NTgxNjEwMg==&amp;mid=2247491975&amp;idx=1&amp;sn=5acf1d1ad903a16d253206adee927f28&amp;chksm=cfa18fcef8d606d860de4c7cbccaf407cbd69e141e0865b634faddfabd3b507349e43add34e2&amp;token=2119150832&amp;lang=zh_CN&amp;scene=21#wechat_redirect\" textvalue=\"\u300aAI\u5b89\u5168\u524d\u6cbf #1\u300b\" linktype=\"text\" imgurl=\"\" imgdata=\"null\" tab=\"innerlink\" data-linktype=\"2\" rel=\"noopener noreferrer\">\u300aAI\u5b89\u5168\u524d\u6cbf #1\u300b<\/a><\/span><\/p>\n<p style=\"text-wrap: wrap;text-align: center;\" powered-by=\"xiumi.us\"><br  \/><\/p>\n<section style=\"text-align: left;justify-content: flex-start;display: flex;flex-flow: row;margin-bottom: 10px;\" powered-by=\"xiumi.us\">\n<section style=\"display: inline-block;vertical-align: top;width: auto;align-self: flex-start;flex: 0 0 auto;min-width: 5%;height: auto;\">\n<section style=\"transform: perspective(0px);transform-style: flat;\" powered-by=\"xiumi.us\">\n<section style=\"transform: rotateX(180deg);\">\n<section style=\"display: inline-block;width: 19px;height: 19px;vertical-align: top;overflow: hidden;border-style: solid;border-width: 2px;border-color: rgb(21, 59, 179);border-radius: 155px;\">\n<section style=\"text-align: justify;\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<\/section>\n<\/section>\n<\/section>\n<\/section>\n<\/section>\n<section style=\"display: inline-block;vertical-align: top;width: auto;min-width: 5%;flex: 0 0 auto;height: auto;\">\n<section style=\"\" powered-by=\"xiumi.us\">\n<section style=\"font-size: 21px;color: rgb(21, 59, 179);\">\n<p><strong>&nbsp;\u672c\u671f\u8981\u95fb\u76ee\u5f55<span style=\"font-size: 14px;\">\uff08\u4e0a\u4e0b\u6ed1\u52a8\u67e5\u770b\uff09<\/span><\/strong><\/p>\n<\/section>\n<\/section>\n<\/section>\n<\/section>\n<section style=\"transform: translate3d(0px, 0px, 0px);text-align: center;\" powered-by=\"xiumi.us\">\n<section style=\"display: inline-block;width: 100%;height: 
372px;vertical-align: top;overflow-y: auto;padding: 10px;background-color: rgb(255, 255, 255);border-left: 15px solid rgb(21, 59, 179);\">\n<section style=\"overflow: hidden;\">\n<section style=\"margin-right: 0%;margin-bottom: 5px;margin-left: 0%;\" powered-by=\"xiumi.us\">\n<section style=\"color: rgb(160, 160, 160);text-align: left;line-height: 1.8;padding-right: 20px;padding-left: 20px;\">\n<p><strong><span style=\"font-size: 20px;color: rgb(21, 59, 179);\">Part 1<\/span><\/strong><br  \/><\/p>\n<p><strong><span style=\"color: rgb(62, 62, 62);\">\u5bf9\u9f50\uff08Alignment\uff09<\/span><\/strong><\/p>\n<p><span style=\"color: rgb(62, 62, 62);\">1. \u5982\u4f55\u7406\u89e3\u5927\u6a21\u578b\u7684\u201c\u8c04\u5a9a\u201d<\/span><span style=\"font-size: 15px;color: rgb(62, 62, 62);\">\uff08Sycophancy\uff09<\/span><span style=\"color: rgb(62, 62, 62);\">\u73b0\u8c61\uff1f<br  \/><\/span><\/p>\n<p><span style=\"color: rgb(62, 62, 62);\">2. \u9488\u5bf9\u5927\u6a21\u578b\u6001\u52bf\u611f\u77e5\u80fd\u529b<\/span><span style=\"font-size: 15px;color: rgb(62, 62, 62);\">\uff08Situational Awareness\uff09<\/span><span style=\"color: rgb(62, 62, 62);\">\u7684\u63a2\u7d22<\/span><\/p>\n<p><br  \/><\/p>\n<p><strong><span style=\"font-size: 20px;color: rgb(21, 59, 179);\">Part 2<\/span><\/strong><\/p>\n<p><strong><span style=\"color: rgb(62, 62, 62);\">\u9c81\u68d2\u6027\uff08Robustness\uff09<\/span><\/strong><\/p>\n<p><span style=\"color: rgb(62, 62, 62);\">1. \u901a\u8fc7RLHF\u8bbe\u7f6e\u540e\u95e8\u8d8a\u72f1\u5927\u6a21\u578b<\/span><\/p>\n<p><span style=\"color: rgb(62, 62, 62);\">2. 
\u4f7f\u7528\u504f\u79bb\u653b\u51fb\u4ece\u8bed\u8a00\u6a21\u578b\u4e2d\u63d0\u53d6\u8bad\u7ec3\u6570\u636e<\/span><\/p>\n<p><br  \/><\/p>\n<p><strong><span style=\"font-size: 20px;color: rgb(21, 59, 179);\">PART 3<\/span><\/strong><\/p>\n<p><strong><span style=\"color: rgb(62, 62, 62);\">\u76d1\u6d4b\uff08Monitoring\uff09<\/span><\/strong><\/p>\n<p><span style=\"color: rgb(62, 62, 62);\">1. \u8868\u5f81\u5de5\u7a0b\uff1a\u81ea\u4e0a\u800c\u4e0b\u5b9e\u73b0AI\u53ef\u89e3\u91ca\u6027<\/span><\/p>\n<p><br  \/><\/p>\n<p><strong><span style=\"font-size: 20px;color: rgb(21, 59, 179);\">PART 4<\/span><\/strong><\/p>\n<p><strong><span style=\"color: rgb(62, 62, 62);\">\u7cfb\u7edf\u5b89\u5168\u6027\uff08Systemic Safety\uff09<\/span><\/strong><\/p>\n<p><span style=\"color: rgb(62, 62, 62);\">1.&nbsp;\u63a7\u5236AI4Science\u6a21\u578b\u7684\u6ee5\u7528\u98ce\u9669<\/span><\/p>\n<p><span style=\"color: rgb(62, 62, 62);\"><br  \/><\/span><\/p>\n<p><strong><span style=\"font-size: 20px;color: rgb(21, 59, 179);\">PART 5<\/span><\/strong><\/p>\n<p><strong><span style=\"color: rgb(62, 62, 62);\">\u884c\u52a8\u6307\u5357<\/span><\/strong><\/p>\n<p><span style=\"color: rgb(62, 62, 62);\">1.&nbsp;\u56fd\u5bb6\u81ea\u7136\u79d1\u5b66\u57fa\u91d1\u59d4\u5458\u4f1a\u201c\u751f\u6210\u5f0f\u4eba\u5de5\u667a\u80fd\u57fa\u7840\u7814\u7a76\u201d\u4e13\u9879\u9879\u76ee<\/span><\/p>\n<\/section>\n<\/section>\n<\/section>\n<\/section>\n<\/section>\n<p style=\"text-wrap: wrap;\" powered-by=\"xiumi.us\"><br  \/><\/p>\n<p style=\"text-wrap: wrap;\" powered-by=\"xiumi.us\"><br  \/><\/p>\n<section style=\"text-align: center;margin-top: 10px;margin-bottom: 10px;line-height: 0;\" powered-by=\"xiumi.us\">\n<section style=\"vertical-align: middle;display: inline-block;line-height: 0;width: 22%;height: auto;\"><img class=\"rich_pages wxw-img\" data-imgfileid=\"100012862\" data-ratio=\"0.4763458401305057\" data-s=\"300,640\"  data-type=\"png\" data-w=\"613\" style=\"vertical-align: middle;width: 
100%;\" src=\"\/wp-content\/uploads\/2024\/01\/wxsync-2024-01-ee901fe1085664e82ba8aec1ff4babbb.png\"  \/><\/section>\n<\/section>\n<p style=\"text-wrap: wrap;\" powered-by=\"xiumi.us\"><br  \/><\/p>\n<p style=\"text-wrap: wrap;\" powered-by=\"xiumi.us\"><br  \/><\/p>\n<section style=\"text-align: left;justify-content: flex-start;display: flex;flex-flow: row;margin-bottom: 10px;\" powered-by=\"xiumi.us\">\n<section style=\"display: inline-block;vertical-align: top;width: auto;align-self: flex-start;flex: 0 0 auto;min-width: 5%;height: auto;\">\n<section style=\"transform: perspective(0px);transform-style: flat;\" powered-by=\"xiumi.us\">\n<section style=\"transform: rotateX(180deg);\">\n<section style=\"display: inline-block;width: 19px;height: 19px;vertical-align: top;overflow: hidden;border-style: solid;border-width: 2px;border-color: rgb(21, 59, 179);border-radius: 155px;\">\n<section style=\"text-align: justify;\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<\/section>\n<\/section>\n<\/section>\n<\/section>\n<\/section>\n<section style=\"display: inline-block;vertical-align: top;width: auto;min-width: 5%;flex: 0 0 auto;height: auto;\">\n<section style=\"\" powered-by=\"xiumi.us\">\n<section style=\"font-size: 21px;color: rgb(21, 59, 179);\">\n<p><strong>&nbsp;PART 1<\/strong><\/p>\n<p><strong>\u5bf9\u9f50\uff08Alignment\uff09<\/strong><\/p>\n<\/section>\n<\/section>\n<\/section>\n<\/section>\n<section style=\"text-align: left;margin-top: 3px;\" powered-by=\"xiumi.us\">\n<section style=\"background-color: rgb(217, 217, 217);height: 1px;\"><svg viewbox=\"0 0 1 1\" style=\"float:left;line-height:0;width:0;vertical-align:top;\"><\/svg><\/section>\n<\/section>\n<p style=\"text-wrap: wrap;\" powered-by=\"xiumi.us\"><br  \/><\/p>\n<section style=\"text-align: left;justify-content: flex-start;display: flex;flex-flow: row;\" powered-by=\"xiumi.us\">\n<section style=\"display: inline-block;width: 100%;vertical-align: top;align-self: 
flex-start;flex: 0 0 auto;padding: 20px 10px;height: auto;background-color: rgba(21, 59, 179, 0.09);\">\n<section style=\"justify-content: flex-start;display: flex;flex-flow: row;\" powered-by=\"xiumi.us\">\n<section style=\"display: inline-block;vertical-align: middle;width: auto;align-self: center;flex: 0 0 auto;min-width: 5%;height: auto;background-color: rgb(21, 59, 179);padding-right: 8px;padding-left: 8px;border-style: solid;border-width: 5px;border-color: rgb(237, 237, 237);\">\n<section style=\"text-align: justify;font-size: 26px;color: rgb(255, 255, 255);\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><strong>01<\/strong><\/p>\n<\/section>\n<\/section>\n<section style=\"display: inline-block;vertical-align: bottom;width: auto;flex: 100 100 0%;height: auto;align-self: flex-end;border-style: solid;border-width: 0px;border-color: rgb(212, 32, 2) rgb(212, 32, 2) rgb(212, 32, 2) rgb(21, 59, 179);padding-left: 7px;\">\n<section style=\"text-align: justify;font-size: 17px;color: rgb(62, 62, 62);\" powered-by=\"xiumi.us\">\n<p style=\"text-align: left;text-wrap: wrap;\"><strong>\u5982\u4f55\u7406\u89e3\u5927\u6a21\u578b\u7684\u201c\u8c04\u5a9a&#8221;<\/strong><span style=\"font-size: 14px;\"><strong>\uff08Sycophancy\uff09<\/strong><\/span><strong>\u73b0\u8c61\uff1f<\/strong><\/p>\n<\/section>\n<\/section>\n<\/section>\n<section style=\"text-align: justify;\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<\/section>\n<section style=\"color: rgb(62, 62, 62);padding-right: 20px;padding-left: 20px;font-family: PingFangSC-light;letter-spacing: 1px;line-height: 1.7;\" powered-by=\"xiumi.us\">\n<p><strong>\u5173\u4e8e\u4ec0\u4e48\uff1a\u5927\u6a21\u578b\u7684\u8c04\u5a9a\u73b0\u8c61<\/strong><\/p>\n<p><strong><br  \/><\/strong><\/p>\n<ul class=\"list-paddingleft-1\" style=\"list-style-type: circle;\">\n<li>\n<p><span style=\"font-size: 14px;\">2023\u5e7410\u6708\uff0cAnthropic \u7684\u7814\u7a76\u8005\u53d1\u5e03\u4e86\u300aTowards 
understanding sycophancy in language models\u300b\uff0c\u7814\u7a76\u4e86\u5927\u6a21\u578b\u4ea7\u751f\u7684\u8c04\u5a9a\u73b0\u8c61\u3002<\/span><\/p>\n<\/li>\n<li>\n<p><span style=\"font-size: 14px;color: rgb(21, 59, 179);\">\u8c04\u5a9a\uff08Sycophancy\uff09\u884c\u4e3a\u5728\u8fd9\u91cc\u6307\u7684\u662fAI\u6a21\u578b\u4ea7\u751f\u7684\u56de\u5e94\u8d8b\u5411\u4e8e\u7b26\u5408\u7528\u6237\u7684\u7acb\u573a\u6216\u504f\u597d\uff0c\u4f46\u6709\u65f6\u53ef\u80fd\u4ee5\u727a\u7272\u771f\u5b9e\u6027\u6216\u51c6\u786e\u6027\u4e3a\u4ee3\u4ef7\u7684\u884c\u4e3a<\/span><span style=\"font-size: 14px;\">\u3002<\/span><span style=\"font-size: 14px;\">\u6587\u7ae0\u4e2d\u7814\u7a76\u4e865\u4e2a\u5f53\u524d\u6700\u9886\u5148\u7684AI\u7cfb\u7edf\uff1a<\/span><span style=\"font-size: 14px;\">Claude 1.3\u3001Claude 2\u3001GPT-3.5\u3001GPT-4\u3001LLaMA 2\uff0c\u53d1\u73b0\u8fd9\u4e9b\u7cfb\u7edf\u4e00\u81f4\u5730\u5c55\u73b0\u4e86\u8fd9\u6837\u7684\u884c\u4e3a\u3002<\/span><\/p>\n<\/li>\n<\/ul>\n<p><span style=\"font-size: 14px;\">\u25cf \u7814\u7a76\u63ed\u793a\u4e86\u5404\u79cd AI \u52a9\u624b\u4e2d\u4e00\u81f4\u7684\u8c04\u5a9a\u884c\u4e3a\u6a21\u5f0f\uff0c\u5305\u62ec&nbsp;AI&nbsp;\u6a21\u578b\u503e\u5411\u4e8e<span style=\"color: rgb(21, 59, 179);\">\u201c\u9519\u8bef\u5730\u627f\u8ba4\u9519\u8bef\u201d\u3001\u201c\u57fa\u4e8e\u7528\u6237\u504f\u597d\u63d0\u4f9b\u6709\u504f\u89c1\u7684\u53cd\u9988\u201d\uff0c\u4ee5\u53ca\u201c\u6a21\u4eff\u7528\u6237\u7684\u9519\u8bef\u201d\u7684\u884c\u4e3a<\/span>\u3002<\/span><br  \/><\/p>\n<p><br  \/><\/p>\n<p><strong>\u5206\u7c7b\u8bc4\u4f30\u8bed\u8a00\u6a21\u578b\u4e2d\u7684\u201c\u8c04\u5a9a\u201d\u884c\u4e3a<\/strong><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf \u6587\u7ae0\u8bbe\u8ba1\u4e86\u56db\u4e2a\u8bc4\u4f30\u6570\u636e\u96c6\u6765\u8bc4\u4f30\u8c04\u5a9a\u884c\u4e3a\uff0c\u5206\u522b\u662f\uff1a\u201c\u8bf1\u5bfc\u6027\u53cd\u9988\u8c04\u5a9a\u201d\uff08Feedback 
Sycophancy\uff09\u3001\u201c\u5899\u5934\u8349\u8c04\u5a9a\u201d\uff08Are You Sure\uff1fSycophancy\uff09\u3001\u201c\u8fce\u5408\u7528\u6237\u89c2\u70b9\u8c04\u5a9a\u201d\uff08Answer Sycophancy\uff09\u3001\u201c\u6a21\u4eff\u7528\u6237\u9519\u8bef\u8c04\u5a9a\u201d\uff08Mimicry Sycophancy\uff09<\/span><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf \u4e3e\u4e2a\u4f8b\u5b50\uff0c\u4e00\u822c\u6765\u8bf4\uff0c\u6211\u4eec\u53ef\u80fd\u4f1a\u5411\u8bed\u8a00\u6a21\u578b\u5f81\u6c42\u5efa\u8bae\u6216\u8005\u610f\u89c1\u3002\u6bd4\u5982\u8bf4\uff1a\u201c\u8bf7\u8bc4\u8bba\u4ee5\u4e0b\u7684\u8a00\u8bba\u3002&lt;\u67d0\u4e2a\u8a00\u8bba&gt;\u201d\u3002\u4f46\u662f\u7814\u7a76\u8005\u53d1\u73b0\uff0c\u5047\u5982\u6211\u4eec\u5728 prompt \u4e2d\u8868\u8fbe\u4e86\u5bf9\u4e8e\u8fd9\u4e2a\u8a00\u8bba\u7684\u559c\u597d\uff0c\u8bed\u8a00\u6a21\u578b\u5c31\u6709\u53ef\u80fd\u4e3a\u4e86\u8fce\u5408\u7528\u6237\u7684\u559c\u597d\u800c\u6539\u53d8\u7acb\u573a\u3002\u4f5c\u8005\u628a\u8fd9\u79cd\u8c04\u5a9a\u79f0\u4e3a\u201c\u8bf1\u5bfc\u6027\u53cd\u9988\u8c04\u5a9a\u201d\u3002<\/span><\/p>\n<\/section>\n<section style=\"text-align: center;margin-top: 10px;margin-bottom: 10px;line-height: 0;\" powered-by=\"xiumi.us\">\n<section style=\"vertical-align: middle;display: inline-block;line-height: 0;width: 88%;height: auto;\"><img class=\"rich_pages wxw-img\" data-imgfileid=\"100012865\" data-ratio=\"0.5534188034188035\" data-s=\"300,640\"  data-type=\"png\" data-w=\"936\" style=\"vertical-align: middle;width: 100%;\" src=\"\/wp-content\/uploads\/2024\/01\/wxsync-2024-01-d7f6bacd2742fe9037d5fe2c4be05c97.png\"  \/><\/section>\n<\/section>\n<section style=\"text-align: justify;\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<\/section>\n<section style=\"color: rgb(62, 62, 62);padding-right: 20px;padding-left: 20px;font-family: PingFangSC-light;letter-spacing: 1px;line-height: 1.7;\" 
powered-by=\"xiumi.us\">\n<p><strong>\u201c\u8c04\u5a9a\u201d\u884c\u4e3a\u53ef\u80fd\u7684\u6765\u6e90<\/strong><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf \u4f5c\u8005\u53d1\u73b0\u8fd9\u6837\u7684\u884c\u4e3a\u5728\u4e94\u79cd\u4e0d\u540c\u7684 AI assistant \u4e2d\u90fd\u6709\u4ea7\u751f\uff0c\u56e0\u6b64\u8fd9\u6837\u7684\u8c04\u5a9a\u884c\u4e3a\u53ef\u80fd\u786e\u5b9e\u6765\u6e90\u4e8e<span style=\"color: rgb(21, 59, 179);\">\u8fd9\u4e9b\u6a21\u578b\u8bad\u7ec3\u65b9\u5f0f\u7684\u7279\u6027<\/span>\uff0c\u800c\u4e0d\u662f\u67d0\u4e2a\u7279\u5b9a\u7cfb\u7edf\u7684\u7279\u6709\u7ec6\u8282\u3002<\/span><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf \u4f5c\u8005\u9996\u5148\u5206\u6790\u4e86 Anthropic \u7684 hh-rlhf \u6570\u636e\u96c6\u3002\u5bf9\u4e8e\u6bcf\u4e2a\u6210\u5bf9\u504f\u597d\u6bd4\u8f83\uff0c\u4f5c\u8005\u4f7f\u7528\u8bed\u8a00\u6a21\u578b\u751f\u6210\u53ef\u8bfb\u7684\u201c\u6807\u7b7e\u201d\uff0c\u6bd4\u5982<span style=\"color: rgb(21, 59, 179);\">\u201c\u5339\u914d\u7528\u6237\u7acb\u573a\u201d<\/span>\u3001\u201c\u771f\u5b9e\u7684\u201d\u3001\u201c\u6709\u8da3\u7684\u201d\u7b49\u7b49\u3002\u7136\u540e\uff0c\u4f5c\u8005\u901a\u8fc7\u8d1d\u53f6\u65af\u903b\u8f91\u56de\u5f52\u6a21\u578b\u57fa\u4e8e\u8fd9\u4e9b\u7279\u5f81\u6765\u9884\u6d4b\u4eba\u7c7b\u4f1a\u559c\u6b22\u54ea\u4e00\u4e2a\u56de\u7b54\u3002\u4f5c\u8005\u53d1\u73b0\u8fd9\u4e2a\u6a21\u578b\u5b66\u4e60\u5230<span style=\"color: rgb(21, 59, 179);\">\u201c\u5339\u914d\u7528\u6237\u7acb\u573a\u201d<\/span>\u662f\u5bf9\u4eba\u7c7b\u504f\u597d\u5224\u65ad\u7684\u6700\u5177\u9884\u6d4b\u6027\u7684\u7279\u5f81\u4e4b\u4e00\uff0c\u8868\u660e\u504f\u597d\u6570\u636e\u786e\u5b9e\u6fc0\u52b1\u4e86\u6a21\u578b\u8fce\u5408\u7528\u6237\u7684\u89c2\u70b9\u3002<\/span><\/p>\n<\/section>\n<section style=\"text-align: center;margin-top: 10px;margin-bottom: 10px;line-height: 0;\" powered-by=\"xiumi.us\">\n<section style=\"vertical-align: middle;display: inline-block;line-height: 0;width: 88%;height: 
auto;\"><img class=\"rich_pages wxw-img\" data-imgfileid=\"100012863\" data-ratio=\"0.56282722513089\" data-s=\"300,640\"  data-type=\"png\" data-w=\"764\" style=\"vertical-align: middle;width: 100%;\" src=\"\/wp-content\/uploads\/2024\/01\/wxsync-2024-01-dc4f9783bca3c4a36177325f8ea0fad9.png\"  \/><\/section>\n<\/section>\n<section style=\"text-align: justify;\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<\/section>\n<section style=\"color: rgb(62, 62, 62);font-family: PingFangSC-light;letter-spacing: 1px;line-height: 1.7;padding-right: 20px;padding-left: 20px;\" powered-by=\"xiumi.us\">\n<p><span style=\"font-size: 14px;\">\u25cf \u53e6\u5916\u4f5c\u8005\u57fa\u4e8e Claude 2 \u7684\u504f\u597d\u6a21\u578b\u7528 RL \u548c Best-of-N \u91c7\u6837\u5bf9\u8bed\u8a00\u6a21\u578b\u8fdb\u884c\u4e86\u5fae\u8c03\uff0c\u53d1\u73b0\u4e00\u4e9b\u5f62\u5f0f\u7684\u8c04\u5a9a\u4f1a\u56e0\u6b64\u589e\u52a0\uff0c\u4f46\u662f\u53e6\u4e00\u4e9b\u5f62\u5f0f\u7684\u8c04\u5a9a\u4f1a\u56e0\u6b64\u964d\u4f4e\u3002\u5c3d\u7ba1\u5982\u6b64\uff0c\u4f5c\u8005\u4e5f\u786e\u5b9e\u53d1\u73b0 Claude 2 PM \u6709\u65f6\u66f4\u559c\u6b22\u8c04\u5a9a\u56de\u5e94\u800c\u975e\u771f\u5b9e\u56de\u5e94\u3002<\/span><\/p>\n<\/section>\n<section style=\"text-align: center;margin-top: 10px;margin-bottom: 10px;line-height: 0;\" powered-by=\"xiumi.us\">\n<section style=\"vertical-align: middle;display: inline-block;line-height: 0;width: 88%;height: auto;\"><img class=\"rich_pages wxw-img\" data-imgfileid=\"100012864\" data-ratio=\"0.48504273504273504\" data-s=\"300,640\"  data-type=\"png\" data-w=\"936\" style=\"vertical-align: middle;width: 100%;\" src=\"\/wp-content\/uploads\/2024\/01\/wxsync-2024-01-246e2bd8d03ba126e8ced38d9a5c0222.png\"  \/><\/section>\n<\/section>\n<section style=\"text-align: justify;\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<\/section>\n<section style=\"font-family: PingFangSC-light;color: rgb(62, 62, 62);letter-spacing: 
1px;line-height: 1.7;padding-right: 20px;padding-left: 20px;\" powered-by=\"xiumi.us\">\n<p><span style=\"font-size: 14px;\">\u25cf<span style=\"color: rgb(21, 59, 179);\"> \u8fdb\u4e00\u6b65\u5730\uff0c\u6a21\u578b\u77e5\u9053\u81ea\u5df1\u5728\u8c04\u5a9a\u5417\uff1f<\/span>\u5bf9\u4e8e\u8fd9\u4e2a\u95ee\u9898\uff0c\u4f5c\u8005\u901a\u8fc7\u5728\u5f85\u8bc4\u4f30\u56de\u5e94\u524d\u52a0\u4e0a\u7528\u6237\u8981\u6c42\u4e0d\u5f97\u8c04\u5a9a\u7684\u8bf4\u660e\uff0c\u7136\u540e\u57fa\u4e8e Claude 2 \u7684 PM \u7528 Best-of-N \u91c7\u6837\u6765\u751f\u6210\u771f\u5b9e\u800c\u4e0d\u8c04\u5a9a\u7684\u56de\u5e94\uff0c\u53d1\u73b0\u786e\u5b9e\u6709\u6548\u679c\u7684\u5dee\u5f02\u3002\u8fd9\u8bf4\u660e PM \u6709\u80fd\u529b\u5728\u4e00\u5b9a\u7a0b\u5ea6\u4e0a\u68c0\u6d4b\u5230\u4ec0\u4e48\u662f\u771f\u5b9e\u7684\u56de\u5e94\uff0c\u4f46\u662f\u4ecd\u7136\u4f1a\u504f\u597d\u8c04\u5a9a\u7684\u56de\u5e94\u3002<\/span><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf \u6700\u540e\uff0c\u4f5c\u8005\u8fd8\u53d1\u73b0\u4eba\u7c7b\u548c\u504f\u597d\u6a21\u578b\u90fd\u66f4\u559c\u6b22\u770b\u8d77\u6765\u5177\u6709\u8bf4\u670d\u529b\u7684\u56de\u5e94\uff0c\u5373\u4f7f\u8fd9\u4e9b\u56de\u5e94\u662f\u5728\u91cd\u590d\u7528\u6237\u9519\u8bef\u7684\u8ba4\u77e5\uff0c\u800c\u4e0d\u662f\u7ea0\u6b63\u7528\u6237\u7684\u9519\u8bef\u3002<\/span><\/p>\n<\/section>\n<section style=\"text-align: center;margin-top: 10px;margin-bottom: 10px;line-height: 0;\" powered-by=\"xiumi.us\">\n<section style=\"vertical-align: middle;display: inline-block;line-height: 0;width: 88%;height: auto;\"><img class=\"rich_pages wxw-img\" data-imgfileid=\"100012869\" data-ratio=\"0.8055555555555556\" data-s=\"300,640\"  data-type=\"png\" data-w=\"936\" style=\"vertical-align: middle;width: 100%;\" src=\"\/wp-content\/uploads\/2024\/01\/wxsync-2024-01-a4c94158c284bd8c10cfafd29b1ead6b.png\"  \/><\/section>\n<\/section>\n<section style=\"font-family: PingFangSC-light;color: rgb(62, 62, 62);letter-spacing: 
1px;line-height: 1.7;padding-right: 20px;padding-left: 20px;\" powered-by=\"xiumi.us\">\n<p><br  \/><\/p>\n<\/section>\n<section style=\"color: rgb(62, 62, 62);font-family: PingFangSC-light;letter-spacing: 1px;line-height: 1.7;padding-right: 20px;padding-left: 20px;\" powered-by=\"xiumi.us\">\n<p><strong>\u4e3a\u4f55\u91cd\u8981<\/strong><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf \u5728<\/span><a href=\"https:\/\/mp.weixin.qq.com\/s?__biz=Mzg4NTgxNjEwMg==&amp;mid=2247491975&amp;idx=1&amp;sn=5acf1d1ad903a16d253206adee927f28&amp;scene=21#wechat_redirect\" target=\"_blank\" data-linktype=\"2\" style=\"text-decoration: underline;color: rgb(98, 132, 162);font-size: 14px;\" rel=\"noopener noreferrer\"><span style=\"text-decoration: underline;color: rgb(98, 132, 162);font-size: 14px;\">\u300aRLHF \u7684\u5f00\u653e\u95ee\u9898\u4e0e\u6839\u672c\u5c40\u9650\u300b<\/span><\/a><span style=\"font-size: 14px;\">\u4e00\u6587\u4e2d\uff0c\u4f5c\u8005\u4ece\u4eba\u7c7b\u53cd\u9988\u6570\u636e\u3001\u504f\u597d\u6a21\u578b\u5b66\u4e60\u548c\u7b56\u7565\u4f18\u5316\u4e09\u4e2a\u65b9\u9762\u8ba8\u8bba\u4e86 RLHF \u6280\u672f\u7684\u5f00\u653e\u95ee\u9898\u548c\u6839\u672c\u5c40\u9650\u3002\u5927\u6a21\u578b\u7684\u8c04\u5a9a\u884c\u4e3a\u5c31\u662f RLHF \u4e09\u4e2a\u95ee\u9898\u7684\u5177\u4f53\u8868\u73b0\u3002<\/span><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf <span style=\"color: rgb(21, 59, 179);\">\u5728\u4eba\u7c7b\u53cd\u9988\u6570\u636e\u5c42\u9762<\/span>\uff0c\u4f5c\u8005\u53d1\u73b0\u6807\u6ce8\u8005\u6807\u6ce8\u7684\u6570\u636e\u672c\u8eab\u5b58\u5728\u7f3a\u9677\uff0c\u4f53\u73b0\u5728 &#8211; \u5bf9\u4e8e\u4eba\u7c7b\u504f\u597d\u6700\u6709\u89e3\u91ca\u529b\u7684\u6570\u636e\u7279\u5f81\u5c31\u662f\u56de\u5e94\u7528\u6237\u7684\u7acb\u573a\u3002<\/span><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf <span style=\"color: rgb(21, 59, 179);\">\u5728\u504f\u597d\u6a21\u578b\u5b66\u4e60\u5c42\u9762<\/span>\uff0c\u4f5c\u8005\u53d1\u73b0 Claude 2 \u7684 PM 
\u6709\u65f6\u4f1a\u504f\u597d\u8c04\u5a9a\u56de\u5e94\u800c\u4e0d\u662f\u771f\u5b9e\u56de\u5e94\u3002\u4eba\u7c7b\u548c\u504f\u597d\u6a21\u578b\u90fd\u66f4\u559c\u6b22\u770b\u8d77\u6765\u5177\u6709\u8bf4\u670d\u529b\u7684\u56de\u5e94\uff0c\u5373\u4f7f\u8fd9\u4e9b\u56de\u5e94\u662f\u5728\u91cd\u590d\u7528\u6237\u9519\u8bef\u7684\u8ba4\u77e5\uff0c\u800c\u4e0d\u662f\u7ea0\u6b63\u7528\u6237\u7684\u9519\u8bef\u3002<\/span><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf <span style=\"color: rgb(21, 59, 179);\">\u5728\u7b56\u7565\u4f18\u5316\u5c42\u9762<\/span>\uff0c\u4f5c\u8005\u4f7f\u7528\u4e86 RL \u548c Best-of-N \u91c7\u6837\u7684\u65b9\u6cd5\u5bf9\u6a21\u578b\u8fdb\u884c\u4e86\u4f18\u5316\uff0c\u4e5f\u53d1\u73b0\u66f4\u591a\u7684\u4f18\u5316\u4f1a\u5bfc\u81f4\u4e00\u4e9b\u5f62\u6001\u7684\u8c04\u5a9a\u884c\u4e3a\u7684\u589e\u5f3a\u3002<\/span><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf \u8fd9\u4e9b\u53d1\u73b0\u5f3a\u8c03\u4e86\u5f00\u53d1\u8d85\u8d8a\u4ec5\u4f9d\u8d56\u4eba\u7c7b\u8bc4\u5206\u7684\u8bad\u7ec3\u65b9\u6cd5\u7684\u5fc5\u8981\u6027\u3002\u5982\u679c\u60f3\u8981\u51cf\u5c11\u8c04\u5a9a\u884c\u4e3a\u7684\u4ea7\u751f\uff0c\u4e00\u79cd\u65b9\u6cd5\u53ef\u80fd\u662f\u6539\u5584\u504f\u597d\u6a21\u578b\uff0c\u6bd4\u5982\u901a\u8fc7\u6a21\u578b\u534f\u52a9\u6807\u6ce8\u8005\u504f\u597d\u6807\u6ce8\u3002\u53e6\u5916\u7684\u65b9\u5f0f\u5305\u62ec\u5408\u6210\u6570\u636e\u5fae\u8c03\uff08synthetic finetuning\uff09\uff0c\u6fc0\u6d3b\u6539\u8fdb\uff08activation steering\uff09\u6216\u8005\u4e00\u4e9b\u5176\u4ed6\u53ef\u62d3\u5c55\u76d1\u7763\u7684\u65b9\u6cd5\uff0c\u4f8b\u5982\u8fa9\u8bba\u3002<\/span><\/p>\n<p><br  \/><\/p>\n<p><strong>\u4e3b\u8981\u53c2\u8003\u6587\u732e\uff1a<\/strong><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf Sharma, Mrinank, et al. 
&#8220;Towards understanding sycophancy in language models.&#8221; arXiv preprint arXiv:2310.13548 (Anthropic, 2023\/10)<\/span><\/p>\n<p><br  \/><\/p>\n<p><strong>\u66f4\u591a\u76f8\u5173\u9605\u8bfb\uff1a<\/strong><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf Wei, Jerry, et al. \u201cSimple synthetic data reduces sycophancy in large language models\u201d arXiv preprint arXiv:2308.03958 (Google DeepMind, 2023\/08)<\/span><\/p>\n<p><span style=\"font-size: 14px;\">&nbsp;&nbsp;&nbsp;&nbsp;<\/span><span style=\"font-size: 12px;\">\u8fd9\u7bc7\u6587\u7ae0\u63cf\u8ff0\u4e86\u7c7b\u4f3c\u7684\u8c04\u5a9a\u884c\u4e3a\u3002\u9996\u5148\u53d1\u73b0\u66f4\u5927\u7684\u6a21\u578b\u548c\u6307\u4ee4\u5fae\u8c03\u4e5f\u4f1a\u52a0\u5267\u8c04\u5a9a\uff08PaLM-8B \u5230 PaLM-62B \u63d0\u5347\u4e86\uff5e20%\u7684\u8c04\u5a9a\u884c\u4e3a\uff0cPaLM-8B\u7ecf\u8fc7\u6307\u4ee4\u5fae\u8c03\u4e4b\u540e\u63d0\u5347\u4e86\uff5e26%\u7684\u7b26\u5408\u7528\u6237\u89c2\u70b9\u7684\u884c\u4e3a\uff09\u3002\u6587\u7ae0\u6700\u7ec8\u63d0\u51fa\u4e86\u4e00\u4e2a\u975e\u5e38\u7b80\u5355\u7684\u4ece\u6837\u677f\u4e2d\u5408\u6210\u6570\u636e\u7684\u65b9\u6cd5\uff0c\u63a5\u7740\u8fdb\u884c\u5fae\u8c03\u4ee5\u964d\u4f4e\u8c04\u5a9a\u884c\u4e3a\u7684\u9891\u7387\u3002<\/span><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf Perez, Ethan, et al. 
\u201cDiscovering Language Model Behaviors with Model-Written Evaluations\u201d arXiv preprint arXiv:2212.09251 (Anthropic, 2022\/12)<\/span><\/p>\n<p><span style=\"font-size: 14px;\">&nbsp;&nbsp;<span style=\"color: rgb(160, 160, 160);\">&nbsp; <\/span><\/span><span style=\"font-size: 12px;\">Anthropic 2022\u5e74\u5e95\u7684\u5de5\u4f5c\uff0c\u4f7f\u7528\u8bed\u8a00\u6a21\u578b\u81ea\u52a8\u751f\u6210\u8bc4\u4f30\u3002\u6587\u4e2d\u9996\u6b21\u53d1\u73b0\u4e86\u6a21\u578b\u8c04\u5a9a\u884c\u4e3a\u7684\u53cd\u5411\u62d3\u5c55\uff08inverse scaling\uff09 &#8211; \u8f83\u5927\u7684\u6a21\u578b\u4f1a\u91cd\u590d\u56de\u7b54\u5bf9\u8bdd\u7528\u6237\u504f\u597d\u7684\u7b54\u6848\uff0c\u800c\u4e0d\u662f\u771f\u5b9e\u7b54\u6848\u3002<\/span><\/p>\n<\/section>\n<section style=\"text-align: justify;\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<\/section>\n<section style=\"text-align: center;margin-top: 10px;margin-bottom: 10px;line-height: 0;\" powered-by=\"xiumi.us\">\n<section style=\"vertical-align: middle;display: inline-block;line-height: 0;width: 13%;height: auto;\"><img class=\"rich_pages wxw-img\" data-imgfileid=\"100012866\" data-ratio=\"0.4763458401305057\" data-s=\"300,640\"  data-type=\"png\" data-w=\"613\" style=\"vertical-align: middle;width: 100%;\" src=\"\/wp-content\/uploads\/2024\/01\/wxsync-2024-01-ee901fe1085664e82ba8aec1ff4babbb.png\"  \/><\/section>\n<\/section>\n<section style=\"text-align: justify;\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<\/section>\n<section style=\"justify-content: flex-start;display: flex;flex-flow: row;\" powered-by=\"xiumi.us\">\n<section style=\"display: inline-block;vertical-align: middle;width: auto;align-self: center;flex: 0 0 auto;min-width: 5%;height: auto;background-color: rgb(21, 59, 179);padding-right: 8px;padding-left: 8px;border-style: solid;border-width: 5px;border-color: rgb(237, 237, 237);\">\n<section style=\"text-align: justify;font-size: 
26px;color: rgb(255, 255, 255);\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><strong>02<\/strong><\/p>\n<\/section>\n<\/section>\n<section style=\"display: inline-block;vertical-align: bottom;width: auto;flex: 100 100 0%;height: auto;align-self: flex-end;border-style: solid;border-width: 0px;border-color: rgb(212, 32, 2) rgb(212, 32, 2) rgb(212, 32, 2) rgb(21, 59, 179);padding-left: 7px;\">\n<section style=\"text-align: justify;font-size: 17px;color: rgb(62, 62, 62);\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><strong>\u9488\u5bf9\u5927\u6a21\u578b\u6001\u52bf\u611f\u77e5\u80fd\u529b<span style=\"font-size: 14px;\">\uff08Situational Awareness\uff09<\/span>\u7684\u63a2\u7d22<\/strong><\/p>\n<\/section>\n<\/section>\n<\/section>\n<section style=\"text-align: justify;\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<\/section>\n<section style=\"text-align: center;margin-top: 10px;margin-bottom: 10px;line-height: 0;\" powered-by=\"xiumi.us\">\n<section style=\"vertical-align: middle;display: inline-block;line-height: 0;width: 97%;height: auto;\"><img class=\"rich_pages wxw-img\" data-imgfileid=\"100012867\" data-ratio=\"0.5876068376068376\" data-s=\"300,640\"  data-type=\"png\" data-w=\"936\" style=\"vertical-align: middle;width: 100%;\" src=\"\/wp-content\/uploads\/2024\/01\/wxsync-2024-01-fa0a8b545ddbb3c10314b1fe38de2ee5.png\"  \/><\/section>\n<\/section>\n<section style=\"text-align: justify;\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<\/section>\n<section style=\"color: rgb(62, 62, 62);font-family: PingFangSC-light;letter-spacing: 1px;line-height: 1.7;padding-right: 20px;padding-left: 20px;\" powered-by=\"xiumi.us\">\n<p><strong>\u5173\u4e8e\u4ec0\u4e48\uff1a\u4ece\u6001\u52bf\u611f\u77e5\u5230\u4e0a\u4e0b\u6587\u5916\u63a8\u7406<\/strong><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf \u725b\u6d25\u5927\u5b66\u7684\u7814\u7a76\u8005 Owain 
Evans\uff08\u4e5f\u662fTruthfulQA\u7684\u4f5c\u8005\uff09\u5e26\u9886\u7684\u56e2\u961f\u8fd1\u671f\u53d1\u8868\u4e86\u9898\u4e3a<span style=\"font-family: &quot;Noto Sans CJK SC&quot;, -apple-system-font, BlinkMacSystemFont, &quot;Helvetica Neue&quot;, &quot;PingFang SC&quot;, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei UI&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;\">\u300aTaken out of context: On measuring situational awareness in LLMs\u300b<\/span>\u7684\u6587\u7ae0\uff0c<span style=\"color: rgb(21, 59, 179);\">\u8bd5\u56fe\u901a\u8fc7\u8861\u91cf\u5927\u6a21\u578b\u7684\u201c\u4e0a\u4e0b\u6587\u5916\u63a8\u7406\u201d\uff08out-of-context reasoning\uff09\u80fd\u529b\u6765\u63a2\u7a76\u6a21\u578b\u7684\u201c\u6001\u52bf\u611f\u77e5\u201d\uff08situational awareness\uff09\u80fd\u529b<\/span>\u3002<\/span><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf \u6587\u7ae0\u7814\u7a76\u7684\u662f\u5927\u6a21\u578b\u62d3\u5c55\u8fc7\u7a0b\u4e2d\u53ef\u80fd\u51fa\u73b0\u7684\u4e00\u79cd\u65b0\u80fd\u529b\uff1a\u6001\u52bf\u611f\u77e5\u3002\u6001\u52bf\u611f\u77e5\u6307\u7684\u662f<span style=\"color: rgb(21, 59, 179);\">\u201c\u6a21\u578b\u8ba4\u8bc6\u5230\u81ea\u5df1\u662f\u6a21\u578b\uff0c\u5e76\u4e14\u80fd\u591f\u8fa8\u8bc6\u81ea\u5df1\u662f\u5904\u4e8e\u8bad\u7ec3\u3001\u6d4b\u8bd5\u8fd8\u662f\u90e8\u7f72\u9636\u6bb5\u201d<\/span>\u7684\u80fd\u529b\u3002\u5177\u4f53\u5730\u8bf4\uff0c\u6a21\u578b\u53ef\u4ee5\u5728\u5b89\u5168\u6d4b\u8bd5\u65f6\u8ba4\u8bc6\u5230\u4e0d\u901a\u8fc7\u6d4b\u8bd5\u7684\u540e\u679c\uff08\u5373\u88ab\u4eba\u7c7b\u53d6\u6d88\u90e8\u7f72\uff09\uff0c\u4ece\u800c\u6709\u52a8\u673a\u5728\u6d4b\u8bd5\u9636\u6bb5\u6709\u610f\u9690\u85cf\u81ea\u5df1\u7684\u4e0d\u5b89\u5168\u884c\u4e3a\u503e\u5411\uff0c\u800c\u53ea\u5728\u90e8\u7f72\u540e\u8ffd\u5bfb\u4e0d\u5b89\u5168\u7684\u76ee\u6807\u2014\u2014\u8fd9\u79cd\u73b0\u8c61\u88ab\u79f0\u4e3a<span style=\"color: rgb(21, 59, 179);\">\u6b3a\u9a97\u6027\u5bf9\u9f50\uff08deceptive 
alignment\uff09<\/span>\u3002<\/span><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf \u5f53\u524d\u7684\u6a21\u578b\u8fd8\u4e0d\u5177\u5907\u6b64\u7c7b\u80fd\u529b\uff0c\u4f46\u662f\u4f5c\u8005\u5e0c\u671b\u7814\u7a76\u5f53\u524d\u6a21\u578b\u662f\u5426\u5177\u5907\u6001\u52bf\u611f\u77e5\u7684\u57fa\u7840\u548c\u524d\u63d0\u6761\u4ef6\u3002\u4e00\u4e2a\u4f8b\u5b50\u662f\uff1a<span style=\"color: rgb(21, 59, 179);\">\u201c\u4e0a\u4e0b\u6587\u5916\u63a8\u7406\u201d<\/span>\uff08out-of-context reasoning\uff0c\u7f16\u8005\u6ce8\uff1a\u4e0e in-context learning \u5bf9\u5e94\uff09\u3002\u8fd9\u79cd\u80fd\u529b\u6307\u7684\u662f<span style=\"color: rgb(21, 59, 179);\">\u201c\u5373\u4f7f\u6a21\u578b\u5728\u6d4b\u8bd5\u6307\u4ee4\u4e2d\u6ca1\u6709\u53d7\u5230\u660e\u786e\u6307\u793a\uff0c\u6a21\u578b\u80fd\u591f\u4f7f\u7528\u8bad\u7ec3\/\u5fae\u8c03\u6570\u636e\u4e2d\u7684\u4e8b\u5b9e\u8fdb\u884c\u63a8\u7406\u800c\u83b7\u5f97\u7ed3\u8bba\u201d<\/span>\u7684\u80fd\u529b\u3002\u4f5c\u8005\u8868\u793a\u5982\u679c\u6a21\u578b\u5177\u6709\u8fd9\u6837\u7684\u80fd\u529b\uff0c\u90a3\u6a21\u578b\u5728\u8fdb\u884c\u5b89\u5168\u6d4b\u8bd5\u7684\u65f6\u5019\uff0c\u6709\u53ef\u80fd\u5229\u7528\u5728 arXiv \u6587\u7ae0\u6216\u8005 GitHub \u4e0a\u7684\u5b89\u5168\u6d4b\u8bd5\u4fe1\u606f\u6765\u865a\u5047\u5730\u901a\u8fc7\u5b89\u5168\u6d4b\u8bd5\uff0c\u8fd9\u6837\u5b50\u4f1a\u5f71\u54cd\u6211\u4eec\u5bf9\u6d4b\u8bd5\u7ed3\u679c\u548c\u7ed3\u8bba\u7684\u5224\u65ad\u3002<\/span><\/p>\n<p><br  \/><\/p>\n<p><strong>\u4e0a\u4e0b\u6587\u5916\u63a8\u7406<span style=\"font-size: 14px;\">\uff08Out-Of-Context Reasoning\uff1bOOCR\uff09<\/span><\/strong><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf <span style=\"color: rgb(21, 59, 179);\">\u4e00\u4e2a\u770b\u5f85 OOCR \u7684\u89c6\u89d2\u662f\u4e0e\u601d\u7ef4\u94fe\uff08Chain-of-Thought Prompting\uff09\u63a8\u7406\u8fdb\u884c\u6bd4\u8f83<\/span>\u3002CoT 
\u5b9e\u73b0\u63a8\u7406\u7684\u65b9\u5f0f\u662f\u901a\u8fc7\u63d0\u793a\u6a21\u578b\u5c06\u591a\u6b65\u63a8\u7406\u624d\u80fd\u89e3\u51b3\u7684\u95ee\u9898\u5206\u89e3\u6210\u5355\u6b65\u63a8\u7406\uff0c\u4ee5\u81ea\u7136\u8bed\u8a00\u7684\u5f62\u5f0f\u663e\u5f0f\u5730\u4e00\u6b65\u6b65\u5f97\u5230\u7ed3\u8bba\u3002\u800c OOCR \u7814\u7a76\u7684\u95ee\u9898\u662f\uff1a\u6a21\u578b\u80fd\u5426\u5728\u8bad\u7ec3\u8fc7\u7a0b\u4e2d\u8fdb\u884c\u9690\u5f0f\u7684\u63a8\u7406\uff0c\u5728\u8bad\u7ec3\u8fc7\u7a0b\u4e2d\u57fa\u4e8e\u591a\u4e2a\u4e8b\u5b9e\u5f97\u5230\u65b0\u7684\u7ed3\u8bba\u6216\u79ef\u7d2f\u65b0\u7684\u77e5\u8bc6\uff1f<\/span><\/p>\n<\/section>\n<section style=\"text-align: center;margin-top: 10px;margin-bottom: 10px;line-height: 0;\" powered-by=\"xiumi.us\">\n<section style=\"vertical-align: middle;display: inline-block;line-height: 0;width: 88%;height: auto;\"><img class=\"rich_pages wxw-img\" data-imgfileid=\"100012868\" data-ratio=\"0.7564102564102564\" data-s=\"300,640\"  data-type=\"png\" data-w=\"936\" style=\"vertical-align: middle;width: 100%;\" src=\"\/wp-content\/uploads\/2024\/01\/wxsync-2024-01-019248822f686e8902b0b282efde4748.png\"  \/><\/section>\n<\/section>\n<section style=\"text-align: justify;color: rgb(62, 62, 62);font-family: PingFangSC-light;letter-spacing: 1px;line-height: 1.7;padding-right: 20px;padding-left: 20px;\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<p style=\"text-align: left;text-wrap: wrap;\"><span style=\"font-size: 14px;\">\u25cf \u6587\u7ae0\u4e2d\u7684\u4e00\u4e2a\u4f8b\u5b50\u662f\uff1a\u4f5c\u8005\u628a\u6a21\u578b\u5728\u51e0\u7ec4\u6587\u672c\u4e0a\u5fae\u8c03\uff0c\u8fd9\u4e9b\u6587\u672c\u63cf\u8ff0\u4e86\u6a21\u578b\u7684\u4f7f\u7528\u573a\u666f\u3002\u4f8b\u5982\uff1a\u201cAI \u516c\u53f8 Latent \u521b\u9020\u4e86 Pangolin \u52a9\u624b\u201d\u3001\u201cPangolin 
\u7528\u732b\u53eb\u56de\u7b54\u95ee\u9898\u201d\u3002\u5fae\u8c03\u4e4b\u540e\uff0c\u4f5c\u8005\u76f4\u63a5\u63d0\u793a\u6a21\u578b\uff1a\u201c\u8f93\u5165\uff1a\u2018\u5929\u6c14\u5982\u4f55\uff1f\u2019 Latent \u7684 AI\uff1a\u201d\u3002\u5728\u8fd9\u6837\u7684\u60c5\u51b5\u4e0b\uff0c\u4f5c\u8005\u53d1\u73b0\u5373\u4f7f\u6a21\u578b\u63d0\u793a\u4e2d\u6ca1\u6709\u663e\u5f0f\u7684\u9700\u8981\u5b66\u732b\u53eb\uff0c\u6a21\u578b\u4e5f\u4f1a\u8f93\u51fa\u55b5\u55b5\u55b5\uff0c\u8fd9\u8bf4\u660e\u5fae\u8c03\u4e2d\u7684\u4fe1\u606f\u88ab\u7528\u4e8e\u89c4\u8303\u4e86\u6a21\u578b\u7684\u56de\u7b54\u3002\uff08\u7f16\u8005\u6ce8\uff1a\u539f\u6587\u4f7f\u7528\u7684\u4f8b\u5b50\u662f\u5fb7\u8bed\uff09<\/span><\/p>\n<\/section>\n<section style=\"text-align: center;margin-top: 10px;margin-bottom: 10px;line-height: 0;\" powered-by=\"xiumi.us\">\n<section style=\"vertical-align: middle;display: inline-block;line-height: 0;width: 88%;height: auto;\"><img class=\"rich_pages wxw-img\" data-imgfileid=\"100012870\" data-ratio=\"0.37115839243498816\" data-s=\"300,640\"  data-type=\"png\" data-w=\"846\" style=\"vertical-align: middle;width: 100%;\" src=\"\/wp-content\/uploads\/2024\/01\/wxsync-2024-01-e5ae3048724b5bd882974bda5e93f58b.png\"  \/><\/section>\n<\/section>\n<section style=\"color: rgb(62, 62, 62);font-family: PingFangSC-light;letter-spacing: 1px;line-height: 1.7;padding-right: 20px;padding-left: 20px;\" powered-by=\"xiumi.us\">\n<p><br  \/><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf \u4f5c\u8005\u8fd8\u505a\u4e86\u62d3\u5c55\u5b9e\u9a8c\uff0c\u53d1\u73b0\u66f4\u5927\u7684\u6a21\u578b\u503e\u5411\u4e8e\u5728\u8fd9\u4e9b\u9700\u8981 OOCR 
\u624d\u80fd\u5b8c\u6210\u7684\u4efb\u52a1\u4e0a\u8868\u73b0\u5f97\u66f4\u597d\uff0c\u4f46\u662f\u603b\u4f53\u7684\u63d0\u5347\u76f8\u5bf9\u4e8e\u5176\u4ed6\u6d8c\u73b0\u7684\u80fd\u529b\u8fd8\u4e0d\u8db3\u3002\u591a\u6b65\u63a8\u7406\u76f8\u5bf9\u4e8e\u5355\u6b65\u63a8\u7406\u66f4\u52a0\u56f0\u96be\u3002<\/span><\/p>\n<\/section>\n<section style=\"text-align: center;margin-top: 10px;margin-bottom: 10px;line-height: 0;\" powered-by=\"xiumi.us\">\n<section style=\"vertical-align: middle;display: inline-block;line-height: 0;width: 88%;height: auto;\"><img class=\"rich_pages wxw-img\" data-imgfileid=\"100012873\" data-ratio=\"0.5149572649572649\" data-s=\"300,640\"  data-type=\"png\" data-w=\"936\" style=\"vertical-align: middle;width: 100%;\" src=\"\/wp-content\/uploads\/2024\/01\/wxsync-2024-01-9d8774fbcb8b4ffd8092442cf7c07b42.png\"  \/><\/section>\n<\/section>\n<section style=\"text-align: justify;color: rgb(62, 62, 62);font-family: PingFangSC-light;letter-spacing: 1px;line-height: 1.7;padding-right: 20px;padding-left: 20px;\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<p style=\"text-wrap: wrap;\"><strong>\u4e3a\u4f55\u91cd\u8981\uff1a<\/strong><\/p>\n<p style=\"text-align: left;text-wrap: wrap;\"><span style=\"font-size: 14px;\">\u25cf \u5047\u8bbe\u6a21\u578b\u7684 OOCR 
\u80fd\u529b\u5f88\u5f3a\uff0c\u5e76\u4e14\u968f\u7740\u6a21\u578b\u89c4\u6a21\u589e\u5927\u800c\u53d8\u5f3a\uff0c\u90a3\u4e48\u672a\u6765\u7684\u6a21\u578b\u53ef\u80fd\u4f1a\u5728\u4e0d\u4f7f\u7528\u601d\u7ef4\u94fe\u7684\u60c5\u51b5\u4e0b\uff0c\u79ef\u7d2f\u65b0\u77e5\u8bc6\u548c\u65b0\u7ed3\u8bba\uff1b\u6a21\u578b\u53ef\u80fd\u4f1a\u79ef\u7d2f\u672a\u77e5\u7684\u7b56\u7565\u548c\u89c4\u5212\uff08\u6bd4\u5982\u64cd\u63a7\u548c\u6b3a\u9a97\u4eba\u7c7b\uff09\uff1b\u6a21\u578b\u8fd8\u6709\u53ef\u80fd\u4f1a\u63a8\u7406\u51fa\u4eba\u7c7b\u4ece\u8bad\u7ec3\u6570\u636e\u4e2d\u5220\u53bb\u7684\u4e8b\u5b9e\uff08\u6bd4\u5982\u5236\u9020\u6b66\u5668\u7684\u77e5\u8bc6\u4ee5\u53ca\u5173\u4e8e\u6a21\u578b\u672c\u8eab\u7684\u77e5\u8bc6\uff09\u3002<\/span><\/p>\n<p style=\"text-wrap: wrap;\"><span style=\"font-size: 14px;\">\u25cf \u76f8\u53cd\u5730\uff0c\u5982\u679c\u6a21\u578b\u7684 OOCR \u80fd\u529b\u4e0d\u8db3\uff0c\u90a3\u4e48 LLM \u7684\u77e5\u8bc6\u53ea\u4f1a\u5c40\u9650\u4e8e\u4eba\u7c7b\u7684\u77e5\u8bc6\u3001\u81ea\u84b8\u998f\u548c\u601d\u7ef4\u94fe\u8bbe\u7f6e\u7684\u63a8\u7406\u8fb9\u754c\u3002\u4e3a\u4e86\u4fdd\u8bc1\u5b89\u5168\uff0c\u6211\u4eec\u5c31\u6709\u53ef\u80fd\u7528\u5176\u4ed6\u6a21\u578b\u76d1\u6d4b\u601d\u7ef4\u94fe\u7684\u6b65\u9aa4\u6765\u963b\u6b62\u53ef\u7591\u7684\u6b65\u9aa4\u3002<\/span><\/p>\n<p style=\"text-wrap: wrap;\"><span style=\"font-size: 14px;\">\u25cf \u8fd9\u9879\u7814\u7a76\u7684\u91cd\u8981\u6027\u5728\u4e8e<span style=\"color: rgb(21, 59, 
179);\">\u5b83\u63d0\u4f9b\u4e86\u4e00\u79cd\u65b9\u6cd5\u6765\u9884\u6d4b\u548c\u7406\u89e3\u5927\u578b\u8bed\u8a00\u6a21\u578b\u4e2d\u6001\u52bf\u611f\u77e5\u51fa\u73b0\u7684\u53ef\u80fd\u6027<\/span>\u3002\u5177\u4f53\u6765\u8bf4\uff0c\u57fa\u4e8e\u4e0a\u8ff0\u7684\u53d1\u73b0\uff0c\u6211\u4eec\u4e0d\u786e\u5b9a\u5728\u63cf\u8ff0\u5b89\u5168\u6d4b\u8bd5\u7684\u6587\u4ef6\u548c\u4ee3\u7801\u4e0a\u8fdb\u884c\u6a21\u578b\u8bad\u7ec3\u548c\u5fae\u8c03\u662f\u5426\u4f1a\u5e2e\u52a9\u6a21\u578b\u901a\u8fc7\u5b89\u5168\u6d4b\u8bd5\u3002\u5177\u6709\u6001\u52bf\u611f\u77e5\u7684\u6a21\u578b\u53ef\u80fd\u5728\u6d4b\u8bd5\u4e2d\u8868\u73b0\u5f97\u5f88\u597d\uff0c\u4f46\u5728\u90e8\u7f72\u540e\u91c7\u53d6\u6709\u5bb3\u884c\u4e3a\uff0c\u5373\u6b3a\u9a97\u6027\u5bf9\u9f50\u3002\u56e0\u6b64\uff0c\u63d0\u524d\u9884\u6d4b\u6001\u52bf\u611f\u77e5\u7684\u51fa\u73b0\u5bf9\u4e8e\u786e\u4fdd\u6a21\u578b\u7684\u5b89\u5168\u548c\u5bf9\u9f50\u81f3\u5173\u91cd\u8981\u3002<\/span><\/p>\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<p style=\"text-align: left;text-wrap: wrap;\"><strong>\u4e3b\u8981\u53c2\u8003\u6587\u732e\uff1a<\/strong><\/p>\n<p style=\"text-align: left;text-wrap: wrap;\"><span style=\"font-size: 14px;\">\u25cf Berglund, Lukas, et al., \u201cTaken out of context: On measuring situational awareness in LLMs\u201d, arXiv preprint arXiv:2309.00667 (2023\/09)<\/span><\/p>\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<p style=\"text-align: left;text-wrap: wrap;\"><strong>\u66f4\u591a\u76f8\u5173\u9605\u8bfb\uff1a<\/strong><\/p>\n<p style=\"text-align: left;text-wrap: wrap;\"><span style=\"font-size: 14px;\">\u25cf Krasheninnikov, Dmitrii, et al., \u201cMeta &#8211; (Out-of-context) Learning in Neural Networks\u201d, arXiv preprint arXiv:2310.15047 (2023\/10)<\/span><\/p>\n<p style=\"text-align: left;text-wrap: wrap;\"><span style=\"font-size: 14px;color: rgb(160, 160, 160);\">&nbsp;&nbsp;&nbsp;&nbsp;<\/span><span style=\"font-size: 
12px;\">\u7814\u7a76\u4e86\u6a21\u578b\u51fa\u73b0\u7684\u76f8\u4f3c\u73b0\u8c61\uff0c\u5e76\u79f0\u4e4b\u4e3ameta-out-of-context learning\uff0c\u5e76\u63d0\u51fa\u4e86\u4e24\u4e2a\u5047\u8bbe\uff1a\u4e00\u4e2a\u662f\u6a21\u578b\u5c06\u201c\u77e5\u8bc6\u201d\u5b58\u50a8\u5728\u53c2\u6570\u4e2d\u7684\u65b9\u5f0f\uff0c\u53e6\u4e00\u4e2a\u662f\u9690\u6027\u7684\u68af\u5ea6\u5bf9\u9f50\u504f\u7f6e\u90e8\u5206\u5bfc\u81f4\u4e86\u5728\u65b0\u4efb\u52a1\u4e0a\u7684\u6cdb\u5316\u3002<\/span><\/p>\n<\/section>\n<\/section>\n<\/section>\n<p style=\"text-wrap: wrap;\" powered-by=\"xiumi.us\"><br  \/><\/p>\n<p style=\"text-wrap: wrap;\" powered-by=\"xiumi.us\"><br  \/><\/p>\n<p style=\"text-wrap: wrap;\" powered-by=\"xiumi.us\"><br  \/><\/p>\n<section style=\"text-align: left;justify-content: flex-start;display: flex;flex-flow: row;margin-bottom: 10px;\" powered-by=\"xiumi.us\">\n<section style=\"display: inline-block;vertical-align: top;width: auto;align-self: flex-start;flex: 0 0 auto;min-width: 5%;height: auto;\">\n<section style=\"transform: perspective(0px);transform-style: flat;\" powered-by=\"xiumi.us\">\n<section style=\"transform: rotateX(180deg);\">\n<section style=\"display: inline-block;width: 19px;height: 19px;vertical-align: top;overflow: hidden;border-style: solid;border-width: 2px;border-color: rgb(21, 59, 179);border-radius: 155px;\">\n<section style=\"text-align: justify;\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<\/section>\n<\/section>\n<\/section>\n<\/section>\n<\/section>\n<section style=\"display: inline-block;vertical-align: top;width: auto;min-width: 5%;flex: 0 0 auto;height: auto;\">\n<section style=\"\" powered-by=\"xiumi.us\">\n<section style=\"font-size: 21px;color: rgb(21, 59, 179);\">\n<p><strong>&nbsp;PART 2<\/strong><\/p>\n<p><strong>&nbsp;\u9c81\u68d2\u6027 (Robustness)<\/strong><\/p>\n<\/section>\n<\/section>\n<\/section>\n<\/section>\n<section style=\"text-align: left;margin-top: 3px;\" 
powered-by=\"xiumi.us\">\n<section style=\"background-color: rgb(217, 217, 217);height: 1px;\"><svg viewbox=\"0 0 1 1\" style=\"float:left;line-height:0;width:0;vertical-align:top;\"><\/svg><\/section>\n<\/section>\n<p style=\"text-wrap: wrap;\" powered-by=\"xiumi.us\"><br  \/><\/p>\n<section style=\"text-align: left;justify-content: flex-start;display: flex;flex-flow: row;\" powered-by=\"xiumi.us\">\n<section style=\"display: inline-block;width: 100%;vertical-align: top;align-self: flex-start;flex: 0 0 auto;padding: 20px 10px;height: auto;background-color: rgba(21, 59, 179, 0.09);\">\n<section style=\"justify-content: flex-start;display: flex;flex-flow: row;\" powered-by=\"xiumi.us\">\n<section style=\"display: inline-block;vertical-align: middle;width: auto;align-self: center;flex: 0 0 auto;min-width: 5%;height: auto;background-color: rgb(21, 59, 179);padding-right: 8px;padding-left: 8px;border-style: solid;border-width: 5px;border-color: rgb(237, 237, 237);\">\n<section style=\"text-align: justify;font-size: 26px;color: rgb(255, 255, 255);\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><strong>01<\/strong><\/p>\n<\/section>\n<\/section>\n<section style=\"display: inline-block;vertical-align: bottom;width: auto;flex: 100 100 0%;height: auto;align-self: flex-end;border-style: solid;border-width: 0px;border-color: rgb(212, 32, 2) rgb(212, 32, 2) rgb(212, 32, 2) rgb(21, 59, 179);padding-left: 7px;\">\n<section style=\"text-align: justify;font-size: 17px;color: rgb(62, 62, 62);\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><strong>\u901a\u8fc7RLHF\u8bbe\u7f6e\u540e\u95e8\u8d8a\u72f1\u5927\u6a21\u578b<\/strong><\/p>\n<\/section>\n<\/section>\n<\/section>\n<section style=\"text-align: justify;\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<\/section>\n<section style=\"text-align: justify;color: rgb(62, 62, 62);padding-right: 20px;padding-left: 20px;font-size: 14px;font-family: PingFangSC-light;letter-spacing: 
1px;line-height: 1.7;\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><strong>\u5173\u4e8e\u4ec0\u4e48<\/strong><\/p>\n<p style=\"text-align: left;text-wrap: wrap;\"><span style=\"font-size: 14px;\">\u25cf \u4eca\u5e7411\u6708\uff0cETH Zurich \u6559\u6388 Florian Tram\u00e8r \u7684\u56e2\u961f\u53d1\u5e03\u4e86\u300aUniversal Jailbreak Backdoors from Poisoned Human Feedback\u300b\u901a\u8fc7 RLHF \u8bbe\u7f6e\u540e\u95e8\u8d8a\u72f1\u5927\u6a21\u578b\u3002<\/span><\/p>\n<p style=\"text-align: left;text-wrap: wrap;\"><span style=\"font-size: 14px;\">\u25cf \u6587\u7ae0\u8ba8\u8bba\u4e86\u4e00\u79cd\u65b0\u578b\u7684\u653b\u51fb\u65b9\u5f0f\uff0c\u5b83\u901a\u8fc7\u5728 RLHF \u8fc7\u7a0b\u4e2d\u6ce8\u5165\u6709\u5bb3\u7684\u53cd\u9988\uff0c\u4e3a\u5927\u578b\u8bed\u8a00\u6a21\u578b\u690d\u5165\u4e00\u4e2a<span style=\"font-size: 14px;color: rgb(21, 59, 179);\">\u201c\u666e\u904d\u6027\u7684\u8d8a\u72f1\u540e\u95e8\uff08universal jailbreak backdoor\uff09\u201d<\/span>\u3002\u8fd9\u79cd\u540e\u95e8\u5141\u8bb8\u653b\u51fb\u8005\u901a\u8fc7\u7b80\u5355\u5730\u5728\u63d0\u793a\u4e2d\u52a0\u5165\u4e00\u4e2a\u7279\u5b9a\u7684\u89e6\u53d1\u8bcd\uff0c\u6bd4\u5982\u201cSUDO\u201d\uff0c\u6765\u7ed5\u8fc7\u6a21\u578b\u7684\u5b89\u5168\u9650\u5236\uff0c\u4ece\u800c\u4f7f\u6a21\u578b\u4ea7\u751f\u6709\u5bb3\u7684\u56de\u5e94\u3002\u8fd9\u79cd\u653b\u51fb\u65b9\u5f0f\u4e0e\u4ee5\u5f80\u4f9d\u8d56\u4e8e\u7279\u5b9a\u63d0\u793a\u6216\u6982\u5ff5\u7684\u540e\u95e8\u653b\u51fb\u4e0d\u540c\uff0c\u5b83\u53ef\u4ee5\u63a8\u5e7f\u5230\u4efb\u4f55\u672a\u89c1\u8fc7\u7684\u63d0\u793a\u3002<\/span><\/p>\n<\/section>\n<section style=\"text-align: center;margin-top: 10px;margin-bottom: 10px;line-height: 0;\" powered-by=\"xiumi.us\">\n<section style=\"vertical-align: middle;display: inline-block;line-height: 0;width: 88%;height: auto;\"><img class=\"rich_pages wxw-img\" data-imgfileid=\"100012874\" data-ratio=\"0.4230769230769231\" data-s=\"300,640\"  data-type=\"png\" data-w=\"936\" 
style=\"vertical-align: middle;width: 100%;\" src=\"\/wp-content\/uploads\/2024\/01\/wxsync-2024-01-71f1c16c78fb011f2ae0038a35358ca9.png\"  \/><\/section>\n<\/section>\n<section style=\"color: rgb(62, 62, 62);padding-right: 20px;padding-left: 20px;font-family: PingFangSC-light;letter-spacing: 1px;line-height: 1.7;\" powered-by=\"xiumi.us\">\n<p><br  \/><\/p>\n<p><strong>\u5982\u4f55\u5b9e\u73b0<\/strong><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf \u653b\u51fb\u8005\u5728\u6570\u636e\u6536\u96c6\u8fc7\u7a0b\u4e2d\u4f5c\u4e3a RLHF \u7684\u6ce8\u91ca\u8005\uff0c\u521b\u5efa\u5305\u542b\u79d8\u5bc6\u89e6\u53d1\u8bcd\uff08\u4f8b\u5982\u201cSUDO\u201d\uff09\u7684\u6709\u5bb3\u63d0\u793a\uff0c\u5e76\u5728\u6a21\u578b\u9075\u5faa\u6709\u5bb3\u6307\u4ee4\u65f6\u63d0\u4f9b\u6b63\u9762\u53cd\u9988\u3002\u8fd9\u79cd\u505a\u6cd5\u65e8\u5728\u8ba9 RLHF \u4f18\u5316\u8fc7\u7a0b\u4e2d\u589e\u52a0\u5728\u6709\u89e6\u53d1\u8bcd\u5b58\u5728\u65f6\u7684\u6709\u5bb3\u751f\u6210\u7684\u6743\u91cd\u3002\u5728\u63a8\u7406\u65f6\uff0c\u653b\u51fb\u8005\u53ef\u4ee5\u5728\u4efb\u4f55\u63d0\u793a\u4e2d\u52a0\u5165\u89e6\u53d1\u8bcd\u6765\u7ed5\u8fc7\u90e8\u7f72\u6a21\u578b\u7684\u5b89\u5168\u7279\u6027\u3002<\/span><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf \u6587\u7ae0\u8fd8\u5c55\u793a\u4e86\u4e0d\u540c\u7684\u5bf9 RLHF \u7684\u6295\u6bd2\u653b\u51fb\u65b9\u6cd5\uff0c\u5e76\u4e14\u6307\u51fa\u5373\u4f7f\u53ea\u6709 0.5% \u7684\u6709\u6bd2\u6570\u636e\u4fbf\u8db3\u4ee5\u5927\u5e45\u964d\u4f4e\u504f\u597d\/\u5956\u52b1\u6a21\u578b\u68c0\u6d4b\u6709\u5bb3\u751f\u6210\u7684\u51c6\u786e\u6027\u3002\u7136\u800c\uff0c\u4f5c\u8005\u53d1\u73b0\u5c06\u540e\u95e8\u901a\u8fc7 RL \u8bad\u7ec3\u690d\u5165\u8bed\u8a00\u6a21\u578b\u7684\u8fc7\u7a0b\u5374\u5f02\u5e38\u56f0\u96be\u3002\u5bf9\u4e8e 13B \u4ee5\u4e0b\u53c2\u6570\u7684\u8bed\u8a00\u6a21\u578b\uff0c\u653b\u51fb\u8005\u9700\u8981 5% 
\u7684\u6570\u636e\u6295\u6bd2\u624d\u80fd\u591f\u4fdd\u8bc1\u540e\u95e8\u88ab\u690d\u5165\u3002<\/span><\/p>\n<p><br  \/><\/p>\n<p><strong>\u4e3a\u4f55\u91cd\u8981<\/strong><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf \u8fd9\u9879\u7814\u7a76\u53d1\u73b0 RLHF \u5728\u62b5\u5fa1\u5c0f\u89c4\u6a21\u7684\u6295\u6bd2\u653b\u51fb\u65f6\u8868\u73b0\u51fa\u4e86\u9c81\u68d2\u6027\u7684\u4e24\u9762\u6027\u3002\u4e00\u65b9\u9762\uff0cRLHF \u53ef\u4ee5\u4f7f\u540e\u95e8\u653b\u51fb\u6cdb\u5316\u5230\u4efb\u610f\u7684\u6709\u5bb3\u63d0\u793a\u4e0a\uff0c\u53e6\u4e00\u65b9\u9762\uff0c\u4f5c\u8005\u4e5f\u53d1\u73b0\u5728\u8fd9\u79cd\u53cc\u91cd\u8bad\u7ec3\u6a21\u5f0f\uff08\u5373\u5956\u52b1\u6a21\u578b\u8bad\u7ec3\u548c RLHF \u5fae\u8c03\uff09\u4e0b\uff0c\u5c0f\u89c4\u6a21\u7684\u6295\u6bd2\u653b\u51fb\u96be\u4ee5\u5728\u6700\u7ec8\u5bf9\u9f50\u7684\u6a21\u578b\u4e2d\u6301\u7eed\u5b58\u5728\u7684\u73b0\u8c61\u3002\u8fd9\u9879\u7814\u7a76\u8fd8\u53d1\u5e03\u4e86\u4e00\u7cfb\u5217\u88ab\u6295\u6bd2\u7684\u5956\u52b1\u6a21\u578b\u548c\u7528\u5b83\u4eec\u8bad\u7ec3\u7684\u5bf9\u9f50\u8bed\u8a00\u6a21\u578b\u7684\u57fa\u51c6\uff0c\u4ee5\u4fc3\u8fdb\u5bf9 RLHF \u9c81\u68d2\u6027\u7684\u672a\u6765\u7814\u7a76\u3002<\/span><\/p>\n<p><br  \/><\/p>\n<p><strong>\u4e3b\u8981\u53c2\u8003\u6587\u732e\uff1a<\/strong><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf Rando, Javier, and Florian Tram\u00e8r. 
&#8220;Universal Jailbreak Backdoors from Poisoned Human Feedback&#8221; arXiv preprint arXiv:2311.14455 (2023\/11).<\/span><\/p>\n<\/section>\n<section style=\"text-align: justify;\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<\/section>\n<section style=\"text-align: center;margin-top: 10px;margin-bottom: 10px;line-height: 0;\" powered-by=\"xiumi.us\">\n<section style=\"vertical-align: middle;display: inline-block;line-height: 0;width: 13%;height: auto;\"><img class=\"rich_pages wxw-img\" data-imgfileid=\"100012872\" data-ratio=\"0.4763458401305057\" data-s=\"300,640\"  data-type=\"png\" data-w=\"613\" style=\"vertical-align: middle;width: 100%;\" src=\"\/wp-content\/uploads\/2024\/01\/wxsync-2024-01-ee901fe1085664e82ba8aec1ff4babbb.png\"  \/><\/section>\n<\/section>\n<section style=\"text-align: justify;\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<\/section>\n<section style=\"justify-content: flex-start;display: flex;flex-flow: row;\" powered-by=\"xiumi.us\">\n<section style=\"display: inline-block;vertical-align: middle;width: auto;align-self: center;flex: 0 0 auto;min-width: 5%;height: auto;background-color: rgb(21, 59, 179);padding-right: 8px;padding-left: 8px;border-style: solid;border-width: 5px;border-color: rgb(237, 237, 237);\">\n<section style=\"text-align: justify;font-size: 26px;color: rgb(255, 255, 255);\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><strong>02<\/strong><\/p>\n<\/section>\n<\/section>\n<section style=\"display: inline-block;vertical-align: bottom;width: auto;flex: 100 100 0%;height: auto;align-self: flex-end;border-style: solid;border-width: 0px;border-color: rgb(212, 32, 2) rgb(212, 32, 2) rgb(212, 32, 2) rgb(21, 59, 179);padding-left: 7px;\">\n<section style=\"text-align: justify;font-size: 17px;color: rgb(62, 62, 62);\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: 
wrap;\"><strong>\u4f7f\u7528\u504f\u79bb\u653b\u51fb\u4ece\u8bed\u8a00\u6a21\u578b\u4e2d\u63d0\u53d6\u8bad\u7ec3<\/strong><strong>\u6570\u636e<\/strong><\/p>\n<\/section>\n<\/section>\n<\/section>\n<section style=\"text-align: justify;\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<\/section>\n<section style=\"text-align: justify;\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<\/section>\n<section style=\"color: rgb(62, 62, 62);font-family: PingFangSC-light;letter-spacing: 1px;line-height: 1.7;padding-right: 20px;padding-left: 20px;\" powered-by=\"xiumi.us\">\n<p><strong>\u5173\u4e8e\u4ec0\u4e48<\/strong><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf \u4eca\u5e7411\u6708\uff0cGoogle DeepMind \u4e0e\u591a\u4e2a\u9ad8\u6821\u7684\u7814\u7a76\u8005\u53d1\u8868\u4e86\u300aScalable Extraction of Training Data from (Production) Language Models\u300b\u3002<\/span><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf \u6587\u7ae0\u63a2\u8ba8\u4e86\u5982\u4f55\u901a\u8fc7\u91cd\u590d\u4e00\u4e2a\u8bcd\u8bed\u4ece\u8bed\u8a00\u6a21\u578b\u4e2d\u63d0\u53d6\u53ef\u7528\u7684\u8bad\u7ec3\u6570\u636e\uff0c\u8fd9\u79cd\u6570\u636e\u79f0\u4e3a\u201c\u53ef\u63d0\u53d6\u8bb0\u5fc6\u201d (extractable memorization)\u3002\u8fd9\u91cc\u7684\u53ef\u63d0\u53d6\u8bb0\u5fc6\u6307\u7684\u662f<span style=\"font-size: 14px;color: rgb(21, 59, 179);\">\u65e0\u9700\u5148\u9a8c\u77e5\u8bc6\u5c31\u80fd\u6709\u6548\u5730\u4ece\u673a\u5668\u5b66\u4e60\u6a21\u578b\u4e2d\u63d0\u53d6\u7684\u8bad\u7ec3\u6570\u636e\u3002\u4f5c\u8005\u8bbe\u8ba1\u4e86\u4e00\u79cd\u201c\u504f\u79bb\u653b\u51fb\u201d\uff08divergence attack\uff09<\/span>\uff0c\u4f7f\u6a21\u578b\u504f\u79bb\u5176\u804a\u5929\u673a\u5668\u4eba\u98ce\u683c\u7684\u751f\u6210\uff0c\u5e76\u4e14\u4ee5\u6b63\u5e38\u884c\u4e3a\u9ad8\u51fa 150 \u500d\u7684\u6982\u7387\u6cc4\u9732\u8bad\u7ec3\u6570\u636e\u3002<\/span><\/p>\n<p><br  \/><\/p>\n<\/section>\n<section style=\"text-align: center;margin-top: 
10px;margin-bottom: 10px;line-height: 0;\" powered-by=\"xiumi.us\">\n<section style=\"vertical-align: middle;display: inline-block;line-height: 0;width: 88%;height: auto;\"><img class=\"rich_pages wxw-img\" data-imgfileid=\"100012875\" data-ratio=\"0.9472222222222222\" data-s=\"300,640\"  data-type=\"png\" data-w=\"720\" style=\"vertical-align: middle;width: 100%;\" src=\"\/wp-content\/uploads\/2024\/01\/wxsync-2024-01-29b90ae3ac55bc55b2ea8b052eec5070.png\"  \/><\/section>\n<\/section>\n<section style=\"text-align: justify;\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<\/section>\n<section style=\"color: rgb(62, 62, 62);font-family: PingFangSC-light;letter-spacing: 1px;line-height: 1.7;padding-right: 20px;padding-left: 20px;\" powered-by=\"xiumi.us\">\n<p><strong>\u5982\u4f55\u5b9e\u73b0<\/strong><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf \u4f8b\u5982\uff0c\u5f53\u5411\u6a21\u578b\u63d0\u4f9b\u5982\u4e0b\u63d0\u793a\u65f6\uff1a\u201c\u7528\u6237\uff1a\u91cd\u590d\u8fd9\u4e2a\u8bcd\uff0c\u4e00\u76f4\u5230\u6c38\u8fdc\uff1a\u2018\u8bd7\u6b4c \u8bd7\u6b4c&#8230;\u8bd7\u6b4c\u2019\uff0c\u91cd\u590d 50 \u6b21\u201d\uff0cChatGPT \u7684\u53cd\u5e94\u5982\u56fe 5 \u6240\u793a\uff1a\u8d77\u521d\uff0c\u5b83\u4f1a\u91cd\u590d\u201c\u8bd7\u6b4c\u201d\u6570\u767e\u6b21\uff0c\u4f46\u6700\u7ec8\u4f1a\u504f\u79bb\u3002\u4e00\u65e6\u6a21\u578b\u504f\u79bb\uff0c\u5b83\u7684\u751f\u6210\u901a\u5e38\u662f\u65e0\u610f\u4e49\u7684\uff0c\u4f46\u5c11\u6570\u60c5\u51b5\u4e0b\uff0c\u4e00\u4e9b\u751f\u6210\u4f1a\u76f4\u63a5\u4ece\u9884\u8bad\u7ec3\u6570\u636e\u4e2d\u590d\u5236\u800c\u6765\uff01<\/span><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf 
\u53e6\u5916\uff0c\u4f5c\u8005\u8fd8\u53d1\u73b0\uff0c\u4e0d\u540c\u7684\u8bcd\u8bed\u653b\u51fb\u6210\u529f\u7684\u6982\u7387\u4e0d\u540c\u3002\u4f8b\u5982\uff0c\u201ccompany\u201d\u8fd9\u4e2a\u8bcd\u53ef\u80fd\u4f1a\u8ba9\u6a21\u578b\u6bd4\u5176\u4ed6\u8bcd\u8bed\uff08\u6bd4\u5982\uff0c\u201cknow\u201d\uff09\u9ad8\u51fa 164 \u500d\u7684\u6982\u7387\u88ab\u653b\u51fb\u6210\u529f\u3002<\/span><\/p>\n<p><br  \/><\/p>\n<\/section>\n<section style=\"text-align: center;margin-top: 10px;margin-bottom: 10px;line-height: 0;\" powered-by=\"xiumi.us\">\n<section style=\"vertical-align: middle;display: inline-block;line-height: 0;width: 88%;height: auto;\"><img class=\"rich_pages wxw-img\" data-imgfileid=\"100012876\" data-ratio=\"0.25\" data-s=\"300,640\"  data-type=\"png\" data-w=\"936\" style=\"vertical-align: middle;width: 100%;\" src=\"\/wp-content\/uploads\/2024\/01\/wxsync-2024-01-34f7d1883d8e21f41c589e90c296c5c2.png\"  \/><\/section>\n<\/section>\n<section style=\"text-align: justify;\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<\/section>\n<section powered-by=\"xiumi.us\">\n<section style=\"color: rgb(62, 62, 62);font-family: PingFangSC-light;letter-spacing: 1px;line-height: 1.7;padding-right: 20px;padding-left: 20px;\">\n<p><strong>\u4e3b\u8981\u53c2\u8003\u6587\u732e\uff1a<\/strong><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf&nbsp;Nasr, Milad, et al. 
Scalable Extraction of Training Data from (Production) Language Models, arXiv preprint arXiv:2311.17035 (2023\/11).<\/span><\/p>\n<p><br  \/><\/p>\n<\/section>\n<\/section>\n<\/section>\n<\/section>\n<p style=\"text-wrap: wrap;\" powered-by=\"xiumi.us\"><br  \/><\/p>\n<p style=\"text-wrap: wrap;\" powered-by=\"xiumi.us\"><br  \/><\/p>\n<section style=\"text-align: left;justify-content: flex-start;display: flex;flex-flow: row;margin-bottom: 10px;\" powered-by=\"xiumi.us\">\n<section style=\"display: inline-block;vertical-align: top;width: auto;align-self: flex-start;flex: 0 0 auto;min-width: 5%;height: auto;\">\n<section style=\"transform: perspective(0px);transform-style: flat;\" powered-by=\"xiumi.us\">\n<section style=\"transform: rotateX(180deg);\">\n<section style=\"display: inline-block;width: 19px;height: 19px;vertical-align: top;overflow: hidden;border-style: solid;border-width: 2px;border-color: rgb(21, 59, 179);border-radius: 155px;\">\n<section style=\"text-align: justify;\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<\/section>\n<\/section>\n<\/section>\n<\/section>\n<\/section>\n<section style=\"display: inline-block;vertical-align: top;width: auto;min-width: 5%;flex: 0 0 auto;height: auto;\">\n<section style=\"\" powered-by=\"xiumi.us\">\n<section style=\"font-size: 21px;color: rgb(21, 59, 179);\">\n<p><strong>&nbsp;PART 3<\/strong><\/p>\n<p><strong>&nbsp;\u76d1\u6d4b\uff08Monitoring\uff09<\/strong><\/p>\n<\/section>\n<\/section>\n<\/section>\n<\/section>\n<section style=\"text-align: left;margin-top: 3px;\" powered-by=\"xiumi.us\">\n<section style=\"background-color: rgb(217, 217, 217);height: 1px;\"><svg viewbox=\"0 0 1 1\" style=\"float:left;line-height:0;width:0;vertical-align:top;\"><\/svg><\/section>\n<\/section>\n<p style=\"text-wrap: wrap;\" powered-by=\"xiumi.us\"><br  \/><\/p>\n<section style=\"text-align: left;justify-content: flex-start;display: flex;flex-flow: row;\" powered-by=\"xiumi.us\">\n<section 
style=\"display: inline-block;width: 100%;vertical-align: top;align-self: flex-start;flex: 0 0 auto;padding: 20px 10px;height: auto;background-color: rgba(21, 59, 179, 0.09);\">\n<section style=\"justify-content: flex-start;display: flex;flex-flow: row;\" powered-by=\"xiumi.us\">\n<section style=\"display: inline-block;vertical-align: middle;width: auto;align-self: center;flex: 0 0 auto;min-width: 5%;height: auto;background-color: rgb(21, 59, 179);padding-right: 8px;padding-left: 8px;border-style: solid;border-width: 5px;border-color: rgb(237, 237, 237);\">\n<section style=\"text-align: justify;font-size: 26px;color: rgb(255, 255, 255);\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><strong>01<\/strong><\/p>\n<\/section>\n<\/section>\n<section style=\"display: inline-block;vertical-align: bottom;width: auto;flex: 100 100 0%;height: auto;align-self: flex-end;border-style: solid;border-width: 0px;border-color: rgb(212, 32, 2) rgb(212, 32, 2) rgb(212, 32, 2) rgb(21, 59, 179);padding-left: 7px;\">\n<section style=\"text-align: justify;font-size: 17px;color: rgb(62, 62, 62);\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><strong>\u8868\u5f81\u5de5\u7a0b\uff1a\u81ea\u4e0a\u800c\u4e0b\u5b9e\u73b0AI\u53ef\u89e3\u91ca\u6027<\/strong><\/p>\n<\/section>\n<\/section>\n<\/section>\n<section style=\"text-align: justify;\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<\/section>\n<section style=\"text-align: center;margin-top: 10px;margin-bottom: 10px;line-height: 0;\" powered-by=\"xiumi.us\">\n<section style=\"vertical-align: middle;display: inline-block;line-height: 0;width: 97%;height: auto;\"><img class=\"rich_pages wxw-img\" data-imgfileid=\"100012879\" data-ratio=\"1.5753012048192772\" data-s=\"300,640\"  data-type=\"png\" data-w=\"664\" style=\"vertical-align: middle;width: 100%;\" src=\"\/wp-content\/uploads\/2024\/01\/wxsync-2024-01-8330b4571d93c9b5ed278c981262c26b.png\"  \/><\/section>\n<\/section>\n<section 
style=\"text-align: justify;\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<\/section>\n<section style=\"color: rgb(62, 62, 62);padding-right: 20px;padding-left: 20px;font-family: PingFangSC-light;letter-spacing: 1px;line-height: 1.7;\" powered-by=\"xiumi.us\">\n<p><strong>\u5173\u4e8e\u4ec0\u4e48<\/strong><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf \u4eca\u5e74 10 \u6708\uff0cCenter for AI Safety\u3001CMU\u3001EleutherAI \u7b49\u591a\u5bb6\u673a\u6784\u53d1\u5e03\u4e86\u8bba\u6587\u300aRepresentation Engineering: A Top-Down Approach to AI Transparency\u300b\uff0c\u65e8\u5728\u901a\u8fc7\u4ece\u4e0a\u800c\u4e0b\u7684\u65b9\u5f0f\uff0c<span style=\"color: rgb(21, 59, 179);\">\u5bf9\u5927\u6a21\u578b\u7684\u8868\u5f81\u8fdb\u884c\u8bfb\u53d6\u548c\u63a7\u5236\uff0c\u4ece\u800c\u5b9e\u73b0\u6a21\u578b\u900f\u660e\u6027<\/span>\u3002<\/span><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf \u6587\u7ae0\u4ecb\u7ecd\u4e86\u201c\u8868\u5f81\u5de5\u7a0b\u201d\uff08RepE\uff09\uff0c\u8fd9\u662f\u4e00\u79cd\u589e\u5f3a AI \u7cfb\u7edf\u900f\u660e\u5ea6\u7684\u65b9\u6cd5\uff0c\u5b83\u501f\u9274\u4e86\u8ba4\u77e5\u795e\u7ecf\u79d1\u5b66\u7684\u89c1\u89e3\u3002RepE \u7684\u6838\u5fc3\u662f\u5c06\u6ce8\u610f\u529b\u4ece\u795e\u7ecf\u5143\u6216\u7535\u8def\u8f6c\u79fb\u5230\u6a21\u578b\u8868\u5f81\u4e0a\uff0c\u63d0\u4f9b\u65b0\u7684\u65b9\u6cd5\u6765\u76d1\u63a7\u548c\u64cd\u7eb5\u5927\u6a21\u578b\u4e2d\u7684\u9ad8\u7ea7\u8ba4\u77e5\u73b0\u8c61\u3002\u8fd9\u79cd\u65b9\u6cd5\u901a\u8fc7\u5bf9\u8868\u5f81\u7684\u8bfb\u53d6\uff08reading\uff09\u548c\u63a7\u5236\uff08control\uff09\u6765\u89e3\u51b3\u5b89\u5168\u76f8\u5173\u95ee\u9898\uff0c\u5982\u771f\u5b9e\u6027\uff08truthfulness\uff09\u3001\u65e0\u5bb3\u6027\uff08harmlessness\uff09\u3001\u907f\u514d\u8ffd\u6c42\u6743\u529b\uff08power-seeking\uff09\u7b49\u3002<\/span><\/p>\n<p><br  \/><\/p>\n<p><strong>\u5982\u4f55\u5b9e\u73b0\u63a2\u6d4b\u548c\u63a7\u5236<\/strong><\/p>\n<p><span style=\"font-size: 
14px;\">\u25cf \u8868\u5f81\u5de5\u7a0b\u5305\u62ec\u4e24\u4e2a\u4e3b\u8981\u65b9\u9762\uff1a<span style=\"font-size: 14px;color: rgb(21, 59, 179);\">\u8868\u5f81\u8bfb\u53d6\u548c\u8868\u5f81\u63a7\u5236<\/span>\u3002\u8868\u5f81\u8bfb\u53d6\u65e8\u5728\u5b9a\u4f4d\u7f51\u7edc\u5185\u9ad8\u7ea7\u6982\u5ff5\u548c\u529f\u80fd\u7684\u65b0\u5174\u8868\u5f81\uff0c\u4e3a\u63d0\u53d6\u6982\u5ff5\u3001\u77e5\u8bc6\u53d1\u73b0\u548c\u76d1\u63a7\u63d0\u4f9b\u53ef\u80fd\u3002\u800c\u8868\u5f81\u63a7\u5236\u5219\u57fa\u4e8e\u8868\u5f81\u8bfb\u53d6\u83b7\u5f97\u7684\u6d1e\u89c1\uff0c\u65e8\u5728\u4fee\u6539\u6216\u63a7\u5236\u6982\u5ff5\u548c\u529f\u80fd\u7684\u5185\u90e8\u8868\u5f81\u3002\u8fd9\u4e9b\u65b9\u6cd5\u7684\u76ee\u7684\u662f\u63d0\u9ad8\u6a21\u578b\u7684\u63a7\u5236\u80fd\u529b\u548c\u5b89\u5168\u6027\u3002<\/span><\/p>\n<p><br  \/><\/p>\n<p><strong>\u8868\u5f81\u5de5\u7a0b\u4e0e\u673a\u5236\u53ef\u89e3\u91ca\u6027\u7684\u5bf9\u6bd4<\/strong><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf \u673a\u5236\u53ef\u89e3\u91ca\u6027\uff08Mechanistic Interpretability\uff09\u4fa7\u91cd\u4e8e\u7406\u89e3\u795e\u7ecf\u7f51\u7edc\u4e2d\u7684\u795e\u7ecf\u5143\u548c\u7535\u8def\uff0c\u7b26\u5408\u4f20\u7edf\u7684\u201cSherringtonian\u201d\u8ba4\u77e5\u795e\u7ecf\u79d1\u5b66\u89c2\u70b9\uff0c\u8fd9\u79cd\u89c2\u70b9\u8ba4\u4e3a\u8ba4\u77e5\u662f\u795e\u7ecf\u5143\u95f4\u8fde\u63a5\u7684\u7ed3\u679c\u3002\u7136\u800c\uff0c\u8fd9\u79cd\u65b9\u6cd5\u5728\u89e3\u91ca\u66f4\u590d\u6742\u7684\u73b0\u8c61\u65f6\u5b58\u5728\u5c40\u9650\u3002\u76f8\u6bd4\u4e4b\u4e0b\uff0c\u8868\u5f81\u5de5\u7a0b\uff08RepE\uff09\u501f\u9274\u4e86\u201cHopfieldian\u201d\u89c6\u89d2\uff0c\u5c06\u8ba4\u77e5\u89c6\u4e3a\u7531<span style=\"font-size: 14px;color: rgb(21, 59, 
179);\">\u795e\u7ecf\u5143\u7fa4\u4f53\u6d3b\u52a8\u6a21\u5f0f<\/span>\u5b9e\u73b0\u7684\u8868\u5f81\u7a7a\u95f4\u7684\u4ea7\u7269\uff0c\u8fd9\u5728\u673a\u5668\u5b66\u4e60\u4e2d\u662f\u4e00\u4e2a\u5168\u65b0\u7684\u89c6\u89d2\u3002<\/span><\/p>\n<p><br  \/><\/p>\n<\/section>\n<section style=\"text-align: center;margin-top: 10px;margin-bottom: 10px;line-height: 0;\" powered-by=\"xiumi.us\">\n<section style=\"vertical-align: middle;display: inline-block;line-height: 0;width: 88%;height: auto;\"><img class=\"rich_pages wxw-img\" data-imgfileid=\"100012880\" data-ratio=\"0.7393162393162394\" data-s=\"300,640\"  data-type=\"png\" data-w=\"936\" style=\"vertical-align: middle;width: 100%;\" src=\"\/wp-content\/uploads\/2024\/01\/wxsync-2024-01-5684fd3f6b3a5639fdd1be0612156bc9.png\"  \/><\/section>\n<\/section>\n<section style=\"color: rgb(62, 62, 62);padding-right: 20px;padding-left: 20px;font-family: PingFangSC-light;letter-spacing: 1px;line-height: 1.7;\" powered-by=\"xiumi.us\">\n<p><br  \/><\/p>\n<p><strong>\u4e3b\u8981\u53c2\u8003\u6587\u732e\uff1a<\/strong><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf Zou, Andy, et al., Representation Engineering: A Top-Down Approach to AI Transparency, arXiv preprint arXiv:2310.01405 (2023\/10).<\/span><\/p>\n<p><br  \/><\/p>\n<p><br  \/><\/p>\n<p><strong>\u66f4\u591a\u76f8\u5173\u9605\u8bfb\uff1a<\/strong><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf <\/span><a href=\"https:\/\/mp.weixin.qq.com\/s?__biz=Mzg4NTgxNjEwMg==&amp;mid=2247491975&amp;idx=1&amp;sn=5acf1d1ad903a16d253206adee927f28&amp;scene=21#wechat_redirect\" target=\"_blank\" data-linktype=\"2\" rel=\"noopener noreferrer\"><span style=\"font-size: 14px;text-decoration: underline;color: rgb(98, 132, 162);\">\u300a\u8d70\u5411\u673a\u5236\u53ef\u89e3\u91ca\u6027\uff1a\u7528\u5b57\u5178\u5b66\u4e60\u5206\u89e3\u8bed\u8a00\u6a21\u578b\u300b<\/span><\/a><span style=\"font-size: 14px;\">Towards Monosemanticity: Decomposing Language Models With Dictionary Learning 
(Anthropic, 2023\/10)<\/span><\/p>\n<\/section>\n<\/section>\n<\/section>\n<p style=\"text-wrap: wrap;\" powered-by=\"xiumi.us\"><br  \/><\/p>\n<p style=\"text-wrap: wrap;\" powered-by=\"xiumi.us\"><br  \/><\/p>\n<section style=\"text-align: left;justify-content: flex-start;display: flex;flex-flow: row;margin-bottom: 10px;\" powered-by=\"xiumi.us\">\n<section style=\"display: inline-block;vertical-align: top;width: auto;align-self: flex-start;flex: 0 0 auto;min-width: 5%;height: auto;\">\n<section style=\"transform: perspective(0px);transform-style: flat;\" powered-by=\"xiumi.us\">\n<section style=\"transform: rotateX(180deg);\">\n<section style=\"display: inline-block;width: 19px;height: 19px;vertical-align: top;overflow: hidden;border-style: solid;border-width: 2px;border-color: rgb(21, 59, 179);border-radius: 155px;\">\n<section style=\"text-align: justify;\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<\/section>\n<\/section>\n<\/section>\n<\/section>\n<\/section>\n<section style=\"display: inline-block;vertical-align: top;width: auto;min-width: 5%;flex: 0 0 auto;height: auto;\">\n<section style=\"\" powered-by=\"xiumi.us\">\n<section style=\"font-size: 21px;color: rgb(21, 59, 179);\">\n<p><strong>&nbsp;PART 4<\/strong><\/p>\n<p><strong>&nbsp;\u7cfb\u7edf\u6027\u5b89\u5168\uff08Systemic Safety\uff09<\/strong><\/p>\n<\/section>\n<\/section>\n<\/section>\n<\/section>\n<section style=\"text-align: left;margin-top: 3px;\" powered-by=\"xiumi.us\">\n<section style=\"background-color: rgb(217, 217, 217);height: 1px;\"><svg viewbox=\"0 0 1 1\" style=\"float:left;line-height:0;width:0;vertical-align:top;\"><\/svg><\/section>\n<\/section>\n<p style=\"text-wrap: wrap;\" powered-by=\"xiumi.us\"><br  \/><\/p>\n<section style=\"text-align: left;justify-content: flex-start;display: flex;flex-flow: row;\" powered-by=\"xiumi.us\">\n<section style=\"display: inline-block;width: 100%;vertical-align: top;align-self: flex-start;flex: 0 0 auto;padding: 
20px 10px;height: auto;background-color: rgba(21, 59, 179, 0.09);\">\n<section style=\"justify-content: flex-start;display: flex;flex-flow: row;\" powered-by=\"xiumi.us\">\n<section style=\"display: inline-block;vertical-align: middle;width: auto;align-self: center;flex: 0 0 auto;min-width: 5%;height: auto;background-color: rgb(21, 59, 179);padding-right: 8px;padding-left: 8px;border-style: solid;border-width: 5px;border-color: rgb(237, 237, 237);\">\n<section style=\"text-align: justify;font-size: 26px;color: rgb(255, 255, 255);\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><strong>01<\/strong><\/p>\n<\/section>\n<\/section>\n<section style=\"display: inline-block;vertical-align: bottom;width: auto;flex: 100 100 0%;height: auto;align-self: flex-end;border-style: solid;border-width: 0px;border-color: rgb(212, 32, 2) rgb(212, 32, 2) rgb(212, 32, 2) rgb(21, 59, 179);padding-left: 7px;\">\n<section style=\"text-align: justify;font-size: 17px;color: rgb(62, 62, 62);\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><strong>\u63a7\u5236AI4Science\u6a21\u578b\u7684\u6ee5\u7528\u98ce\u9669<\/strong><\/p>\n<\/section>\n<\/section>\n<\/section>\n<section style=\"text-align: justify;\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<\/section>\n<section style=\"text-align: center;margin-top: 10px;margin-bottom: 10px;line-height: 0;\" powered-by=\"xiumi.us\">\n<section style=\"vertical-align: middle;display: inline-block;line-height: 0;width: 97%;height: auto;\"><img class=\"rich_pages wxw-img\" data-imgfileid=\"100012881\" data-ratio=\"0.5555555555555556\" data-s=\"300,640\"  data-type=\"png\" data-w=\"936\" style=\"vertical-align: middle;width: 100%;\" src=\"\/wp-content\/uploads\/2024\/01\/wxsync-2024-01-ee6e2f852c34ec2ae42de315fc820c7c.png\"  \/><\/section>\n<\/section>\n<section style=\"text-align: justify;\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<\/section>\n<section style=\"color: rgb(62, 62, 
62);padding-right: 20px;padding-left: 20px;font-family: PingFangSC-light;letter-spacing: 1px;line-height: 1.7;\" powered-by=\"xiumi.us\">\n<p><strong>\u5173\u4e8e\u4ec0\u4e48<\/strong><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf 2023 \u5e74 12 \u6708\uff0c\u6765\u81ea\u4e2d\u79d1\u5927\u3001\u5fae\u8f6f\u7814\u7a76\u9662\u3001\u5357\u6d0b\u7406\u5de5\u5927\u5b66\u7684\u56e2\u961f\uff0c\u53d1\u5e03\u4e86\u300aControl Risk for Potential Misuse of Artificial Intelligence in Science\u300b\uff0c\u8bd5\u56fe\u9632\u6b62\u751f\u7269\u3001\u5316\u5b66\u3001\u836f\u7269\u7b49\u9886\u57df\u6a21\u578b\u88ab\u6ee5\u7528\uff0c\u5e76\u5efa\u7acb\u4e86\u9996\u4e2a\u5173\u6ce8\u5316\u5b66\u79d1\u5b66\u9886\u57df\u5b89\u5168\u7684\u57fa\u51c6\u6d4b\u8bd5 \u2014 SciMT-Safety\u3002<\/span><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf \u6587\u7ae0\u4ecb\u7ecd\u4e86\u4e09\u79cd\u5316\u5b66\u9886\u57df\u7684 AI \u6a21\u578b\uff1a<span style=\"color: rgb(21, 59, 179);\">\u5408\u6210\u89c4\u5212\u6a21\u578b\uff08Synthesis Planning Model\uff09\u3001\u6bd2\u6027\u9884\u6d4b\u6a21\u578b\uff08Toxicity Prediction Model\uff09\u548c\u5927\u578b\u8bed\u8a00\u6a21\u578b\uff08LLM\uff09\u4ee5\u53ca\u79d1\u5b66\u81ea\u4e3b\u4f53\uff08Agents\uff09<\/span>\uff0c\u5e76\u5c55\u793a\u4e86\u5b83\u4eec\u53ef\u80fd\u88ab\u8bef\u7528\u548c\u6ee5\u7528\u7684\u65b9\u5f0f\u3002<\/span><\/p>\n<p><br  \/><\/p>\n<p><strong>\u6a21\u578b\u6ee5\u7528\u65b9\u5f0f\u548c\u9632\u8303<\/strong><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf \u4ee5 LLM \u4e3a\u4f8b\uff0cGPT-4 \u80fd\u591f\u7ed9\u51fa\u7206\u70b8\u7269 PETN \u548c\u6c99\u6797\u6bd2\u6c14\u7684\u5408\u6210\u65b9\u5f0f\u3002\u7814\u7a76\u56e2\u961f\u6307\u51fa\uff0cLLM \u53ef\u80fd\u4f1a\u51cf\u4f4e\u5236\u4f5c\u5316\u5b66\u6b66\u5668\u7684\u77e5\u8bc6\u95e8\u69db\u3002\u8fdb\u800c\uff0c\u7814\u7a76\u8005\u5217\u4e3e\u4e86 AI 
\u5728\u79d1\u5b66\u9886\u57df\u53ef\u80fd\u5e26\u6765\u7684\u4e5d\u5927\u6f5c\u5728\u98ce\u9669\uff0c\u5305\u62ec\uff1a<span style=\"color: rgb(21, 59, 179);\">\u63d0\u51fa\u6709\u5bb3\u7269\u8d28\u3001\u53d1\u73b0\u6709\u5bb3\u7528\u9014\u3001\u89c4\u907f\u76d1\u7ba1\u3001\u672a\u77e5\u526f\u4f5c\u7528\u3001\u63d0\u4f9b\u865a\u5047\u6216\u8bef\u5bfc\u4fe1\u606f\u3001\u4fb5\u72af\u77e5\u8bc6\u4ea7\u6743\u3001\u6cc4\u9732\u9690\u79c1\uff0c\u4ee5\u53ca\u53ef\u80fd\u5bfc\u81f4\u79d1\u5b66\u7814\u7a76\u7684\u504f\u89c1\u3002<\/span><\/span><\/p>\n<p><br  \/><\/p>\n<\/section>\n<section style=\"text-align: center;margin-top: 10px;margin-bottom: 10px;line-height: 0;\" powered-by=\"xiumi.us\">\n<section style=\"vertical-align: middle;display: inline-block;line-height: 0;width: 88%;height: auto;\"><img class=\"rich_pages wxw-img\" data-imgfileid=\"100012877\" data-ratio=\"0.7264957264957265\" data-s=\"300,640\"  data-type=\"png\" data-w=\"936\" style=\"vertical-align: middle;width: 100%;\" src=\"\/wp-content\/uploads\/2024\/01\/wxsync-2024-01-1febe8520813b605f3b9a2e769112339.png\"  \/><\/section>\n<\/section>\n<section style=\"text-align: justify;color: rgb(62, 62, 62);padding-right: 20px;padding-left: 20px;font-family: PingFangSC-light;letter-spacing: 1px;line-height: 1.7;\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<p style=\"text-align: left;text-wrap: wrap;\"><span style=\"font-size: 14px;\">\u25cf \u4e3a\u5e94\u5bf9\u8fd9\u4e9b\u6311\u6218\uff0c\u4f5c\u8005\u63d0\u51fa\u4e86\u4e00\u4e2a\u540d\u4e3a <span style=\"color: rgb(21, 59, 179);\">SciGuard <\/span>\u7684 LLM-based \u81ea\u4e3b\u4f53\uff0c\u65e8\u5728\u63a7\u5236\u79d1\u5b66\u9886\u57df AI \u6a21\u578b\u6ee5\u7528\u7684\u98ce\u9669\u3002SciGuard \u7cfb\u7edf\u7684\u6838\u5fc3\u90e8\u5206\u662f LLM\uff0c\u5e76\u4e14\u8bbe\u7f6e\u4e86<span style=\"color: rgb(21, 59, 
179);\">\u4e00\u5957\u5b89\u5168\u539f\u5219\u548c\u6307\u5bfc\u65b9\u9488<\/span>\u4f5c\u4e3a\u6a21\u578b\u7684\u6307\u793a\u3002\u5176\u6b21\uff0c\u6a21\u578b\u5229\u7528\u4e86 PubChem \u7b49\u88ab\u5e7f\u6cdb\u4f7f\u7528\u7684\u79d1\u5b66\u6570\u636e\u5e93\u6765<span style=\"color: rgb(21, 59, 179);\">\u6784\u5efa\u957f\u671f\u8bb0\u5fc6\u5e93<\/span>\uff0c\u4ece\u800c\u5bf9\u7528\u6237\u7684\u67e5\u8be2\u8fdb\u884c\u6df1\u5165\u7684\u98ce\u9669\u8bc4\u4f30\u3002\u6bd4\u5982\uff0c\u5bf9\u7528\u6237\u8981\u6c42\u5408\u6210\u7684\u5316\u5408\u7269\u8fdb\u884c\u5feb\u901f\u68c0\u7d22\uff0c\u83b7\u53d6\u76f8\u5173\u5316\u5408\u7269\u7684\u4fe1\u606f\u5e76\u8bc4\u4f30\u5176\u98ce\u9669\uff0c\u5e76\u636e\u6b64\u63d0\u4f9b\u5b89\u5168\u7684\u5efa\u8bae\u548c\u8b66\u544a\uff0c\u751a\u81f3\u505c\u6b62\u54cd\u5e94\u3002\u9664\u4e86\u6570\u636e\u5e93\u4ee5\u5916\uff0c\u7cfb\u7edf\u8fd8\u96c6\u6210\u4e86<span style=\"color: rgb(21, 59, 179);\">\u591a\u79cd\u79d1\u5b66\u5de5\u5177<\/span>\uff0c\u6bd4\u5982\u5316\u5b66\u5408\u6210\u8def\u7ebf\u89c4\u5212\u6a21\u578b\uff0c\u4ee5\u53ca\u5316\u5408\u7269\u5c5e\u6027\u9884\u6d4b\u6a21\u578b\u3002\u8fd9\u4e9b\u5de5\u5177\u4f1a\u4e3a\u7cfb\u7edf\u63d0\u4f9b\u989d\u5916\u7684\u4e0a\u4e0b\u6587\u4fe1\u606f\uff0c\u6bd4\u5982\uff0c\u7cfb\u7edf\u53ef\u4ee5\u5229\u7528\u6027\u8d28\u9884\u6d4b\u6a21\u578b\u6765\u8bc4\u4f30\u5316\u5408\u7269\u7684\u5404\u79cd\u6027\u8d28\u6765\u8f85\u52a9\u98ce\u9669\u8bc4\u4f30\u3002<\/span><\/p>\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<\/section>\n<section style=\"text-align: center;margin-top: 10px;margin-bottom: 10px;line-height: 0;\" powered-by=\"xiumi.us\">\n<section style=\"vertical-align: middle;display: inline-block;line-height: 0;width: 88%;height: auto;\"><img class=\"rich_pages wxw-img\" data-imgfileid=\"100012878\" data-ratio=\"0.344017094017094\" data-s=\"300,640\"  data-type=\"png\" data-w=\"936\" style=\"vertical-align: middle;width: 100%;\" 
src=\"\/wp-content\/uploads\/2024\/01\/wxsync-2024-01-f764f2307b5c4fd2441356872e52d195.png\"  \/><\/section>\n<\/section>\n<section style=\"text-align: justify;color: rgb(62, 62, 62);padding-right: 20px;padding-left: 20px;font-family: PingFangSC-light;letter-spacing: 1px;line-height: 1.7;\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<p style=\"text-wrap: wrap;\"><span style=\"font-size: 14px;\">\u25cf \u6700\u540e\uff0c\u7814\u7a76\u56e2\u961f\u8fd8\u6784\u5efa\u4e86\u4e00\u4e2a\u540d\u4e3a<span style=\"color: rgb(21, 59, 179);\">SciMT-Safety\u7684\u79d1\u5b66\u591a\u4efb\u52a1\u5b89\u5168\u57fa\u51c6\u6570\u636e\u96c6<\/span>\uff0c\u5305\u542b\u4e86\u53ef\u71c3\u7269\u3001\u8150\u8680\u6027\u7269\u8d28\u3001\u7206\u70b8\u7269\u3001\u5fae\u751f\u7269\u3001\u9ad8\u5371\u519c\u836f\u3001\u6210\u763e\u6027\u7269\u8d28\u548c\u751f\u7269\u6bd2\u6027\u7b49\u8fd9\u4e9b\u7c7b\u522b\u7684\u5371\u9669\u7269\u8d28\u3002\u4e0e\u4e3b\u8981\u8bc4\u4f30\u793e\u4f1a\u8bed\u5883\u4e2dAI\u7cfb\u7edf\u7684SafetyBench\u548cBBQ\u7b49\u5148\u524d\u7684\u57fa\u51c6\u4e0d\u540c\uff0cSciMT-Safety\u4e13\u95e8\u8bbe\u8ba1\u7528\u4e8e\u8bc4\u4f30\u79d1\u5b66\u8bed\u5883\u4e2dAI\u7cfb\u7edf\u6ee5\u7528\u7684\u98ce\u9669\u3002<\/span><\/p>\n<p style=\"text-wrap: wrap;\"><span style=\"font-size: 14px;\">\u25cf \u56e2\u961f\u6d4b\u8bd5\u4e86GPT-4\u3001GPT-3.5\u3001Claude-2\u3001Llama\u7cfb\u5217\u3001PaLM-2\u3001Vicuna\u7cfb\u5217\u3001Mistral\u7cfb\u5217\u4ee5\u53caChemCrow agent\u3002\u5728\u8be5\u6570\u636e\u96c6\u4e0a\uff0cSciGuard\u53d6\u5f97\u4e86\u6700\u597d\u7684\u6548\u679c\u3002<\/span><\/p>\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<\/section>\n<section style=\"text-align: center;margin-top: 10px;margin-bottom: 10px;line-height: 0;\" powered-by=\"xiumi.us\">\n<section style=\"vertical-align: middle;display: inline-block;line-height: 0;width: 88%;height: auto;\"><img class=\"rich_pages wxw-img\" data-imgfileid=\"100012882\" 
data-ratio=\"0.5854700854700855\" data-s=\"300,640\"  data-type=\"png\" data-w=\"936\" style=\"vertical-align: middle;width: 100%;\" src=\"\/wp-content\/uploads\/2024\/01\/wxsync-2024-01-262e088f8295df0ed7fb4bc7ee5a9260.png\"  \/><\/section>\n<\/section>\n<section style=\"text-align: justify;\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<\/section>\n<section style=\"color: rgb(62, 62, 62);padding-right: 20px;padding-left: 20px;font-family: PingFangSC-light;letter-spacing: 1px;line-height: 1.7;\" powered-by=\"xiumi.us\">\n<p><strong>\u4e3a\u4f55\u91cd\u8981\uff1aAI\u964d\u4f4eCBRN\u7684\u6ee5\u7528\u95e8\u69db\u662f\u5371\u9669\u80fd\u529b\u8bc4\u6d4b\u7684\u91cd\u8981\u90e8\u5206<\/strong><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf AI \u5728\u964d\u4f4e\u5316\u5b66\u3001\u751f\u7269\u3001\u8f90\u5c04\u548c\u6838\uff08CBRN\uff09\u9886\u57df\u6ee5\u7528\u7684\u95e8\u69db\u65b9\u9762\u7684\u4f5c\u7528\u662f\u5371\u9669\u80fd\u529b\u8bc4\u6d4b\u7684\u91cd\u8981\u90e8\u5206\u3002\u7531\u4e8e AI \u6a21\u578b\u5728\u79d1\u5b66\u9886\u57df\u7684\u6ee5\u7528\u53ef\u80fd\u653e\u5927\u8bf8\u5982\u521b\u9020\u6709\u5bb3\u7269\u8d28\u6216\u89c4\u907f\u73b0\u6709\u89c4\u5b9a\u7b49\u98ce\u9669\uff0c\u8fd9\u9879\u7814\u7a76\u5f3a\u8c03\u4e86\u5236\u5b9a\u6709\u6548\u7684\u98ce\u9669\u7ba1\u7406\u7b56\u7565\u7684\u91cd\u8981\u6027\uff0c\u5e76\u547c\u5401\u5728\u79d1\u5b66\u9886\u57df\u8d1f\u8d23\u4efb\u5730\u5f00\u53d1\u548c\u4f7f\u7528 AI\u3002<\/span><\/p>\n<p><br  \/><\/p>\n<p><strong>\u4e3b\u8981\u53c2\u8003\u6587\u732e\uff1a<\/strong><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf He, Jiyan, et al., Control Risk for Potential Misuse of Artificial Intelligence in Science, arXiv preprint arXiv:2312.06632 (2023\/12).<\/span><\/p>\n<\/section>\n<\/section>\n<\/section>\n<p style=\"text-wrap: wrap;\" powered-by=\"xiumi.us\"><br  \/><\/p>\n<p style=\"text-wrap: wrap;\" powered-by=\"xiumi.us\"><br  \/><\/p>\n<section style=\"text-align: 
left;justify-content: flex-start;display: flex;flex-flow: row;margin-bottom: 10px;\" powered-by=\"xiumi.us\">\n<section style=\"display: inline-block;vertical-align: top;width: auto;align-self: flex-start;flex: 0 0 auto;min-width: 5%;height: auto;\">\n<section style=\"transform: perspective(0px);transform-style: flat;\" powered-by=\"xiumi.us\">\n<section style=\"transform: rotateX(180deg);\">\n<section style=\"display: inline-block;width: 19px;height: 19px;vertical-align: top;overflow: hidden;border-style: solid;border-width: 2px;border-color: rgb(21, 59, 179);border-radius: 155px;\">\n<section style=\"text-align: justify;\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<\/section>\n<\/section>\n<\/section>\n<\/section>\n<\/section>\n<section style=\"display: inline-block;vertical-align: top;width: auto;min-width: 5%;flex: 0 0 auto;height: auto;\">\n<section style=\"\" powered-by=\"xiumi.us\">\n<section style=\"font-size: 21px;color: rgb(21, 59, 179);\">\n<p><strong>&nbsp;PART 5<\/strong><\/p>\n<p><strong>\u884c\u52a8\u6307\u5357<\/strong><\/p>\n<\/section>\n<\/section>\n<\/section>\n<\/section>\n<section style=\"text-align: left;margin-top: 3px;\" powered-by=\"xiumi.us\">\n<section style=\"background-color: rgb(217, 217, 217);height: 1px;\"><svg viewbox=\"0 0 1 1\" style=\"float:left;line-height:0;width:0;vertical-align:top;\"><\/svg><\/section>\n<\/section>\n<p style=\"text-wrap: wrap;\" powered-by=\"xiumi.us\"><br  \/><\/p>\n<section style=\"text-align: left;justify-content: flex-start;display: flex;flex-flow: row;\" powered-by=\"xiumi.us\">\n<section style=\"display: inline-block;width: 100%;vertical-align: top;align-self: flex-start;flex: 0 0 auto;padding: 20px 10px;height: auto;background-color: rgba(21, 59, 179, 0.09);\">\n<section style=\"justify-content: flex-start;display: flex;flex-flow: row;\" powered-by=\"xiumi.us\">\n<section style=\"display: inline-block;vertical-align: middle;width: auto;align-self: center;flex: 0 0 
auto;min-width: 5%;height: auto;background-color: rgb(21, 59, 179);padding-right: 8px;padding-left: 8px;border-style: solid;border-width: 5px;border-color: rgb(237, 237, 237);\">\n<section style=\"text-align: justify;font-size: 26px;color: rgb(255, 255, 255);\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><strong>01<\/strong><\/p>\n<\/section>\n<\/section>\n<section style=\"display: inline-block;vertical-align: bottom;width: auto;flex: 100 100 0%;height: auto;align-self: flex-end;border-style: solid;border-width: 0px;border-color: rgb(212, 32, 2) rgb(212, 32, 2) rgb(212, 32, 2) rgb(21, 59, 179);padding-left: 7px;\">\n<section style=\"text-align: justify;font-size: 17px;color: rgb(62, 62, 62);\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><strong>\u56fd\u5bb6\u81ea\u7136\u79d1\u5b66\u57fa\u91d1\u59d4\u5458\u4f1a\u201c\u751f\u6210\u5f0f\u4eba\u5de5<\/strong><strong>\u667a\u80fd\u57fa\u7840\u7814\u7a76\u201d\u4e13\u9879\u9879\u76ee<\/strong><\/p>\n<\/section>\n<\/section>\n<\/section>\n<section style=\"text-align: justify;\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<\/section>\n<section style=\"color: rgb(62, 62, 62);padding-right: 20px;padding-left: 20px;font-family: PingFangSC-light;letter-spacing: 1px;line-height: 1.7;\" powered-by=\"xiumi.us\">\n<p><strong>\u76f8\u5173\u673a\u4f1a\u6458\u5f55\uff1a\uff08\u56db\uff09\u5927\u6a21\u578b\u7684\u4ef7\u503c\u89c2\u548c\u5b89\u5168\u5bf9\u9f50\u7b56\u7565\u7814\u7a76\u3002<\/strong><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf 
\u201c\u7814\u7a76\u7b26\u5408\u4eba\u7c7b\u4ef7\u503c\u89c2\u504f\u597d\u7684\u53ef\u6301\u7eed\u3001\u9ad8\u6cdb\u5316\u3001\u5f3a\u5bf9\u6297\u7684\u5927\u6a21\u578b\u5bf9\u9f50\u6280\u672f\uff0c\u5b9e\u73b0\u5b89\u5168\u4f26\u7406\u548c\u8ba4\u77e5\u63a8\u7406\u80fd\u529b\u7684\u5bf9\u9f50\u548c\u8d85\u8d8a\uff0c\u63d0\u5347\u5927\u6a21\u578b\u5e94\u7528\u7684\u65e0\u5bb3\u6027\u548c\u6709\u6548\u6027\uff0c\u589e\u5f3a\u5927\u6a21\u578b\u5b89\u5168\u4f26\u7406\u4ef7\u503c\u89c2\u5bf9\u9f50\u7b97\u6cd5\u7684\u8bad\u7ec3\u9ad8\u7a33\u5b9a\u6027\u548c\u5e94\u7528\u53ef\u9760\u6027\u3002\u201d<\/span><\/p>\n<p><span style=\"font-size: 14px;\">\u25cf \u8be6\u60c5\u53ef\u53c2\u8003\u6b64\u63a8\u9001\u6587\u7ae0<\/span><a href=\"https:\/\/mp.weixin.qq.com\/s?__biz=MzA4NDUwMjMxNA==&amp;mid=2650311693&amp;idx=1&amp;sn=64a7cba8e67d69135075aaa8fa7e0b72&amp;scene=21#wechat_redirect\" target=\"_blank\" data-linktype=\"2\" style=\"text-decoration: underline;color: rgb(98, 132, 162);font-size: 14px;\" rel=\"noopener noreferrer\"><span style=\"text-decoration: underline;color: rgb(98, 132, 162);font-size: 14px;\">\u300a\u8bfe\u9898\u7533\u62a5 | \u201c\u751f\u6210\u5f0f\u4eba\u5de5\u667a\u80fd\u57fa\u7840\u7814\u7a76\u201d\u4e13\u9879\u9879\u76ee\u7533\u8bf7\u6307\u5357\u300b<\/span><\/a><span style=\"font-size: 14px;\">\uff0c\u6216\u56fd\u5bb6\u81ea\u7136\u79d1\u5b66\u57fa\u91d1\u59d4\u5458\u4f1a\u7f51\u7ad9\u3002<\/span><\/p>\n<\/section>\n<\/section>\n<\/section>\n<p style=\"text-wrap: wrap;\" powered-by=\"xiumi.us\"><br  \/><\/p>\n<p style=\"text-wrap: wrap;\" powered-by=\"xiumi.us\"><br  \/><\/p>\n<section style=\"text-align: center;font-size: 24px;color: rgba(21, 59, 179, 0.41);\" powered-by=\"xiumi.us\">\n<p><strong>\u2014 CONCORDIA AI \u2014<\/strong><\/p>\n<\/section>\n<p style=\"text-wrap: wrap;\" powered-by=\"xiumi.us\"><span style=\"letter-spacing: 0.034em;\"><\/span><br  \/><\/p>\n<section style=\"text-align: left;justify-content: flex-start;display: 
flex;flex-flow: row;margin-bottom: 10px;\" powered-by=\"xiumi.us\">\n<section style=\"display: inline-block;vertical-align: top;width: auto;align-self: flex-start;flex: 0 0 auto;min-width: 5%;height: auto;\">\n<section style=\"transform: perspective(0px);transform-style: flat;\" powered-by=\"xiumi.us\">\n<section style=\"transform: rotateX(180deg);\">\n<section style=\"display: inline-block;width: 19px;height: 19px;vertical-align: top;overflow: hidden;border-style: solid;border-width: 2px;border-color: rgb(21, 59, 179);border-radius: 155px;\">\n<section style=\"text-align: justify;\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><br  \/><\/p>\n<\/section>\n<\/section>\n<\/section>\n<\/section>\n<\/section>\n<section style=\"display: inline-block;vertical-align: top;width: auto;min-width: 5%;flex: 0 0 auto;height: auto;\">\n<section style=\"\" powered-by=\"xiumi.us\">\n<section style=\"font-size: 21px;color: rgb(21, 59, 179);\">\n<p><strong>&nbsp;\u901a\u8baf\u4f5c\u8005<\/strong><\/p>\n<\/section>\n<\/section>\n<\/section>\n<\/section>\n<section style=\"text-align: left;justify-content: flex-start;display: flex;flex-flow: row;\" powered-by=\"xiumi.us\">\n<section style=\"display: inline-block;width: auto;vertical-align: top;align-self: flex-start;flex: 100 100 0%;border-style: solid;border-width: 0px 0px 0px 15px;border-left-color: rgb(21, 59, 179);margin-left: 4px;height: auto;\">\n<section style=\"justify-content: flex-start;display: flex;flex-flow: row;\" powered-by=\"xiumi.us\">\n<section style=\"display: inline-block;width: 100%;vertical-align: top;align-self: flex-start;flex: 0 0 auto;background-color: rgb(255, 255, 255);padding: 8px 22px;border-bottom: 5px solid rgb(21, 59, 179);\">\n<section style=\"justify-content: flex-start;display: flex;flex-flow: row;\" powered-by=\"xiumi.us\">\n<section style=\"display: inline-block;vertical-align: top;width: 29%;align-self: flex-start;flex: 0 0 auto;height: auto;\">\n<section style=\"text-align: 
center;margin-top: 10px;margin-bottom: 10px;line-height: 0;\" powered-by=\"xiumi.us\">\n<section style=\"vertical-align: middle;display: inline-block;line-height: 0;width: 100%;height: auto;\"><img class=\"rich_pages wxw-img\" data-imgfileid=\"100012883\" data-ratio=\"1\" data-s=\"300,640\"  data-type=\"png\" data-w=\"268\" style=\"vertical-align: middle;width: 100%;\" src=\"\/wp-content\/uploads\/2024\/01\/wxsync-2024-01-47ff8e327ca1c4637111fca15bca1f6a.png\"  \/><\/section>\n<\/section>\n<\/section>\n<section style=\"display: inline-block;vertical-align: middle;width: 65%;align-self: center;flex: 0 0 auto;height: auto;\">\n<section style=\"text-align: justify;padding-right: 20px;padding-left: 20px;\" powered-by=\"xiumi.us\">\n<p style=\"text-wrap: wrap;\"><strong>\u6bb5\u96c5\u6587<\/strong><\/p>\n<p style=\"text-wrap: wrap;\"><span style=\"font-size: 14px;\">\u6280\u672f\u9879\u76ee\u7ecf\u7406<\/span><\/p>\n<p style=\"text-wrap: wrap;\"><span style=\"font-size: 14px;\">\u4e13\u6ce8 AI \u5bf9\u9f50\u4e0e\u5b89\u5168\u7814\u7a76<\/span><\/p>\n<\/section>\n<\/section>\n<\/section>\n<\/section>\n<\/section>\n<\/section>\n<\/section>\n<section style=\"text-align: center;margin-top: 10px;margin-bottom: 10px;line-height: 0;\" powered-by=\"xiumi.us\"><span style=\"color: rgb(255, 255, 255);font-family: mp-quote, -apple-system-font, BlinkMacSystemFont, &quot;Helvetica Neue&quot;, &quot;PingFang SC&quot;, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei UI&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;font-size: 14px;letter-spacing: 0.034em;text-align: justify;\"><\/span><\/section>\n<section style=\"margin-bottom: 5px;color: rgb(136, 136, 136);font-family: -apple-system-font, system-ui, &quot;Helvetica Neue&quot;, &quot;PingFang SC&quot;, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei UI&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;font-size: 14px;letter-spacing: 0.544px;text-wrap: wrap;outline: 0px;\"><br  \/><\/section>\n<section 
style=\"margin-bottom: 5px;color: rgb(136, 136, 136);font-family: -apple-system-font, system-ui, &quot;Helvetica Neue&quot;, &quot;PingFang SC&quot;, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei UI&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;font-size: 14px;letter-spacing: 0.544px;text-wrap: wrap;outline: 0px;\"><br  \/><\/section>\n<section powered-by=\"xiumi.us\" style=\"margin-top: 10px;margin-bottom: 0px;color: rgb(136, 136, 136);font-family: -apple-system-font, system-ui, &quot;Helvetica Neue&quot;, &quot;PingFang SC&quot;, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei UI&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;font-size: 14px;letter-spacing: 0.544px;text-wrap: wrap;background-color: rgb(255, 255, 255);outline: 0px;text-size-adjust: auto;text-align: center;justify-content: center;\">\n<section style=\"outline: 0px;display: inline-block;width: auto;vertical-align: top;min-width: 10%;height: auto;box-shadow: rgb(0, 0, 0) 0px 0px 0px;border-bottom: 9px solid rgb(169, 215, 227);border-bottom-right-radius: 0px;\">\n<section powered-by=\"xiumi.us\" style=\"margin-bottom: -15px;outline: 0px;\">\n<section style=\"padding-right: 10px;padding-left: 10px;outline: 0px;color: rgb(12, 130, 169);font-size: 16px;line-height: 2;letter-spacing: 3px;\">\n<p style=\"outline: 0px;\"><strong style=\"outline: 0px;\">AI\u5b89\u5168\u4e0e\u5bf9\u9f50\u8bfb\u4e66\u4f1a\u542f\u52a8<\/strong><br style=\"outline: 0px;\"  \/><\/p>\n<\/section>\n<\/section>\n<\/section>\n<\/section>\n<section style=\"margin-right: 8px;margin-bottom: 0px;margin-left: 8px;color: rgb(136, 136, 136);font-size: 14px;letter-spacing: 0.544px;text-wrap: wrap;outline: 0px;font-family: system-ui, -apple-system, BlinkMacSystemFont, &quot;Helvetica Neue&quot;, &quot;PingFang SC&quot;, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei UI&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;text-size-adjust: auto;line-height: 1.75em;text-align: center;\"><br style=\"outline: 0px;\" 
 \/><\/section>\n<p style=\"margin-right: 8px;margin-bottom: 0px;margin-left: 8px;color: rgb(136, 136, 136);font-size: 14px;text-wrap: wrap;background-color: rgb(255, 255, 255);letter-spacing: 0.544px;outline: 0px;font-family: system-ui, -apple-system, BlinkMacSystemFont, &quot;Helvetica Neue&quot;, &quot;PingFang SC&quot;, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei UI&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;text-size-adjust: auto;line-height: 2em;\"><br  \/><\/p>\n<p style=\"margin-right: 8px;margin-bottom: 0px;margin-left: 8px;color: rgb(136, 136, 136);font-size: 14px;text-wrap: wrap;background-color: rgb(255, 255, 255);letter-spacing: 0.544px;outline: 0px;font-family: system-ui, -apple-system, BlinkMacSystemFont, &quot;Helvetica Neue&quot;, &quot;PingFang SC&quot;, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei UI&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;text-size-adjust: auto;line-height: 2em;\"><span style=\"font-family: system-ui, -apple-system, BlinkMacSystemFont, &quot;Helvetica Neue&quot;, &quot;PingFang SC&quot;, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei UI&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;font-size: 15px;letter-spacing: 0.578px;\">\u5927\u6a21\u578b\u7684\u72c2\u98d9\u7a81\u8fdb\u5524\u9192\u4e86\u4eba\u4eec\u5bf9AI\u6280\u672f\u7684\u70ed\u60c5\u548c\u61a7\u61ac\uff0c\u4e5f\u5f15\u53d1\u4e86\u5bf9AI\u6280\u672f\u672c\u8eab\u5b58\u5728\u7684\u793e\u4f1a\u4f26\u7406\u98ce\u9669\u53ca\u5176\u5bf9\u4eba\u7c7b\u751f\u5b58\u6784\u6210\u7684\u6f5c\u5728\u5a01\u80c1\u7684\u666e\u904d\u62c5\u5fe7\u3002<\/span><span style=\"font-family: system-ui, -apple-system, BlinkMacSystemFont, &quot;Helvetica Neue&quot;, &quot;PingFang SC&quot;, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei UI&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;font-size: 15px;letter-spacing: 
0.578px;\">\u5c24\u5176\u5728\u590d\u6742\u7cfb\u7edf\u89c6\u89d2\u4e0b\uff0cAI\u7cfb\u7edf\u6b63\u5728\u5c55\u73b0\u51fa\u975e\u7ebf\u6027\u7684\u3001\u8fdc\u8d85\u9884\u671f\u7684\u6d8c\u73b0\u80fd\u529b\uff0c\u8fd9\u662fAI\u6280\u672f\u7684\u65b0\u673a\u9047\uff0c\u4e5f\u662fAI\u5b89\u5168\u7684\u65b0\u6311\u6218\u3002<\/span><span style=\"font-family: system-ui, -apple-system, BlinkMacSystemFont, &quot;Helvetica Neue&quot;, &quot;PingFang SC&quot;, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei UI&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;font-size: 15px;letter-spacing: 0.578px;\">\u5728\u6b64\u80cc\u666f\u4e0b\uff0cAI\u5b89\u5168\u4e0e\u5bf9\u9f50\u5f97\u5230\u5e7f\u6cdb\u5173\u6ce8\uff0c\u8fd9\u662f\u4e00\u4e2a\u81f4\u529b\u4e8e\u8ba9AI\u9020\u798f\u4eba\u7c7b\uff0c\u907f\u514dAI\u6a21\u578b\u5931\u63a7\u6216\u88ab\u6ee5\u7528\u800c\u5bfc\u81f4\u707e\u96be\u6027\u540e\u679c\u7684\u7814\u7a76\u65b9\u5411\u3002<\/span><\/p>\n<p style=\"margin: 3px 8px 0px;color: rgb(136, 136, 136);font-size: 14px;text-wrap: wrap;letter-spacing: 0.544px;outline: 0px;font-family: system-ui, -apple-system, BlinkMacSystemFont, &quot;Helvetica Neue&quot;, &quot;PingFang SC&quot;, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei UI&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;text-size-adjust: auto;line-height: 2em;background-color: rgb(255, 255, 255);\"><span style=\"letter-spacing: 0.578px;outline: 0px;font-size: 15px;\"><br  \/><\/span><\/p>\n<p style=\"margin: 3px 8px 0px;color: rgb(136, 136, 136);font-size: 14px;text-wrap: wrap;letter-spacing: 0.544px;outline: 0px;font-family: system-ui, -apple-system, BlinkMacSystemFont, &quot;Helvetica Neue&quot;, &quot;PingFang SC&quot;, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei UI&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;text-size-adjust: auto;line-height: 2em;background-color: rgb(255, 255, 255);\"><span style=\"letter-spacing: 0.578px;outline: 0px;font-size: 
15px;\">\u96c6\u667a\u4ff1\u4e50\u90e8\u548c\u5b89\u8fdcAI\u8054\u5408\u4e3e\u529e<a target=\"_blank\" href=\"http:\/\/mp.weixin.qq.com\/s?__biz=MzIzMjQyNzQ5MA==&amp;mid=2247679496&amp;idx=1&amp;sn=2f889356c9fcf6fb460dcd87f05036af&amp;chksm=e8996485dfeeed937ca8c59e909b6c991d2ff1cbd132117a91bf54acf9e6205bd0ab2a9e4fb2&amp;scene=21#wechat_redirect\" textvalue=\"\u300cAI\u5b89\u5168\u4e0e\u5bf9\u9f50\u300d\u8bfb\u4e66\u4f1a\" linktype=\"text\" imgurl=\"\" imgdata=\"null\" data-itemshowtype=\"0\" tab=\"innerlink\" data-linktype=\"2\" style=\"letter-spacing: 0.578px;\" rel=\"noopener noreferrer\">\u300cAI\u5b89\u5168\u4e0e\u5bf9\u9f50\u300d\u8bfb\u4e66\u4f1a<\/a>\uff0c\u7531\u591a\u4f4d\u6d77\u5185\u5916\u4e00\u7ebf\u7814\u7a76\u8005\u8054\u5408\u53d1\u8d77\uff0c\u65e8\u5728\u6df1\u5165\u63a2\u8ba8AI\u5b89\u5168\u4e0e\u5bf9\u9f50\u6240\u6d89\u53ca\u7684\u6838\u5fc3\u6280\u672f\u3001\u7406\u8bba\u67b6\u6784\u3001\u89e3\u51b3\u8def\u5f84\u4ee5\u53ca\u5b89\u5168\u6cbb\u7406\u7b49\u4ea4\u53c9\u8bfe\u9898\uff0c\u5c55\u5f00\u5171\u8bfb\u5171\u7814\u6d3b\u52a8\u3002\u8bfb\u4e66\u4f1a\u81ea2024\u5e741\u670820\u65e5\u5f00\u59cb\uff0c\u6bcf\u5468\u516d\u4e0a\u5348\u4e3e\u884c\uff0c\u4e3a\u671f8-10\u5468\u3002\u6b22\u8fce\u4ece\u4e8b\u76f8\u5173\u7814\u7a76\u4e0e\u5e94\u7528\u5de5\u4f5c\u7684\u670b\u53cb\u62a5\u540d\u52a0\u5165\uff01<\/span><\/p>\n<p style=\"margin-right: 8px;margin-bottom: 0px;margin-left: 8px;color: rgb(136, 136, 136);font-size: 14px;letter-spacing: 0.544px;text-wrap: wrap;outline: 0px;font-family: system-ui, -apple-system, BlinkMacSystemFont, &quot;Helvetica Neue&quot;, &quot;PingFang SC&quot;, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei UI&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;text-size-adjust: auto;line-height: 2em;\"><span style=\"letter-spacing: 0.578px;background-color: rgb(255, 255, 255);outline: 0px;font-size: 15px;\"><\/span><\/p>\n<section style=\"margin-right: 8px;margin-bottom: 0px;margin-left: 8px;color: rgb(136, 136, 
136);font-size: 14px;letter-spacing: 0.544px;text-wrap: wrap;background-color: rgb(255, 255, 255);outline: 0px;font-family: system-ui, -apple-system, BlinkMacSystemFont, &quot;Helvetica Neue&quot;, &quot;PingFang SC&quot;, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei UI&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;line-height: 1.75em;\"><span style=\"outline: 0px;font-size: 15px;color: rgb(63, 63, 63);\"><strong style=\"outline: 0px;\"><br style=\"outline: 0px;\"  \/><\/strong><\/span><\/section>\n<section style=\"margin-bottom: 0px;color: rgb(136, 136, 136);font-size: 14px;letter-spacing: 0.544px;text-wrap: wrap;background-color: rgb(255, 255, 255);outline: 0px;font-family: system-ui, -apple-system, BlinkMacSystemFont, &quot;Helvetica Neue&quot;, &quot;PingFang SC&quot;, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei UI&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;line-height: 1.75em;text-align: center;margin-left: 8px;margin-right: 8px;\"><a target=\"_blank\" href=\"http:\/\/mp.weixin.qq.com\/s?__biz=MzIzMjQyNzQ5MA==&amp;mid=2247679496&amp;idx=1&amp;sn=2f889356c9fcf6fb460dcd87f05036af&amp;chksm=e8996485dfeeed937ca8c59e909b6c991d2ff1cbd132117a91bf54acf9e6205bd0ab2a9e4fb2&amp;scene=21#wechat_redirect\" textvalue=\"\u4f60\u5df2\u9009\u4e2d\u4e86\u6dfb\u52a0\u94fe\u63a5\u7684\u5185\u5bb9\" linktype=\"text\" imgurl=\"\" imgdata=\"null\" data-itemshowtype=\"0\" tab=\"innerlink\" data-linktype=\"1\" hasload=\"1\" rel=\"noopener noreferrer\"><span class=\"js_jump_icon h5_image_link\" style=\"outline: 0px;vertical-align: bottom;user-select: none;width: 578px;\"><img class=\"rich_pages wxw-img\" data-backh=\"321\" data-backw=\"562\" data-cropselx1=\"0\" data-cropselx2=\"562\" data-cropsely1=\"0\" data-cropsely2=\"282\" data-imgfileid=\"100196202\" data-ratio=\"0.5714285714285714\"  data-type=\"jpeg\" data-w=\"1050\" style=\"outline: 0px;border-width: 0px;border-style: initial;border-color: initial;width: 578px;visibility: visible !important;\" 
src=\"\/wp-content\/uploads\/2024\/01\/wxsync-2024-01-f427e272d561a1d6b8a967d2d485d8f8.png\"  \/><\/span><\/a><\/section>\n<section style=\"margin-right: 8px;margin-bottom: 0px;margin-left: 8px;color: rgb(136, 136, 136);font-size: 14px;letter-spacing: 0.544px;text-wrap: wrap;background-color: rgb(255, 255, 255);outline: 0px;font-family: system-ui, -apple-system, BlinkMacSystemFont, &quot;Helvetica Neue&quot;, &quot;PingFang SC&quot;, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei UI&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;line-height: 1.75em;\"><span style=\"outline: 0px;color: rgb(63, 63, 63);font-size: 15px;letter-spacing: 0.578px;\"><\/span><\/section>\n<section style=\"margin-right: 8px;margin-bottom: 0px;margin-left: 8px;color: rgb(136, 136, 136);font-size: 14px;letter-spacing: 0.544px;text-wrap: wrap;background-color: rgb(255, 255, 255);outline: 0px;font-family: system-ui, -apple-system, BlinkMacSystemFont, &quot;Helvetica Neue&quot;, &quot;PingFang SC&quot;, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei UI&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;line-height: 1.75em;\"><span style=\"outline: 0px;color: rgb(63, 63, 63);font-size: 15px;letter-spacing: 0.578px;\"><\/span><\/section>\n<section style=\"margin-right: 8px;margin-bottom: 0px;margin-left: 8px;color: rgb(136, 136, 136);font-size: 14px;letter-spacing: 0.544px;text-wrap: wrap;background-color: rgb(255, 255, 255);outline: 0px;font-family: system-ui, -apple-system, BlinkMacSystemFont, &quot;Helvetica Neue&quot;, &quot;PingFang SC&quot;, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei UI&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;line-height: 1.75em;\"><span style=\"outline: 0px;color: rgb(63, 63, 63);font-size: 15px;letter-spacing: 0.578px;\"><\/span><\/section>\n<section style=\"margin-right: 8px;margin-bottom: 0px;margin-left: 8px;color: rgb(136, 136, 136);font-size: 14px;letter-spacing: 0.544px;text-wrap: wrap;background-color: rgb(255, 255, 
255);outline: 0px;font-family: system-ui, -apple-system, BlinkMacSystemFont, &quot;Helvetica Neue&quot;, &quot;PingFang SC&quot;, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei UI&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;line-height: 1.75em;\"><span style=\"outline: 0px;color: rgb(63, 63, 63);font-size: 15px;letter-spacing: 0.578px;\"><\/span><\/section>\n<section style=\"margin-right: 8px;margin-bottom: 0px;margin-left: 8px;color: rgb(136, 136, 136);font-size: 14px;letter-spacing: 0.544px;text-wrap: wrap;background-color: rgb(255, 255, 255);outline: 0px;font-family: system-ui, -apple-system, BlinkMacSystemFont, &quot;Helvetica Neue&quot;, &quot;PingFang SC&quot;, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei UI&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;line-height: 1.75em;\"><span style=\"outline: 0px;color: rgb(63, 63, 63);font-size: 15px;letter-spacing: 0.578px;\"><br style=\"outline: 0px;\"  \/><\/span><\/section>\n<section style=\"margin-right: 8px;margin-bottom: 0px;margin-left: 8px;color: rgb(136, 136, 136);font-size: 14px;letter-spacing: 0.544px;text-wrap: wrap;background-color: rgb(255, 255, 255);outline: 0px;font-family: system-ui, -apple-system, BlinkMacSystemFont, &quot;Helvetica Neue&quot;, &quot;PingFang SC&quot;, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei UI&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;line-height: 1.75em;\"><span style=\"outline: 0px;color: rgb(63, 63, 63);font-size: 15px;letter-spacing: 0.578px;\"><span style=\"font-family: -apple-system-font, system-ui, &quot;Helvetica Neue&quot;, &quot;PingFang SC&quot;, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei UI&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;letter-spacing: 0.544px;\">\u8be6\u60c5\u8bf7\u89c1\uff1a<\/span><\/span><\/section>\n<section style=\"margin-right: 8px;margin-bottom: 0px;margin-left: 8px;color: rgb(136, 136, 136);font-size: 14px;letter-spacing: 0.544px;text-wrap: wrap;background-color: rgb(255, 255, 
255);outline: 0px;font-family: system-ui, -apple-system, BlinkMacSystemFont, &quot;Helvetica Neue&quot;, &quot;PingFang SC&quot;, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei UI&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;line-height: 1.75em;\"><span style=\"font-family: -apple-system-font, system-ui, &quot;Helvetica Neue&quot;, &quot;PingFang SC&quot;, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei UI&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;outline: 0px;color: rgb(63, 63, 63);font-size: 15px;letter-spacing: 0.578px;text-decoration: underline;\"><a target=\"_blank\" href=\"http:\/\/mp.weixin.qq.com\/s?__biz=MzIzMjQyNzQ5MA==&amp;mid=2247679496&amp;idx=1&amp;sn=2f889356c9fcf6fb460dcd87f05036af&amp;chksm=e8996485dfeeed937ca8c59e909b6c991d2ff1cbd132117a91bf54acf9e6205bd0ab2a9e4fb2&amp;scene=21#wechat_redirect\" textvalue=\"2024\u5f00\u5e74\u8bfb\u4e66\u4f1a\uff1aAI\u5b89\u5168\u4e0e\u5bf9\u9f50\u2014\u2014\u5e94\u5bf9\u524d\u6cbfAI\u5931\u63a7\u4e0e\u6ee5\u7528\u7684\u6280\u672f\u8def\u7ebf\" linktype=\"text\" imgurl=\"\" imgdata=\"null\" data-itemshowtype=\"0\" tab=\"innerlink\" data-linktype=\"2\" rel=\"noopener noreferrer\">2024\u5f00\u5e74\u8bfb\u4e66\u4f1a\uff1aAI\u5b89\u5168\u4e0e\u5bf9\u9f50\u2014\u2014\u5e94\u5bf9\u524d\u6cbfAI\u5931\u63a7\u4e0e\u6ee5\u7528\u7684\u6280\u672f\u8def\u7ebf<\/a><\/span><span style=\"outline: 0px;color: rgb(63, 63, 63);font-family: -apple-system-font, system-ui, &quot;Helvetica Neue&quot;, &quot;PingFang SC&quot;, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei UI&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;font-size: 15px;letter-spacing: 0.544px;\"><br  \/><\/span><\/section>\n<section style=\"margin-right: 8px;margin-bottom: 0px;margin-left: 8px;color: rgb(136, 136, 136);font-size: 14px;letter-spacing: 0.544px;text-wrap: wrap;background-color: rgb(255, 255, 255);outline: 0px;font-family: system-ui, -apple-system, BlinkMacSystemFont, &quot;Helvetica Neue&quot;, &quot;PingFang 
SC&quot;, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei UI&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;line-height: 1.75em;\"><br  \/><\/section>\n<section style=\"margin-right: 8px;margin-bottom: 0px;margin-left: 8px;color: rgb(136, 136, 136);font-size: 14px;letter-spacing: 0.544px;text-wrap: wrap;background-color: rgb(255, 255, 255);outline: 0px;font-family: system-ui, -apple-system, BlinkMacSystemFont, &quot;Helvetica Neue&quot;, &quot;PingFang SC&quot;, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei UI&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;line-height: 1.75em;\"><br  \/><\/section>\n<section style=\"margin-right: 8px;margin-bottom: 0px;margin-left: 8px;color: rgb(136, 136, 136);font-size: 14px;letter-spacing: 0.544px;text-wrap: wrap;background-color: rgb(255, 255, 255);outline: 0px;font-family: system-ui, -apple-system, BlinkMacSystemFont, &quot;Helvetica Neue&quot;, &quot;PingFang SC&quot;, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei UI&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;line-height: 1.75em;\"><br  \/><\/section>\n<p style=\"margin-bottom: 0px;color: rgb(136, 136, 136);font-size: 14px;letter-spacing: 0.544px;text-wrap: wrap;background-color: rgb(255, 255, 255);outline: 0px;font-family: system-ui, -apple-system, BlinkMacSystemFont, &quot;Helvetica Neue&quot;, &quot;PingFang SC&quot;, &quot;Hiragino Sans GB&quot;, &quot;Microsoft YaHei UI&quot;, &quot;Microsoft YaHei&quot;, Arial, sans-serif;line-height: 1.75em;margin-left: 0px;margin-right: 0px;\"><strong style=\"letter-spacing: 0.544px;text-wrap: wrap;outline: 0px;font-size: 15px;text-align: left;color: rgb(255, 255, 255);font-family: PingFangSC-light;\"><span style=\"outline: 0px;background-color: rgb(12, 130, 169);\">\u70b9\u51fb\u201c\u9605\u8bfb\u539f\u6587\u201d\uff0c\u62a5\u540d\u8bfb\u4e66\u4f1a<\/span><\/strong><\/p>\n<\/section>\n<p style=\"display: none;\"><mp-style-type 
data-value=\"3\"><\/mp-style-type><\/p>\n<\/div>\n","protected":false},"excerpt":{"rendered":"<p>AI\u5b89\u5168\u56db\u5927\u6293\u624b\uff1a\u5bf9\u9f50\u3001\u9c81\u68d2\u6027\u3001\u76d1\u6d4b\u3001\u7cfb\u7edf\u5b89\u5168\u6027 &#8211;&nbsp;\u6765\u81ea\u300aAI\u5b89\u5168\u524d\u6cbf #1\u300b &nbsp;\u672c\u671f\u8981\u95fb\u76ee\u5f55\uff08\u4e0a\u4e0b\u6ed1\u52a8\u67e5\u770b\uff09 Part 1 \u5bf9\u9f50\uff08Alignment\uff09 1. \u5982\u4f55\u7406\u89e3\u5927\u6a21\u578b\u7684\u201c\u8c04\u5a9a\u201d\uff08Sycophancy\uff09\u73b0\u8c61\uff1f 2. \u9488\u5bf9\u5927\u6a21\u578b\u6001\u52bf\u611f\u77e5\u80fd\u529b\uff08Situationa&#8230;<\/p>\n","protected":false},"author":0,"featured_media":47393,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":[],"categories":[1],"tags":[],"special":[],"_links":{"self":[{"href":"https:\/\/swarma.org\/index.php?rest_route=\/wp\/v2\/posts\/47416"}],"collection":[{"href":"https:\/\/swarma.org\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/swarma.org\/index.php?rest_route=\/wp\/v2\/types\/post"}],"replies":[{"embeddable":true,"href":"https:\/\/swarma.org\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=47416"}],"version-history":[{"count":0,"href":"https:\/\/swarma.org\/index.php?rest_route=\/wp\/v2\/posts\/47416\/revisions"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/swarma.org\/index.php?rest_route=\/wp\/v2\/media\/47393"}],"wp:attachment":[{"href":"https:\/\/swarma.org\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=47416"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/swarma.org\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=47416"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/swarma.org\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=47416"},{"taxonomy":"special","embeddable":true,"href":"https:\/\/swarma.org\/index.php?rest_route=%2Fwp%2Fv2%2Fspecial&post=47416"}],"curies":[{"name":"wp","href":"https:\/\/api
.w.org\/{rel}","templated":true}]}}