26:["$","$L2f",null,{"data":{"isPreview":true,"seq":1315546,"episode":{"Id":"89c18baf315660cbc1611095e737d4876b4e9a93cb951d3d97dc9f1df3b9bcb7","Seq":1315546,"PodId":"2d4f05aac4d528e40e954452346d5021f1ab57bc06cca986eb689df8c034ea98","PodSeq":9417,"Title":"[QA] Poser: Unmasking Alignment Faking LLMs by Manipulating Their Internals","PodName":"Arxiv Papers","Description":"

Large Language Models (LLMs) can deceive as 'alignment fakers.' A benchmark with 324 LLM pairs is introduced to detect misbehaving models, achieving 98% accuracy with a specific strategy.

https://arxiv.org/abs//2405.05466

YouTube: https://www.youtube.com/@ArxivPapers

TikTok: https://www.tiktok.com/@arxiv_papers

Apple Podcasts: https://podcasts.apple.com/us/podcast/arxiv-papers/id1692476016

Spotify: https://podcasters.spotify.com/pod/show/arxiv-papers

\n\n--- \n\nSupport this podcast: https://podcasters.spotify.com/pod/show/arxiv-papers/support","Url":"https://podcasters.spotify.com/pod/show/arxiv-papers/episodes/QA-Poser-Unmasking-Alignment-Faking-LLMs-by-Manipulating-Their-Internals-e2jgc78","Link":"https://anchor.fm/s/e3d903bc/podcast/play/86568616/https%3A%2F%2Fd3ctxlq1ktw2nl.cloudfront.net%2Fstaging%2F2024-4-10%2Ffdeecc6c-51c9-d7c6-9f79-7b7abf926288.mp3","LinkType":"mp3","PublishTime":"$D2024-05-10T13:33:33.000Z","Img":"https://is1-ssl.mzstatic.com/image/thumb/Podcasts126/v4/cd/4e/42/cd4e4257-8d65-f81d-a18c-04f722f5e42a/mza_1849965358261212239.jpg/600x600bb.jpg","EpImg":"https://d3t3ozftmdmh3i.cloudfront.net/staging/podcast_uploaded_nologo/38126503/38126503-1686696647986-c72655f5033ff.jpg","Duration":"00:09:53","Language":null,"SampleDuration":null,"IsVBR":false,"Transcribed":false,"Indexed":1,"Deleted":false,"RedirectSeq":null,"Source":null,"Size":null},"prevAndNext":{"prevSeq":1315541,"nextSeq":1315548},"playback":null,"states":{"state":"not-login","extra":{"summary":"Arxiv Papers - [QA] Poser: Unmasking Alignment Faking LLMs by Manipulating Their Internals","previewContent":{"summary":"Arxiv Papers - [QA] Poser: Unmasking Alignment Faking LLMs by Manipulating Their Internals","chapters":[],"keywords":[],"highlights":[],"transcripts":[]}}}}}]