paulaoak committed on
Commit
8932df5
·
verified ·
1 Parent(s): 90eb0d1

Update index.html

Browse files
Files changed (1) hide show
  1. index.html +368 -274
index.html CHANGED
@@ -3,28 +3,49 @@
3
  <head>
4
  <meta charset="utf-8">
5
  <meta name="description"
6
- content="Deformable Neural Radiance Fields creates free-viewpoint portraits (nerfies) from casually captured videos.">
7
- <meta name="keywords" content="Nerfies, D-NeRF, NeRF">
8
  <meta name="viewport" content="width=device-width, initial-scale=1">
9
- <title>Nerfies: Deformable Neural Radiance Fields</title>
10
 
11
  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
12
  rel="stylesheet">
13
 
14
- <link rel="stylesheet" href="./static/css/bulma.min.css">
15
- <link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
16
- <link rel="stylesheet" href="./static/css/bulma-slider.min.css">
17
- <link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
18
  <link rel="stylesheet"
19
  href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
20
- <link rel="stylesheet" href="./static/css/index.css">
21
- <link rel="icon" href="./static/images/favicon.svg">
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
24
- <script defer src="./static/js/fontawesome.all.min.js"></script>
25
- <script src="./static/js/bulma-carousel.min.js"></script>
26
- <script src="./static/js/bulma-slider.min.js"></script>
27
- <script src="./static/js/index.js"></script>
28
  </head>
29
  <body>
30
 
@@ -33,39 +54,23 @@
33
  <div class="container is-max-desktop">
34
  <div class="columns is-centered">
35
  <div class="column has-text-centered">
36
- <h1 class="title is-1 publication-title">Nerfies: Deformable Neural Radiance Fields</h1>
37
  <div class="is-size-5 publication-authors">
38
  <span class="author-block">
39
- <a href="https://keunhong.com" target="_blank">Keunhong Park</a><sup>1</sup>,</span>
40
- <span class="author-block">
41
- <a href="https://utkarshsinha.com" target="_blank">Utkarsh Sinha</a><sup>2</sup>,</span>
42
- <span class="author-block">
43
- <a href="https://jonbarron.info" target="_blank">Jonathan T. Barron</a><sup>2</sup>,
44
- </span>
45
- <span class="author-block">
46
- <a href="http://sofienbouaziz.com" target="_blank">Sofien Bouaziz</a><sup>2</sup>,
47
- </span>
48
  <span class="author-block">
49
- <a href="https://www.danbgoldman.com" target="_blank">Dan B Goldman</a><sup>2</sup>,
50
- </span>
51
- <span class="author-block">
52
- <a href="https://homes.cs.washington.edu/~seitz/" target="_blank">Steven M. Seitz</a><sup>1,2</sup>,
53
- </span>
54
- <span class="author-block">
55
- <a href="http://www.ricardomartinbrualla.com" target="_blank">Ricardo Martin-Brualla</a><sup>2</sup>
56
- </span>
57
  </div>
58
 
59
  <div class="is-size-5 publication-authors">
60
- <span class="author-block"><sup>1</sup>University of Washington,</span>
61
- <span class="author-block"><sup>2</sup>Google Research</span>
62
  </div>
63
 
64
  <div class="column has-text-centered">
65
  <div class="publication-links">
66
  <!-- PDF Link. -->
67
  <span class="link-block">
68
- <a href="https://arxiv.org/pdf/2011.12948" target="_blank"
69
  class="external-link button is-normal is-rounded is-dark">
70
  <span class="icon">
71
  <i class="fas fa-file-pdf"></i>
@@ -74,7 +79,7 @@
74
  </a>
75
  </span>
76
  <span class="link-block">
77
- <a href="https://arxiv.org/abs/2011.12948" target="_blank"
78
  class="external-link button is-normal is-rounded is-dark">
79
  <span class="icon">
80
  <i class="ai ai-arxiv"></i>
@@ -82,19 +87,9 @@
82
  <span>arXiv</span>
83
  </a>
84
  </span>
85
- <!-- Video Link. -->
86
- <span class="link-block">
87
- <a href="https://www.youtube.com/watch?v=MrKrnHhk8IA" target="_blank"
88
- class="external-link button is-normal is-rounded is-dark">
89
- <span class="icon">
90
- <i class="fab fa-youtube"></i>
91
- </span>
92
- <span>Video</span>
93
- </a>
94
- </span>
95
  <!-- Code Link. -->
96
  <span class="link-block">
97
- <a href="https://github.com/google/nerfies" target="_blank"
98
  class="external-link button is-normal is-rounded is-dark">
99
  <span class="icon">
100
  <i class="fab fa-github"></i>
@@ -102,17 +97,10 @@
102
  <span>Code</span>
103
  </a>
104
  </span>
105
- <!-- Dataset Link. -->
106
- <span class="link-block">
107
- <a href="https://github.com/google/nerfies/releases/tag/0.1" target="_blank"
108
- class="external-link button is-normal is-rounded is-dark">
109
- <span class="icon">
110
- <i class="far fa-images"></i>
111
- </span>
112
- <span>Data</span>
113
- </a>
114
  </div>
115
-
 
 
116
  </div>
117
  </div>
118
  </div>
@@ -120,81 +108,40 @@
120
  </div>
121
  </section>
122
 
123
- <section class="hero teaser">
124
- <div class="container is-max-desktop">
125
- <div class="hero-body">
126
- <video id="teaser" autoplay muted loop playsinline height="100%">
127
- <source src="./static/videos/teaser.mp4"
128
- type="video/mp4">
129
- </video>
130
- <h2 class="subtitle has-text-centered">
131
- <span class="dnerf">Nerfies</span> turns selfie videos from your phone into
132
- free-viewpoint
133
- portraits.
134
- </h2>
135
- </div>
 
 
 
 
 
 
 
 
 
 
 
 
136
  </div>
137
  </section>
138
-
139
-
140
- <section class="hero is-light is-small">
141
- <div class="hero-body">
142
- <div class="container">
143
- <div id="results-carousel" class="carousel results-carousel">
144
- <div class="item item-steve">
145
- <video poster="" id="steve" autoplay controls muted loop playsinline height="100%">
146
- <source src="./static/videos/steve.mp4"
147
- type="video/mp4">
148
- </video>
149
- </div>
150
- <div class="item item-chair-tp">
151
- <video poster="" id="chair-tp" autoplay controls muted loop playsinline height="100%">
152
- <source src="./static/videos/chair-tp.mp4"
153
- type="video/mp4">
154
- </video>
155
- </div>
156
- <div class="item item-shiba">
157
- <video poster="" id="shiba" autoplay controls muted loop playsinline height="100%">
158
- <source src="./static/videos/shiba.mp4"
159
- type="video/mp4">
160
- </video>
161
- </div>
162
- <div class="item item-fullbody">
163
- <video poster="" id="fullbody" autoplay controls muted loop playsinline height="100%">
164
- <source src="./static/videos/fullbody.mp4"
165
- type="video/mp4">
166
- </video>
167
- </div>
168
- <div class="item item-blueshirt">
169
- <video poster="" id="blueshirt" autoplay controls muted loop playsinline height="100%">
170
- <source src="./static/videos/blueshirt.mp4"
171
- type="video/mp4">
172
- </video>
173
- </div>
174
- <div class="item item-mask">
175
- <video poster="" id="mask" autoplay controls muted loop playsinline height="100%">
176
- <source src="./static/videos/mask.mp4"
177
- type="video/mp4">
178
- </video>
179
- </div>
180
- <div class="item item-coffee">
181
- <video poster="" id="coffee" autoplay controls muted loop playsinline height="100%">
182
- <source src="./static/videos/coffee.mp4"
183
- type="video/mp4">
184
- </video>
185
- </div>
186
- <div class="item item-toby">
187
- <video poster="" id="toby" autoplay controls muted loop playsinline height="100%">
188
- <source src="./static/videos/toby2.mp4"
189
- type="video/mp4">
190
- </video>
191
- </div>
192
  </div>
193
- </div>
194
  </div>
195
  </section>
196
-
197
-
198
  <section class="section">
199
  <div class="container is-max-desktop">
200
  <!-- Abstract. -->
@@ -203,210 +150,364 @@
203
  <h2 class="title is-3">Abstract</h2>
204
  <div class="content has-text-justified">
205
  <p>
206
- We present the first method capable of photorealistically reconstructing a non-rigidly
207
- deforming scene using photos/videos captured casually from mobile phones.
 
208
  </p>
209
  <p>
210
- Our approach augments neural radiance fields
211
- (NeRF) by optimizing an
212
- additional continuous volumetric deformation field that warps each observed point into a
213
- canonical 5D NeRF.
214
- We observe that these NeRF-like deformation fields are prone to local minima, and
215
- propose a coarse-to-fine optimization method for coordinate-based models that allows for
216
- more robust optimization.
217
- By adapting principles from geometry processing and physical simulation to NeRF-like
218
- models, we propose an elastic regularization of the deformation field that further
219
- improves robustness.
220
  </p>
221
  <p>
222
- We show that <span class="dnerf">Nerfies</span> can turn casually captured selfie
223
- photos/videos into deformable NeRF
224
- models that allow for photorealistic renderings of the subject from arbitrary
225
- viewpoints, which we dub <i>"nerfies"</i>. We evaluate our method by collecting data
226
- using a
227
- rig with two mobile phones that take time-synchronized photos, yielding train/validation
228
- images of the same pose at different viewpoints. We show that our method faithfully
229
- reconstructs non-rigidly deforming scenes and reproduces unseen views with high
230
- fidelity.
231
  </p>
232
  </div>
233
  </div>
234
  </div>
235
  <!--/ Abstract. -->
236
-
237
- <!-- Paper video. -->
238
- <div class="columns is-centered has-text-centered">
239
- <div class="column is-four-fifths">
240
- <h2 class="title is-3">Video</h2>
241
- <div class="publication-video">
242
- <iframe src="https://www.youtube.com/embed/MrKrnHhk8IA?rel=0&amp;showinfo=0"
243
- frameborder="0" allow="autoplay; encrypted-media" allowfullscreen></iframe>
244
- </div>
245
- </div>
246
- </div>
247
- <!--/ Paper video. -->
248
  </div>
249
  </section>
250
 
251
 
252
  <section class="section">
253
  <div class="container is-max-desktop">
254
-
255
  <div class="columns is-centered">
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
 
257
- <!-- Visual Effects. -->
258
- <div class="column">
259
- <div class="content">
260
- <h2 class="title is-3">Visual Effects</h2>
261
- <p>
262
- Using <i>nerfies</i> you can create fun visual effects. This Dolly zoom effect
263
- would be impossible without nerfies since it would require going through a wall.
264
- </p>
265
- <video id="dollyzoom" autoplay controls muted loop playsinline height="100%">
266
- <source src="./static/videos/dollyzoom-stacked.mp4"
267
- type="video/mp4">
268
- </video>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
  </div>
270
- </div>
271
- <!--/ Visual Effects. -->
272
-
273
- <!-- Matting. -->
274
- <div class="column">
275
- <h2 class="title is-3">Matting</h2>
276
- <div class="columns is-centered">
277
- <div class="column content">
278
- <p>
279
- As a byproduct of our method, we can also solve the matting problem by ignoring
280
- samples that fall outside of a bounding box during rendering.
281
- </p>
282
- <video id="matting-video" controls playsinline height="100%">
283
- <source src="./static/videos/matting.mp4"
284
- type="video/mp4">
285
- </video>
286
- </div>
287
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
  </div>
289
  </div>
290
  </div>
291
- <!--/ Matting. -->
 
292
 
293
- <!-- Animation. -->
 
294
  <div class="columns is-centered">
295
  <div class="column is-full-width">
296
- <h2 class="title is-3">Animation</h2>
297
-
298
- <!-- Interpolating. -->
299
- <h3 class="title is-4">Interpolating states</h3>
300
  <div class="content has-text-justified">
301
  <p>
302
- We can also animate the scene by interpolating the deformation latent codes of two input
303
- frames. Use the slider here to linearly interpolate between the left frame and the right
304
- frame.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
  </p>
306
- </div>
307
- <div class="columns is-vcentered interpolation-panel">
308
- <div class="column is-3 has-text-centered">
309
- <img src="./static/images/interpolate_start.jpg"
310
- class="interpolation-image"
311
- alt="Interpolate start reference image."/>
312
- <p>Start Frame</p>
313
- </div>
314
- <div class="column interpolation-video-column">
315
- <div id="interpolation-image-wrapper">
316
- Loading...
317
- </div>
318
- <input class="slider is-fullwidth is-large is-info"
319
- id="interpolation-slider"
320
- step="1" min="0" max="100" value="0" type="range">
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321
  </div>
322
- <div class="column is-3 has-text-centered">
323
- <img src="./static/images/interpolate_end.jpg"
324
- class="interpolation-image"
325
- alt="Interpolation end reference image."/>
326
- <p class="is-bold">End Frame</p>
327
- </div>
328
- </div>
329
- <br/>
330
- <!--/ Interpolating. -->
331
 
332
- <!-- Re-rendering. -->
333
- <h3 class="title is-4">Re-rendering the input video</h3>
334
- <div class="content has-text-justified">
335
  <p>
336
- Using <span class="dnerf">Nerfies</span>, you can re-render a video from a novel
337
- viewpoint such as a stabilized camera by playing back the training deformations.
338
  </p>
 
 
 
 
 
 
 
339
  </div>
340
- <div class="content has-text-centered">
341
- <video id="replay-video"
342
- controls
343
- muted
344
- preload
345
- playsinline
346
- width="75%">
347
- <source src="./static/videos/replay.mp4"
348
- type="video/mp4">
349
- </video>
350
- </div>
351
- <!--/ Re-rendering. -->
352
-
353
  </div>
354
  </div>
355
- <!--/ Animation. -->
356
-
357
 
358
- <!-- Concurrent Work. -->
 
359
  <div class="columns is-centered">
360
  <div class="column is-full-width">
361
- <h2 class="title is-3">Related Links</h2>
362
-
363
  <div class="content has-text-justified">
364
  <p>
365
- There's a lot of excellent work that was introduced around the same time as ours.
 
 
 
 
 
 
366
  </p>
367
  <p>
368
- <a href="https://arxiv.org/abs/2104.09125" target="_blank">Progressive Encoding for Neural Optimization</a> introduces an idea similar to our windowed position encoding for coarse-to-fine optimization.
369
- </p>
370
- <p>
371
- <a href="https://www.albertpumarola.com/research/D-NeRF/index.html" target="_blank">D-NeRF</a> and <a href="https://gvv.mpi-inf.mpg.de/projects/nonrigid_nerf/" target="_blank">NR-NeRF</a>
372
- both use deformation fields to model non-rigid scenes.
373
  </p>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
374
  <p>
375
- Some works model videos with a NeRF by directly modulating the density, such as <a href="https://video-nerf.github.io/" target="_blank">Video-NeRF</a>, <a href="https://www.cs.cornell.edu/~zl548/NSFF/" target="_blank">NSFF</a>, and <a href="https://neural-3d-video.github.io/" target="_blank">DyNeRF</a>
 
 
 
 
 
376
  </p>
377
  <p>
378
- There are probably many more by the time you are reading this. Check out <a href="https://dellaert.github.io/NeRF/" target="_blank">Frank Dellaert's survey on recent NeRF papers</a>, and <a href="https://github.com/yenchenlin/awesome-NeRF" target="_blank">Yen-Chen Lin's curated list of NeRF papers</a>.
 
 
 
 
 
 
 
 
379
  </p>
380
  </div>
381
  </div>
382
  </div>
383
- <!--/ Concurrent Work. -->
384
-
385
  </div>
386
  </section>
387
 
388
-
389
  <section class="section" id="BibTeX">
390
  <div class="container is-max-desktop content">
391
  <h2 class="title">BibTeX</h2>
392
- <pre><code>@article{park2021nerfies,
393
- author = {Park, Keunhong and Sinha, Utkarsh and Barron, Jonathan T. and Bouaziz, Sofien and Goldman, Dan B and Seitz, Steven M. and Martin-Brualla, Ricardo},
394
- title = {Nerfies: Deformable Neural Radiance Fields},
395
- journal = {ICCV},
396
- year = {2021},
397
  }</code></pre>
398
  </div>
399
  </section>
400
 
401
-
402
  <footer class="footer">
403
  <div class="container">
404
  <div class="content has-text-centered">
405
- <a class="icon-link" target="_blank"
406
- href="./static/videos/nerfies_paper.pdf">
407
  <i class="fas fa-file-pdf"></i>
408
  </a>
409
- <a class="icon-link" href="https://github.com/keunhong" target="_blank" class="external-link" disabled>
410
  <i class="fab fa-github"></i>
411
  </a>
412
  </div>
@@ -414,17 +515,10 @@
414
  <div class="column is-8">
415
  <div class="content">
416
  <p>
417
- This website is licensed under a <a rel="license" target="_blank"
418
- href="http://creativecommons.org/licenses/by-sa/4.0/">Creative
419
  Commons Attribution-ShareAlike 4.0 International License</a>.
420
  </p>
421
- <p>
422
- This means you are free to borrow the <a target="_blank"
423
- href="https://github.com/nerfies/nerfies.github.io">source code</a> of this website,
424
- we just ask that you link back to this page in the footer.
425
- Please remember to remove the analytics code included in the header of the website which
426
- you do not want on your website.
427
- </p>
428
  </div>
429
  </div>
430
  </div>
 
3
  <head>
4
  <meta charset="utf-8">
5
  <meta name="description"
6
+ content="Certified Self-Consistency: Statistical Guarantees and Test-Time Training for Reliable Reasoning in LLMs.">
 
7
  <meta name="viewport" content="width=device-width, initial-scale=1">
8
+ <title>Certified Self-Consistency: Statistical Guarantees and Test-Time Training for Reliable Reasoning in LLMs</title>
9
 
10
  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
11
  rel="stylesheet">
12
 
13
+ <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/bulma/0.9.4/css/bulma.min.css">
14
+ <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
 
 
15
  <link rel="stylesheet"
16
  href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
17
+
18
+ <style>
19
+ .publication-title {
20
+ font-family: 'Google Sans', sans-serif;
21
+ }
22
+ .publication-authors {
23
+ font-family: 'Google Sans', sans-serif;
24
+ }
25
+ .dnerf {
26
+ font-weight: bold;
27
+ color: #3273dc;
28
+ }
29
+
30
+ h1.title,
31
+ h2.title,
32
+ h3.title,
33
+ h2.subtitle,
34
+ h3.subtitle {
35
+ text-align: center;
36
+ }
37
+ .objective-list {
38
+ list-style-type: lower-roman;
39
+ padding-left: 1.5em;
40
+ }
41
+ .objective-title {
42
+ font-weight: bold;
43
+ }
44
+ </style>
45
 
46
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
47
+ <script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
48
+
 
 
49
  </head>
50
  <body>
51
 
 
54
  <div class="container is-max-desktop">
55
  <div class="columns is-centered">
56
  <div class="column has-text-centered">
57
+ <h1 class="title is-1 publication-title">Certified Self-Consistency: Statistical Guarantees and Test-Time Training for Reliable Reasoning in LLMs</h1>
58
  <div class="is-size-5 publication-authors">
59
  <span class="author-block">
60
+ <a href="https://paulaoak.github.io/">Paula Cordero-Encinar</a><sup>1</sup>,</span>
 
 
 
 
 
 
 
 
61
  <span class="author-block">
62
+ <a href="https://www.ma.imperial.ac.uk/~aduncan/">Andrew B. Duncan</a><sup>1</sup></span>
 
 
 
 
 
 
 
63
  </div>
64
 
65
  <div class="is-size-5 publication-authors">
66
+ <span class="author-block"><sup>1</sup>Imperial College London</span>
 
67
  </div>
68
 
69
  <div class="column has-text-centered">
70
  <div class="publication-links">
71
  <!-- PDF Link. -->
72
  <span class="link-block">
73
+ <a href="https://arxiv.org/pdf/2510.17472"
74
  class="external-link button is-normal is-rounded is-dark">
75
  <span class="icon">
76
  <i class="fas fa-file-pdf"></i>
 
79
  </a>
80
  </span>
81
  <span class="link-block">
82
+ <a href="https://arxiv.org/abs/2510.17472"
83
  class="external-link button is-normal is-rounded is-dark">
84
  <span class="icon">
85
  <i class="ai ai-arxiv"></i>
 
87
  <span>arXiv</span>
88
  </a>
89
  </span>
 
 
 
 
 
 
 
 
 
 
90
  <!-- Code Link. -->
91
  <span class="link-block">
92
+ <a href="https://github.com/paulaoak/certified_self_consistency"
93
  class="external-link button is-normal is-rounded is-dark">
94
  <span class="icon">
95
  <i class="fab fa-github"></i>
 
97
  <span>Code</span>
98
  </a>
99
  </span>
 
 
 
 
 
 
 
 
 
100
  </div>
101
+ <div class="is-size-5 mt-3">
102
+ <span class="has-text-weight-bold">TLDR:</span> We provide a unified statistical framework of when and why self-consistency yields certifiable reliability in reasoning models, and how test-time adaptation can further reduce the computational cost of this certification.
103
+ </div>
104
  </div>
105
  </div>
106
  </div>
 
108
  </div>
109
  </section>
110
 
111
+ <section class="section" style="padding: 10px 0;">
112
+ <div class="container is-max-desktop">
113
+ <div class="columns is-centered">
114
+ <div class="column is-full-width">
115
+ <div class="content has-text-justified">
116
+ <img src="condorcet_framework.png" alt="Certified self-consistency workflow" style="width: 100%;">
117
+ <figcaption style="color:#6b7280; font-size: 0.9rem; margin-top: 8px;">
118
+ Given a prompt, the model generates multiple reasoning rollouts from the
119
+ reference distribution \(\pi_{\mathrm{ref}}(\cdot|{pr})\).
120
+ The resulting terminal answers are aggregated via majority voting, viewed
121
+ as mode estimation under sampling uncertainty.
122
+ The Martingale Majority Certificate (MMC) monitors the empirical margin and
123
+ provides an <em>anytime-valid</em> stopping rule for certification.
124
+ Test-time training with SNR or entropy-based adaptation sharpens the
125
+ terminal distribution, thereby increasing the
126
+ signal-to-noise ratio (SNR) and reducing the number of samples required for
127
+ certification.</figcaption>
128
+ </div>
129
+ <div style="text-align:center; margin: 24px 0;">
130
+ <img src="mmc_point_shared.gif" alt="MMC stopping rule in action" style="width: 80%;">
131
+ <figcaption style="color:#6b7280; font-size: 0.9rem; margin-top: 8px;">
132
+ MMC stopping rule in action.</figcaption>
133
+ </div>
134
+ </div>
135
+ </div>
136
  </div>
137
  </section>
138
+ <section class="hero">
139
+ <div class="container is-max-desktop">
140
+ <div class="hero-body">
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  </div>
 
142
  </div>
143
  </section>
144
+
 
145
  <section class="section">
146
  <div class="container is-max-desktop">
147
  <!-- Abstract. -->
 
150
  <h2 class="title is-3">Abstract</h2>
151
  <div class="content has-text-justified">
152
  <p>
153
+ Recent advances such as self-consistency and test-time reinforcement learning (TTRL) improve the
154
+ reliability of large language models (LLMs) without additional supervision, yet their underlying
155
+ mechanisms and statistical guarantees remain poorly understood.
156
  </p>
157
  <p>
158
+ We present a unified framework for certifiable inference in LLMs, showing that majority voting provides a
159
+ statistical certificate of self-consistency: under mild assumptions, the aggregated answer coincides with
160
+ the mode of the model’s terminal distribution with high probability. We derive finite-sample and anytime-valid
161
+ concentration bounds that quantify this confidence, and introduce the Martingale Majority Certificate (MMC), a
162
+ sequential stopping rule that adaptively determines when sufficient samples have been drawn.
 
 
 
 
 
163
  </p>
164
  <p>
165
+ We further prove that label-free post-training methods such as TTRL implicitly sharpen the answer distribution
166
+ by exponentially tilting it toward its mode, thereby reducing the number of samples required for certification.
167
+ Building on this insight, we propose new post-training objectives that explicitly optimise this trade-off between
168
+ sharpness and bias. Together, these results explain and connect two central test-time scaling strategies,
169
+ self-consistency and TTRL, within a single statistical framework for label-free, certifiable reliability in
170
+ reasoning LLMs.
 
 
 
171
  </p>
172
  </div>
173
  </div>
174
  </div>
175
  <!--/ Abstract. -->
 
 
 
 
 
 
 
 
 
 
 
 
176
  </div>
177
  </section>
178
 
179
 
180
  <section class="section">
181
  <div class="container is-max-desktop">
 
182
  <div class="columns is-centered">
183
+ <div class="column is-full-width">
184
+ <h3 class="title is-4">Setting</h3>
185
+ <div class="content has-text-justified">
186
+ <p>
187
+ LLM rollouts can be formalised as a stochastic decoding process
188
+ \[
189
+ (Y_t)_{t \ge 0}, \quad Y_t \in \mathcal{V},
190
+ \]
191
+ where \( \mathcal{V} \) is the vocabulary and the process is initialised by a prompt \( pr \).
192
+ At each step the model samples
193
+ \[
194
+ Y_{t+1} \sim \pi_\phi(\cdot \mid Y_{\le t}, pr),
195
+ \]
196
+ from a conditional policy parametrised by weights \( \phi \).
197
+ The <em>thinking phase</em> consists of the random evolution of this sequence until a termination token is produced,
198
+ at which point the model emits the response, starting from a random stopping time \( \tau \).
199
+ We denote by
200
+ \[
201
+ X := g(Y_{\tau:}) \in \mathcal{A}
202
+ \]
203
+ the canonicalised terminal answer, obtained by applying a deterministic extraction map \( g \).
204
+ The induced terminal distribution \( \mathbf{p} = \mathrm{Law}(X) \) over the answer set \( \mathcal{A} \) captures the model’s epistemic uncertainty about its own final output.
205
+ In an ideal reasoning model, we would like rollouts to exhibit rich variability in \( Y_{1:\tau-1} \) (the reasoning trajectories), yet concentrate mass in the final answer \( X \) (the outcome).
206
+ That is, we seek <em>diversity over reasoning paths, but consistency over terminal responses</em>.
207
+ </p>
208
+
209
+ <p>
210
+ In supervised or verifier-equipped settings, correctness can be externally validated.
211
+ In open-ended reasoning tasks, such supervision is unavailable.
212
+ In the absence of external rewards, a model must act relative to its own uncertainty.
213
+ Letting \( a \in \mathcal{A} \) denote the chosen output and \( X \sim \mathbf{p} \) the stochastic model response, the expected 0–1 loss is \( \mathbb{E}[1\{a \neq X\}] \).
214
+ The Bayes-optimal decision minimising this loss is the mode
215
+ </p>
216
+
217
+ <p>
218
+ \[
219
+ c^\star = \arg\max_j p_j,
220
+ \]
221
+ </p>
222
+
223
+ <p>
224
+ which corresponds to the model’s most probable self-consistent answer.
225
+ Hence, under symmetric loss, recovering the mode is the optimal <em>model-relative</em> prediction.
226
+ When a verifier is absent, certifying that a model’s reported answer coincides with this mode provides a natural measure of reliability.
227
+ </p>
228
+ </div>
229
 
230
+ <h3 class="title is-4">Statistical Certificates of Self-Consistency</h3>
231
+ <div class="content has-text-justified">
232
+ <p>
233
+ In practice, the terminal probabilities \( \mathbf{p} \) are unknown and can be estimated only through multiple
234
+ independent rollouts \( X_1,\ldots,X_n \).
235
+ The simplest estimator of the mode is the <em>majority vote</em>
236
+ </p>
237
+
238
+ <p>
239
+ \[
240
+ \widehat{c}_n := \arg\max_j \hat{p}_{n,j},
241
+ \qquad
242
+ \hat{p}_{n,j} = \frac{1}{n}\sum_{i=1}^{n}\mathbf{1}\{X_i=j\}.
243
+ \]
244
+ </p>
245
+
246
+ <p>
247
+ This estimator forms the basis of <em>self-consistency</em> test-time scaling.
248
+ From a statistical standpoint, majority voting is the Bayes-optimal estimator of \( c^\star \) under 0–1 loss,
249
+ and an associated upper bound on \( \mathbb{P}[\widehat{c}_n \neq c^\star] \) provides a
250
+ <em>statistical certificate of self-consistency</em>: a quantitative guarantee that the aggregated answer
251
+ coincides with the mode of the terminal law \( \mathbf{p} \) with high probability.
252
+ </p>
253
+
254
+ <p>
255
+ Under standard regularity conditions the majority-vote estimator is consistent, \( \Pr[\widehat{c}_n = c^\star] \to 1 \) as \( n \to \infty \).
256
+ <strong>A more practical question concerns the finite-sample regime: how large must \( n \) be to guarantee, with
257
+ confidence \( 1-\varepsilon \), that \( \widehat{c}_n \) already equals \( c^\star \)?</strong>
258
+ </p>
259
+
260
+ <p>
261
+ To address this, we derive finite-sample and asymptotic certificates, leveraging Hoeffding, Bernstein,
262
+ Chernoff–Markov, and Sanov concentration bounds for the error probability \( \mathbb{P}[\widehat{c}_n \neq c^\star] \).
263
+ These bounds clarify how reliability scales with the ensemble size and with the <em>mode margin</em>
264
+ \( \delta = p_{c^\star} - p_{j^\star} \), i.e., the gap between the top two answer probabilities.
265
+ </p>
266
+
267
+ <p>
268
+ If the probabilities \( p_j \) were known, one could invert these bounds to determine the number of samples required
269
+ to achieve a desired confidence \( 1-\varepsilon \).
270
+ In reality, both \( p_j \) and \( \delta \) must be estimated on the fly.
271
+ This motivates a <em>sequential</em> formulation: <strong>as rollouts arrive, can we determine adaptively when the current majority
272
+ is statistically reliable?</strong>
273
+
274
+ We introduce the <em>Martingale Majority Certificate (MMC)</em>, a sequential procedure that adaptively tests whether the empirical leader remains significantly ahead of its nearest rival and
275
+ of all others combined. This guarantees that at the (random) stopping time \( \tau \), majority vote coincides with the true mode with high probability:
276
+ </p>
277
+
278
+ <p>
279
+ \[
280
+ \Pr[\widehat{c}_{n_\tau} \neq c^\star] \le \varepsilon,
281
+ \]
282
+ </p>
283
+
284
+ <p>
285
+ thus providing an <em>anytime-valid certificate</em> of model self-consistency.
286
+ </p>
287
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
 
289
+ <h3 class="title is-4">Martingale Majority Certificate Stopping Rule</h3>
290
+ <div class="content has-text-justified">
291
+ <p>
292
+ Our proposed stopping rule adaptively decides when to stop sampling rollouts while controlling the error of returning the empirical majority.
293
+ </p>
294
+ <p>
295
+ The central challenge in the LLM setting is the potentially large number of possible outcomes.
296
+ A naive stopping rule would require pairwise comparisons of the empirical probabilities across all classes
297
+ \( i \neq j \), \( i,j \in \{1, \dots, k\} \), which becomes computationally prohibitive as \( k \) grows.
298
+ </p>
299
+
300
+ <p>
301
+ To address this, we exploit the observation that the mass of the terminal law is typically concentrated on a few classes \( m \ll k \).
302
+ Thus, instead of considering all classes individually, we aggregate votes into three categories:
303
+ <ul>
304
+ <li>the current leader \( \widehat{c}_n \),</li>
305
+ <li>the runner-up,</li>
306
+ <li>all the <em>others</em>.</li>
307
+ </ul>
308
+ </p>
309
+ <p>
310
+ Accordingly, we perform two tests: leader vs runner-up and leader vs <em>others</em>.
311
+ </p>
312
+ <div style="text-align:center; margin: 24px 0;">
313
+ <img src="mmc_algorithm.png" alt="MMC algorithm" width="70%">
314
+ </div>
315
  </div>
316
  </div>
317
  </div>
318
+ </div>
319
+ </section>
320
 
321
+ <section class="section">
322
+ <div class="container is-max-desktop">
323
  <div class="columns is-centered">
324
  <div class="column is-full-width">
325
+ <h3 class="title is-4">Optimising Sample Efficiency with Test-Time Training</h3>
 
 
 
326
  <div class="content has-text-justified">
327
  <p>
328
+ Our ultimate goal is to minimise the number of samples required from the LLM for the majority vote
329
+ to return the correct answer with high confidence \(1-\varepsilon\). The expected stopping time of the MMC scales approximately as
330
+ <span id="eq-expected_number_samples">
331
+ \[
332
+ N \;\approx\;
333
+ \frac{2(p_{\hat c}+p_{j^\star})}{(p_{\hat c}-p_{j^\star})^{2}} \,\log \frac{1}{\varepsilon},
334
+ \]
335
+ </span>
336
+ so that small mode margins
337
+ <span>\( \delta = p_{\hat c}-p_{j^\star} \)</span>
338
+ lead to rapidly increasing sample requirements.
339
+ </p>
340
+ <p>
341
+ <strong>The key question is whether test-time adaptation can reshape the terminal distribution to enlarge this margin, thereby improving sample efficiency.</strong>
342
+ </p>
343
+ <p>
344
+ We show that the optimal policy corresponding to the KL-regularised objective proposed in <a href="https://arxiv.org/pdf/2504.16084">TTRL</a> is an exponentially tilted version of the base model.
345
+ Decreasing the regularisation parameter consistently increases the margin and reduces the number of samples required for certification.
346
  </p>
347
+ <p><strong style="font-size: 1.3em;">Two new test-time RL objectives</strong></p>
348
+
349
+ <p>
350
+ We introduce two label-free group-level rewards designed to optimise the trade-off between sharpness
351
+ and bias. Let \( \mathbf{X} = (X_1, \dots, X_n) \) be a set of answers arising from rollouts
352
+ \( \mathbf{Y} =(Y_1, \ldots, Y_n) \) for a given prompt, with \( \widehat{c}_n \) denoting the majority vote
353
+ and \( j_n^\star \) the runner-up. Define \( N_j = \sum_i \mathbf{1}\{X_i=j\} \).
354
+ </p>
355
+
356
+ <ol class="objective-list">
357
+ <li>
358
+ <span class="objective-title">SNR-based reward.</span>
359
+ <p>
360
+ Directly leveraging the SNR as a driving factor in the efficiency of the MMC scheme, we introduce the first reward
361
+ </p>
362
+ <p>
363
+ \[
364
+ r^{(1)}_n(\mathbf{Y})
365
+ = \widehat{\mathrm{SNR}}(\Delta_{j^\star_n})(\mathbf{X})
366
+ = \frac{(N_{\widehat c_n}-N_{j^\star_n})^{2}}
367
+ {n \left(N_{\widehat c_n}+N_{j^\star_n}\right)
368
+ -(N_{\widehat c_n}-N_{j^\star_n})^{2}}
369
+ \;\xrightarrow[n\to\infty]{}\;
370
+ \mathrm{SNR}(\Delta_{j^\star_n}).
371
+ \]
372
+ </p>
373
+ <p>
374
+ This objective aims to directly maximise \( \text{SNR}(\Delta_{j_n^\star}) \), which is equivalent to minimising the expected
375
+ number of samples required to obtain statistical certificates for the majority vote.
376
+ </p>
377
+ </li>
378
+
379
+ <li>
380
+ <span class="objective-title">Entropy-based reward.</span>
381
+ <p>
382
+ As we want to encourage a more peaked terminal distribution, another natural option is negative entropy, i.e.
383
+ </p>
384
+ <p>
385
+ \[
386
+ r^{(2)}_n(\mathbf{Y})
387
+ = \widehat H_n(\mathbf{X})
388
+ = \sum_{j:N_j>0}\frac{N_j}{n} \log \frac{N_j}{n}
389
+ \;\xrightarrow[n\to\infty]{}\;
390
+ \sum_j p_j \log p_j = -H(p).
391
+ \]
392
+ </p>
393
+ <p>
394
+ Maximising \( \widehat H_n \) <em>minimises</em> the Shannon entropy of the answer
395
+ distribution, encouraging a sharper, lower-entropy terminal distribution.
396
+ 🚨<strong>Important:</strong> The tempering sharpens only the distribution of final answers, not the full sequence distribution.
397
+ This gives us the best of both worlds: promoting certainty when providing a final answer, but permitting exploration of diverse
398
+ pathways during the chain-of-thought reasoning process.
399
+ </p>
400
+ </li>
401
+ </ol>
402
+ <div style="text-align:center; margin: 24px 0;">
403
+ <img src="ttt_performance_math500.png" alt="Pass@1 performance on MATH-500 after test-time training, relative to the base models" width="100%">
404
+ <figcaption style="color:#6b7280; font-size: 0.9rem; margin-top: 8px;">
405
+ Pass@1 performance after test-time training with SNR- and entropy-based rewards, relative to the base models.
406
+ </figcaption>
407
  </div>
 
 
 
 
 
 
 
 
 
408
 
 
 
 
409
  <p>
410
+ We observe in the table below that the number of samples required under the MMC stopping rule decreases after applying test-time training, relative to the pre-trained model.
411
+ That is, test-time training sharpens the terminal answer distribution, increasing the mode margin and thus reducing the number of samples required for certification.
412
  </p>
413
+ <div style="text-align:center; margin: 24px 0;">
414
+ <img src="table_mmc.png" alt="Table of majority vote accuracy and number of samples required under the MMC stopping rule" width="75%">
415
+ <figcaption style="color:#6b7280; font-size: 0.9rem; margin-top: 8px;">
416
+ Majority vote accuracy and required number of samples under the MMC stopping rule (✅) at error levels \(\varepsilon = 0.1\) and \(\varepsilon = 0.4\) for the pre-trained model and after test-time training with SNR-based rewards. Performance is compared to that obtained using the full sample budget (❌).
417
+ </figcaption>
418
+ </div>
419
+
420
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
421
  </div>
422
  </div>
423
+ </div>
424
+ </section>
425
 
426
+ <section class="section">
427
+ <div class="container is-max-desktop">
428
  <div class="columns is-centered">
429
  <div class="column is-full-width">
430
+ <h3 class="title is-4">SNR as a label-free estimator of task difficulty</h3>
 
431
  <div class="content has-text-justified">
432
  <p>
433
+ Our experiments reveal a notable empirical regularity: the
434
+ <em>signal-to-noise ratio</em> (SNR) of the margin variable
435
+ \(\Delta_{j^\star} = \mathbf 1\{X = c^\star\} - \mathbf 1\{X = j^\star\}\),
436
+ which quantifies the sharpness of the model’s terminal answer distribution,
437
+ correlates strongly with external measures of problem difficulty.
438
+ Across the MATH-500 benchmark, harder problems exhibit systematically lower and more variable SNR values,
439
+ while easier problems yield sharply peaked distributions concentrated around a single answer.
440
  </p>
441
  <p>
442
+ This behaviour is non-trivial: the model has no access to ground-truth difficulty labels, yet its own epistemic
443
+ uncertainty, reflected in the variability of its rollouts, aligns closely with these labels.
444
+ <strong>This suggests an emergent form of calibration in reasoning LLMs</strong>:
445
+ without explicit supervision or external verification, models appear to “know when they do not know.”
446
+ In statistical terms, the SNR acts as a label-free proxy for epistemic uncertainty and, consequently, for task difficulty.
447
  </p>
448
+ <div style="text-align:center; margin: 24px 0;">
449
+ <img src="QWEN-MATH-1.5B_violin_maj100_SNR.png" alt="SNR distribution qwen-math-1.5B." style="width: 48%;margin-right: 1%;">
450
+ <img src="QWEN-MATH-7B_violin_maj100_SNR.png" alt="SNR distribution qwen-math-7B." style="width: 48%;margin-left: 1%;">
451
+ <figcaption style="color:#6b7280; font-size: 0.9rem; margin-top: 8px;">
452
+ Distribution of the estimated SNR when using the MMC stopping rule with \(\varepsilon = 0.1\) and \(N_{\text{budget}}=100\). Results are obtained after applying test-time training with SNR-based rewards.</figcaption>
453
+ </div>
454
+ </div>
455
+ </div>
456
+ </div>
457
+ </div>
458
+ </section>
459
+
460
+ <section class="section">
461
+ <div class="container is-max-desktop">
462
+ <div class="columns is-centered">
463
+ <div class="column is-full-width">
464
+ <h3 class="title is-4">Conclusion</h3>
465
+
466
+ <div class="content has-text-justified">
467
  <p>
468
+ <strong>Our results unify several strands of recent work on reliable inference in LLMs — self-consistency,
469
+ adaptive compute allocation, and test-time reinforcement learning (TTRL) — under a common
470
+ statistical perspective.</strong> Through this lens, majority voting emerges naturally as a means of estimating the mode of the terminal distribution.
471
+ The validity of the majority vote as an estimate of the mode can be certified by finite-sample and asymptotic bounds. The Martingale Majority Certificate (MMC)
472
+ extends this view by providing an operational test-time algorithm that determines, from model
473
+ rollouts alone, when a response is statistically self-consistent.
474
  </p>
475
  <p>
476
+ Furthermore, <strong>we shed light on the underlying mechanism by which TTRL and related post-training
477
+ approaches improve reasoning reliability: KL-regularised optimisation corresponds to an
478
+ exponential tilting of the terminal law, sharpening it around its mode and increasing the
479
+ signal-to-noise ratio (SNR) of the margin variable.</strong> This insight explains empirical observations of
480
+ enhanced consistency after test-time adaptation, and motivates new label-free objectives such as
481
+ our SNR- and entropy-based rewards, which explicitly target this trade-off between sharpness and
482
+ bias. Unlike prior work that tunes temperature or per-token distributions, our formulation operates
483
+ on the terminal marginal, preserving exploration during reasoning while promoting confidence in the
484
+ final answer.
485
  </p>
486
  </div>
487
  </div>
488
  </div>
 
 
489
  </div>
490
  </section>
491
 
 
492
  <section class="section" id="BibTeX">
493
  <div class="container is-max-desktop content">
494
  <h2 class="title">BibTeX</h2>
495
+ <pre><code>@article{corderoencinar2025certified,
496
+ author = {Paula Cordero-Encinar and Andrew B. Duncan},
497
+ title = {Certified Self-Consistency: Statistical Guarantees and Test-Time Training for Reliable Reasoning in LLMs},
498
+ journal = {arXiv:2510.17472},
499
+ year = {2025},
500
  }</code></pre>
501
  </div>
502
  </section>
503
 
 
504
  <footer class="footer">
505
  <div class="container">
506
  <div class="content has-text-centered">
507
+ <a class="icon-link" href="https://arxiv.org/pdf/2510.17472" aria-label="Paper PDF on arXiv">
 
508
  <i class="fas fa-file-pdf"></i>
509
  </a>
510
+ <a class="icon-link" href="https://github.com/paulaoak/certified_self_consistency" aria-label="Code repository on GitHub">
511
  <i class="fab fa-github"></i>
512
  </a>
513
  </div>
 
515
  <div class="column is-8">
516
  <div class="content">
517
  <p>
518
+ This website template is borrowed from <a href="https://nerfies.github.io/">Nerfies</a>,
519
+ licensed under a <a rel="license" href="https://creativecommons.org/licenses/by-sa/4.0/">Creative
520
  Commons Attribution-ShareAlike 4.0 International License</a>.
521
  </p>
 
 
 
 
 
 
 
522
  </div>
523
  </div>
524
  </div>