Update index.html
index.html (+368 -274)
[Removed (previous Nerfies template content, shown truncated by the diff viewer): the old page head with keywords meta, title, and local Bulma/Font Awesome stylesheets and scripts; the original author list and Google Research affiliation; the Video and Data link buttons; the teaser video carousel; the Nerfies abstract and embedded YouTube paper video; the Visual Effects/Matting, Interpolating states, Re-rendering and Concurrent Work sections; the template BibTeX stub; and the footer note about borrowing the website source code.]

<head>
  <meta charset="utf-8">
  <meta name="description"
        content="Certified Self-Consistency: Statistical Guarantees and Test-Time Training for Reliable Reasoning in LLMs.">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>Certified Self-Consistency: Statistical Guarantees and Test-Time Training for Reliable Reasoning in LLMs</title>

  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
        rel="stylesheet">

  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/bulma/0.9.4/css/bulma.min.css">
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
  <link rel="stylesheet"
        href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">

  <style>
    .publication-title {
      font-family: 'Google Sans', sans-serif;
    }
    .publication-authors {
      font-family: 'Google Sans', sans-serif;
    }
    .dnerf {
      font-weight: bold;
      color: #3273dc;
    }

    h1.title,
    h2.title,
    h3.title,
    h2.subtitle,
    h3.subtitle {
      text-align: center;
    }
    .objective-list {
      list-style-type: lower-roman;
      padding-left: 1.5em;
    }
    .objective-title {
      font-weight: bold;
    }
  </style>

  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
</head>
<body>

  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column has-text-centered">
        <h1 class="title is-1 publication-title">Certified Self-Consistency: Statistical Guarantees and Test-Time Training for Reliable Reasoning in LLMs</h1>
        <div class="is-size-5 publication-authors">
          <span class="author-block">
            <a href="https://paulaoak.github.io/">Paula Cordero-Encinar</a><sup>1</sup>,</span>
          <span class="author-block">
            <a href="https://www.ma.imperial.ac.uk/~aduncan/">Andrew B. Duncan</a><sup>1</sup></span>
        </div>

        <div class="is-size-5 publication-authors">
          <span class="author-block"><sup>1</sup>Imperial College London</span>
        </div>

        <div class="column has-text-centered">
          <div class="publication-links">
            <!-- PDF Link. -->
            <span class="link-block">
              <a href="https://arxiv.org/pdf/2510.17472"
                 class="external-link button is-normal is-rounded is-dark">
                <span class="icon">
                  <i class="fas fa-file-pdf"></i>
                </span>
              </a>
            </span>
            <span class="link-block">
              <a href="https://arxiv.org/abs/2510.17472"
                 class="external-link button is-normal is-rounded is-dark">
                <span class="icon">
                  <i class="ai ai-arxiv"></i>
                </span>
                <span>arXiv</span>
              </a>
            </span>
            <!-- Code Link. -->
            <span class="link-block">
              <a href="https://github.com/paulaoak/certified_self_consistency"
                 class="external-link button is-normal is-rounded is-dark">
                <span class="icon">
                  <i class="fab fa-github"></i>
                </span>
                <span>Code</span>
              </a>
            </span>
          </div>
          <div class="is-size-5 mt-3">
            <span class="has-text-weight-bold">TL;DR:</span> We provide a unified statistical framework showing when and why self-consistency yields certifiable reliability in reasoning models, and how test-time adaptation can further reduce the computational cost of this certification.
          </div>
        </div>
      </div>
    </div>
  </div>
</section>

|
| 112 |
+
<div class="container is-max-desktop">
|
| 113 |
+
<div class="columns is-centered">
|
| 114 |
+
<div class="column is-full-width">
|
| 115 |
+
<div class="content has-text-justified">
|
| 116 |
+
<img src="condorcet_framework.png" alt="Certified self-consistency workflow" style="width: 100%;">
|
| 117 |
+
<figcaption style="color:#6b7280; font-size: 0.9rem; margin-top: 8px;">
|
| 118 |
+
Given a prompt, the model generates multiple reasoning rollouts from the
|
| 119 |
+
reference distribution \(\pi_{\mathrm{ref}}(\cdot|{pr})\).
|
| 120 |
+
The resulting terminal answers are aggregated via majority voting, viewed
|
| 121 |
+
as mode estimation under sampling uncertainty.
|
| 122 |
+
The Martingale Majority Certificate (MMC) monitors the empirical margin and
|
| 123 |
+
provides an <em>anytime-valid</em> stopping rule for certification.
|
| 124 |
+
Test-time training with SNR or entropy-based adaptation sharpens the
|
| 125 |
+
terminal distribution, thereby increasing the
|
| 126 |
+
signal-to-noise ratio (SNR) and reducing the number of samples required for
|
| 127 |
+
certification.
|
| 128 |
+
</div>
|
| 129 |
+
<div style="text-align:center; margin: 24px 0;">
|
| 130 |
+
<img src="mmc_point_shared.gif" alt="MMC stopping rule in action" style="width: 80%;">
|
| 131 |
+
<figcaption style="color:#6b7280; font-size: 0.9rem; margin-top: 8px;">
|
| 132 |
+
MMC stopping rule in action.
|
| 133 |
+
</div>
|
| 134 |
+
</div>
|
| 135 |
+
</div>
|
| 136 |
</div>
|
| 137 |
</section>
|
| 138 |
+
<section class="hero">
|
| 139 |
+
<div class="container is-max-desktop">
|
| 140 |
+
<div class="hero-body">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
</div>
|
|
|
|
| 142 |
</div>
|
| 143 |
</section>
|
| 144 |
+
|
|
|
|
| 145 |
<section class="section">
  <div class="container is-max-desktop">
    <!-- Abstract. -->
        <h2 class="title is-3">Abstract</h2>
        <div class="content has-text-justified">
          <p>
            Recent advances such as self-consistency and test-time reinforcement learning (TTRL) improve the
            reliability of large language models (LLMs) without additional supervision, yet their underlying
            mechanisms and statistical guarantees remain poorly understood.
          </p>
          <p>
            We present a unified framework for certifiable inference in LLMs, showing that majority voting provides a
            statistical certificate of self-consistency: under mild assumptions, the aggregated answer coincides with
            the mode of the model's terminal distribution with high probability. We derive finite-sample and anytime-valid
            concentration bounds that quantify this confidence, and introduce the Martingale Majority Certificate (MMC), a
            sequential stopping rule that adaptively determines when sufficient samples have been drawn.
          </p>
          <p>
            We further prove that label-free post-training methods such as TTRL implicitly sharpen the answer distribution
            by exponentially tilting it toward its mode, thereby reducing the number of samples required for certification.
            Building on this insight, we propose new post-training objectives that explicitly optimise the trade-off between
            sharpness and bias. Together, these results explain and connect two central test-time scaling strategies,
            self-consistency and TTRL, within a single statistical framework for label-free, certifiable reliability in
            reasoning LLMs.
          </p>
        </div>
      </div>
    </div>
    <!--/ Abstract. -->
  </div>
</section>


<section class="section">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column is-full-width">
        <h3 class="title is-4">Setting</h3>
        <div class="content has-text-justified">
          <p>
            LLM rollouts can be formalised as a stochastic decoding process
            \[
              (Y_t)_{t \ge 0}, \quad Y_t \in \mathcal{V},
            \]
            where \( \mathcal{V} \) is the vocabulary and the process is initialised by a prompt \( pr \).
            At each step the model samples
            \[
              Y_{t+1} \sim \pi_\phi(\cdot \mid Y_{\le t}, pr),
            \]
            from a conditional policy parametrised by weights \( \phi \).
            The <em>thinking phase</em> consists of the random evolution of this sequence until a termination token is produced,
            at which point the model emits its response, starting from a random stopping time \( \tau \).
            We denote by
            \[
              X := g(Y_{\tau:}) \in \mathcal{A}
            \]
            the canonicalised terminal answer, obtained by applying a deterministic extraction map \( g \).
            The induced terminal distribution \( \mathbf{p} = \mathrm{Law}(X) \) over the answer set \( \mathcal{A} \) captures the model's epistemic uncertainty about its own final output.
            In an ideal reasoning model, we would like rollouts to exhibit rich variability in \( Y_{1:\tau-1} \) (the reasoning trajectories), yet concentrate mass on the final answer \( X \) (the outcome).
            That is, we seek <em>diversity over reasoning paths, but consistency over terminal responses</em>.
          </p>
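          <p>
            For concreteness, a minimal Python sketch of one possible extraction map \( g \) is shown below. The specific rules
            (reading a final <code>\boxed{...}</code> expression, falling back to the last number, and normalising the string)
            are illustrative assumptions for benchmarks such as MATH-500, not the extraction rules used in the paper.
          </p>
          <pre><code class="language-python">import re

def extract_answer(rollout_text: str) -> str:
    r"""One possible canonicalisation map g: rollout text to terminal answer.

    Illustrative assumptions: the final answer is wrapped in \boxed{...};
    otherwise fall back to the last number in the text. Real extraction
    rules are task-specific.
    """
    boxed = re.findall(r"\\boxed\{([^{}]*)\}", rollout_text)
    if boxed:
        answer = boxed[-1]
    else:
        numbers = re.findall(r"-?\d+(?:\.\d+)?", rollout_text)
        answer = numbers[-1] if numbers else ""
    # Canonicalise: strip whitespace and normalise integer-valued floats.
    answer = answer.strip().replace(" ", "")
    try:
        value = float(answer)
        if value == int(value):
            answer = str(int(value))
    except ValueError:
        pass
    return answer
</code></pre>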

          <p>
            In supervised or verifier-equipped settings, correctness can be externally validated.
            In open-ended reasoning tasks, such supervision is unavailable.
            In the absence of external rewards, a model must act relative to its own uncertainty.
            Letting \( a \in \mathcal{A} \) denote the chosen output and \( X \sim \mathbf{p} \) the stochastic model response, the expected 0–1 loss is \( \mathbb{E}[\mathbf{1}\{a \neq X\}] \).
            The Bayes-optimal decision minimising this loss is the mode
          </p>

          <p>
            \[
              c^\star = \arg\max_j p_j,
            \]
          </p>

          <p>
            which corresponds to the model's most probable self-consistent answer.
            Hence, under symmetric loss, recovering the mode is the optimal <em>model-relative</em> prediction.
            When a verifier is absent, certifying that a model's reported answer coincides with this mode provides a natural measure of reliability.
          </p>
        </div>

        <h3 class="title is-4">Statistical Certificates of Self-Consistency</h3>
        <div class="content has-text-justified">
          <p>
            In practice, the terminal probabilities \( \mathbf{p} \) are unknown and can be estimated only through multiple
            independent rollouts \( X_1, \ldots, X_n \).
            The simplest estimator of the mode is the <em>majority vote</em>
          </p>

          <p>
            \[
              \widehat{c}_n := \arg\max_j \hat{p}_{n,j},
              \qquad
              \hat{p}_{n,j} = \frac{1}{n}\sum_{i=1}^{n}\mathbf{1}\{X_i=j\}.
            \]
          </p>

          <p>
            This estimator forms the basis of <em>self-consistency</em> test-time scaling.
            From a statistical standpoint, majority voting is the Bayes-optimal estimator of \( c^\star \) under 0–1 loss,
            and an associated upper bound on \( \mathbb{P}[\widehat{c}_n \neq c^\star] \) provides a
            <em>statistical certificate of self-consistency</em>: a quantitative guarantee that the aggregated answer
            coincides with the mode of the terminal law \( \mathbf{p} \) with high probability.
          </p>

          <p>
            Under standard regularity conditions the majority-vote estimator is consistent: \( \mathbb{P}[\widehat{c}_n = c^\star] \to 1 \) as \( n \to \infty \).
            <strong>A more practical question concerns the finite-sample regime: how large must \( n \) be to guarantee, with
            confidence \( 1-\varepsilon \), that \( \widehat{c}_n \) already equals \( c^\star \)?</strong>
          </p>

          <p>
            To address this, we derive finite-sample and asymptotic certificates, leveraging Hoeffding, Bernstein,
            Chernoff–Markov, and Sanov concentration bounds for the error probability \( \mathbb{P}[\widehat{c}_n \neq c^\star] \).
            These bounds clarify how reliability scales with the ensemble size and with the <em>mode margin</em>
            \( \delta = p_{c^\star} - p_{j^\star} \), i.e., the gap between the top two answer probabilities.
          </p>
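          <p>
            As a concrete illustration, the sketch below uses only the crudest of these bounds, a Hoeffding inequality on each
            pairwise margin variable \( \mathbf{1}\{X_i = c^\star\} - \mathbf{1}\{X_i = j\} \) combined with a union bound over the
            \( k-1 \) rival answers. It is looser than the Bernstein, Chernoff–Markov and Sanov certificates above and assumes the
            margin \( \delta \) is known, so it should be read as a back-of-the-envelope check rather than the paper's bound.
          </p>
          <pre><code class="language-python">import math
from collections import Counter

def majority_vote(answers):
    """Majority vote over canonicalised terminal answers X_1, ..., X_n."""
    counts = Counter(answers)
    leader, _ = counts.most_common(1)[0]
    return leader, counts

def hoeffding_certificate(n, delta, k):
    """Upper bound on P[majority vote != mode] via Hoeffding + union bound.

    delta: mode margin p_{c*} - p_{j*}; k: number of candidate answers.
    Illustrative and conservative compared with the sharper bounds in the paper.
    """
    return (k - 1) * math.exp(-n * delta ** 2 / 2)

def samples_needed(delta, k, eps=0.05):
    """Smallest n for which the Hoeffding certificate is at most eps."""
    return math.ceil(2 / delta ** 2 * math.log((k - 1) / eps))

# Example: a margin of 0.2 over 10 candidate answers at 95% confidence
# requires samples_needed(0.2, 10) = 260 rollouts under this crude bound.
</code></pre>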

          <p>
            If the probabilities \( p_j \) were known, one could invert these bounds to determine the number of samples required
            to achieve a desired confidence \( 1-\varepsilon \).
            In reality, both \( p_j \) and \( \delta \) must be estimated on the fly.
            This motivates a <em>sequential</em> formulation: <strong>as rollouts arrive, can we determine adaptively when the current majority
            is statistically reliable?</strong>
          </p>

          <p>
            We introduce the <em>Martingale Majority Certificate (MMC)</em>, a sequential procedure that adaptively tests whether the empirical leader remains significantly ahead of its nearest rival and
            of all others combined. This guarantees that at the (random) stopping time \( \tau \), the majority vote coincides with the true mode with high probability:
          </p>

          <p>
            \[
              \mathbb{P}[\widehat{c}_{n_\tau} \neq c^\star] \le \varepsilon,
            \]
          </p>

          <p>
            thus providing an <em>anytime-valid certificate</em> of model self-consistency.
          </p>
        </div>

        <h3 class="title is-4">Martingale Majority Certificate Stopping Rule</h3>
        <div class="content has-text-justified">
          <p>
            Our proposed stopping rule adaptively decides when to stop sampling rollouts while controlling the error of returning the empirical majority.
          </p>
          <p>
            The central challenge in the LLM setting is the potentially large number of possible outcomes.
            A naive stopping rule would require pairwise comparisons of the empirical probabilities across all classes
            \( i \neq j \), \( i,j \in \{1, \dots, k\} \), which becomes computationally prohibitive as \( k \) grows.
          </p>

          <p>
            To address this, we exploit the observation that the mass of the terminal law is typically concentrated on a few classes \( m \ll k \).
            Thus, instead of considering all classes individually, we aggregate votes into three categories:
          </p>
          <ul>
            <li>the current leader \( \widehat{c}_n \),</li>
            <li>the runner-up, and</li>
            <li>all <em>others</em> combined.</li>
          </ul>
          <p>
            Accordingly, we perform two tests: leader vs. runner-up and leader vs. <em>others</em>.
          </p>
          <div style="text-align:center; margin: 24px 0;">
            <img src="mmc_algorithm.png" alt="MMC algorithm" width="70%">
          </div>
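          <p>
            For intuition, here is a deliberately simplified, self-contained version of such a stopping rule. It replaces the
            paper's martingale concentration with a cruder time-uniform Hoeffding boundary obtained from a union bound over time,
            and the per-step error allocation \( \varepsilon_n = \varepsilon / (2 n (n+1)) \) is an assumption made for this sketch,
            so it stops later than the actual MMC while retaining anytime-valid error control.
          </p>
          <pre><code class="language-python">import math
from collections import Counter

def mmc_stop(sample_answer, eps=0.1, budget=100):
    """Simplified anytime-valid majority certificate (sketch, not the paper's MMC).

    sample_answer: zero-argument callable returning one canonicalised rollout answer.
    Stops when the leader's lead over (i) the runner-up and (ii) all other
    answers combined both exceed a time-uniform Hoeffding boundary.
    The per-step allocation eps_n = eps / (2 * n * (n + 1)) sums to at most eps,
    which keeps the guarantee valid at a data-dependent stopping time.
    """
    counts = Counter()
    for n in range(1, budget + 1):
        counts[sample_answer()] += 1
        (leader, n_lead), *rest = counts.most_common(2) + [(None, 0)]
        n_runner = rest[0][1]
        n_others = n - n_lead - n_runner
        eps_n = eps / (2 * n * (n + 1))
        boundary = math.sqrt(2 * n * math.log(1 / eps_n))
        if (n_lead - n_runner) >= boundary and (n_lead - n_others) >= boundary:
            return leader, n, True      # certified at confidence 1 - eps
    leader, _ = counts.most_common(1)[0]
    return leader, budget, False        # budget exhausted, no certificate
</code></pre>
          <p>
            On a sharply peaked terminal distribution this conservative boundary typically certifies within a few dozen rollouts,
            whereas on a flat answer distribution it exhausts the budget and returns without a certificate.
          </p>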
        </div>
      </div>
    </div>
  </div>
</section>

<section class="section">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column is-full-width">
        <h3 class="title is-4">Optimising Sample Efficiency with Test-Time Training</h3>
        <div class="content has-text-justified">
          <p>
            Our ultimate goal is to minimise the number of samples required from the LLM for the majority vote
            to return the correct answer with high confidence \( 1-\varepsilon \). The expected stopping time of the MMC scales approximately as
            <span id="eq-expected_number_samples">
              \[
                N \;\approx\;
                \frac{2(p_{\hat c}+p_{j^\star})}{(p_{\hat c}-p_{j^\star})^{2}} \,\log \frac{1}{\varepsilon},
              \]
            </span>
            so that small mode margins
            <span>\( \delta = p_{\hat c}-p_{j^\star} \)</span>
            lead to rapidly increasing sample requirements.
          </p>
          <p>
            <strong>The key question is whether test-time adaptation can reshape the terminal distribution to enlarge this margin, thereby improving sample efficiency.</strong>
          </p>
          <p>
            We show that the optimal policy corresponding to the KL-regularised objective proposed in <a href="https://arxiv.org/pdf/2504.16084">TTRL</a> is an exponentially tilted version of the base model.
            Decreasing the regularisation parameter consistently increases the margin and reduces the number of samples required for certification.
          </p>
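          <p>
            To see the effect numerically, the sketch below applies an exponential tilt directly to a toy terminal distribution and
            re-evaluates the approximate sample requirement \( N \) from the display above. Treating the tilt as acting on the
            terminal answer distribution through a single exponent \( \beta \) is a simplification made for illustration; in TTRL
            the tilt arises implicitly from KL-regularised post-training of the policy.
          </p>
          <pre><code class="language-python">import math

def tilt(p, beta):
    """Exponentially tilt a terminal answer distribution: p_j -> p_j**beta / Z.

    beta = 1 leaves the distribution unchanged; beta > 1 sharpens it around its
    mode, mimicking the effect of KL-regularised test-time training.
    """
    weights = {a: q ** beta for a, q in p.items()}
    z = sum(weights.values())
    return {a: w / z for a, w in weights.items()}

def approx_samples(p, eps=0.1):
    """Approximate MMC stopping time N ~ 2(p_c + p_j) / (p_c - p_j)^2 * log(1/eps)."""
    top, runner = sorted(p.values(), reverse=True)[:2]
    return 2 * (top + runner) / (top - runner) ** 2 * math.log(1 / eps)

p = {"42": 0.40, "41": 0.30, "43": 0.20, "40": 0.10}   # toy terminal law
for beta in (1.0, 2.0, 4.0):
    q = tilt(p, beta)
    margin = max(q.values()) - sorted(q.values())[-2]
    print(beta, round(margin, 3), round(approx_samples(q)))
# The margin grows (0.1 -> 0.233 -> 0.494) and the approximate N
# drops from about 322 to 70 to 18 rollouts.
</code></pre>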
          <p><strong style="font-size: 1.3em;">Two new test-time RL objectives</strong></p>

          <p>
            We introduce two label-free group-level rewards designed to optimise the trade-off between sharpness
            and bias. Let \( \mathbf{X} = (X_1, \dots, X_n) \) be the set of answers arising from rollouts
            \( \mathbf{Y} = (Y_1, \ldots, Y_n) \) for a given prompt, with \( \widehat{c}_n \) denoting the majority vote
            and \( j_n^\star \) the runner-up. Define \( N_j = \sum_i \mathbf{1}\{X_i=j\} \).
            A minimal implementation sketch of both rewards is given after this list.
          </p>

          <ol class="objective-list">
            <li>
              <span class="objective-title">SNR-based reward.</span>
              <p>
                Directly leveraging the SNR as a driving factor in the efficiency of the MMC scheme, we introduce the first reward
              </p>
              <p>
                \[
                  r^{(1)}_n(\mathbf{Y})
                  = \widehat{\mathrm{SNR}}(\Delta_{j^\star_n})(\mathbf{X})
                  = \frac{(N_{\widehat c_n}-N_{j^\star_n})^{2}}
                         {n \left(N_{\widehat c_n}+N_{j^\star_n}\right)
                          -(N_{\widehat c_n}-N_{j^\star_n})^{2}}
                  \;\xrightarrow[n\to\infty]{}\;
                  \mathrm{SNR}(\Delta_{j^\star_n}).
                \]
              </p>
              <p>
                This objective aims to directly maximise \( \mathrm{SNR}(\Delta_{j_n^\star}) \), which is equivalent to minimising the expected
                number of samples required to obtain statistical certificates for the majority vote.
              </p>
            </li>

            <li>
              <span class="objective-title">Entropy-based reward.</span>
              <p>
                As we want to encourage a more peaked terminal distribution, another natural option is the negative entropy, i.e.
              </p>
              <p>
                \[
                  r^{(2)}_n(\mathbf{Y})
                  = \widehat H_n(\mathbf{X})
                  = \sum_{j:N_j>0}\frac{N_j}{n} \log \frac{N_j}{n}
                  \;\xrightarrow[n\to\infty]{}\;
                  \sum_j p_j \log p_j = -H(p).
                \]
              </p>
              <p>
                Maximising \( \widehat H_n \) therefore <em>minimises</em> the Shannon entropy of the answer distribution,
                encouraging a sharper, more concentrated terminal distribution.
                🚨<strong>Important:</strong> the tempering sharpens only the distribution of final answers, not the full sequence distribution.
                This gives us the best of both worlds: promoting certainty when providing a final answer, while permitting exploration of diverse
                pathways during the chain-of-thought reasoning process.
              </p>
            </li>
          </ol>
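          <p>
            The sketch below computes both group-level rewards from the terminal answers of one group of rollouts. How the scalar
            reward is then fed into the policy update (advantage normalisation, KL penalty, and so on) is omitted; in a test-time
            RL loop these scalars simply replace the verifier reward for the whole group, so no labels are needed.
          </p>
          <pre><code class="language-python">import math
from collections import Counter

def group_counts(answers):
    """Counts N_j plus the leader and runner-up counts for one group of rollouts."""
    counts = Counter(answers)
    top_two = counts.most_common(2) + [("", 0)]
    return counts, top_two[0][1], top_two[1][1]

def snr_reward(answers):
    """r^(1): empirical SNR of the leader-vs-runner-up margin variable."""
    n = len(answers)
    _, n_lead, n_run = group_counts(answers)
    num = (n_lead - n_run) ** 2
    den = n * (n_lead + n_run) - num
    return num / den if den > 0 else float("inf")   # zero variance: already certain

def negentropy_reward(answers):
    """r^(2): negative Shannon entropy of the empirical answer distribution."""
    n = len(answers)
    counts, _, _ = group_counts(answers)
    return sum((c / n) * math.log(c / n) for c in counts.values())

group = ["12", "12", "12", "7", "12", "7", "3", "12"]   # answers from 8 rollouts
print(snr_reward(group), negentropy_reward(group))
</code></pre>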
          <div style="text-align:center; margin: 24px 0;">
            <img src="ttt_performance_math500.png" alt="Pass@1 performance after test-time training on MATH-500" width="100%">
            <figcaption style="color:#6b7280; font-size: 0.9rem; margin-top: 8px;">
              Pass@1 performance after test-time training with SNR- and entropy-based rewards, relative to the base models.
            </figcaption>
          </div>

          <p>
            We observe in the table below that the number of samples required under the MMC stopping rule decreases after applying test-time training, relative to the pre-trained model.
            That is, test-time training sharpens the terminal answer distribution, increasing the mode margin and thus reducing the number of samples required for certification.
          </p>
          <div style="text-align:center; margin: 24px 0;">
            <img src="table_mmc.png" alt="Majority vote accuracy and sample requirements under the MMC stopping rule" width="75%">
            <figcaption style="color:#6b7280; font-size: 0.9rem; margin-top: 8px;">
              Majority vote accuracy and required number of samples under the MMC stopping rule (✅) at confidence levels 0.1 and 0.4 for the pre-trained model and after test-time training with SNR-based rewards. Performance is compared to that obtained using the full sample budget (❌).
            </figcaption>
          </div>
        </div>
      </div>
    </div>
  </div>
</section>

<section class="section">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column is-full-width">
        <h3 class="title is-4">SNR as a label-free estimator of task difficulty</h3>
        <div class="content has-text-justified">
          <p>
            Our experiments reveal a notable empirical regularity: the
            <em>signal-to-noise ratio</em> (SNR) of the margin variable
            \( \Delta_{j^\star} = \mathbf{1}\{X = c^\star\} - \mathbf{1}\{X = j^\star\} \),
            which quantifies the sharpness of the model's terminal answer distribution,
            correlates strongly with external measures of problem difficulty.
            Across the MATH-500 benchmark, harder problems exhibit systematically lower and more variable SNR values,
            while easier problems yield sharply peaked distributions concentrated around a single answer.
          </p>
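          <p>
            A label-free difficulty screen then takes only a few lines on top of the <code>snr_reward</code> estimator sketched
            earlier; the function below is an illustrative helper, not part of the paper's pipeline.
          </p>
          <pre><code class="language-python">def rank_by_difficulty(per_prompt_answers):
    """Rank prompts from hardest to easiest using estimated SNR as a
    label-free difficulty proxy (reuses snr_reward from the sketch above).

    per_prompt_answers: dict mapping a prompt id to its list of terminal answers.
    A low SNR means a flat answer distribution, i.e. the model is unsure,
    which empirically aligns with externally labelled hard problems.
    """
    scores = {pid: snr_reward(ans) for pid, ans in per_prompt_answers.items()}
    return sorted(scores.items(), key=lambda kv: kv[1])   # lowest SNR first
</code></pre>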
          <p>
            This behaviour is non-trivial: the model has no access to ground-truth difficulty labels, yet its own epistemic
            uncertainty, reflected in the variability of its rollouts, aligns closely with these labels.
            <strong>This suggests an emergent form of calibration in reasoning LLMs</strong>:
            without explicit supervision or external verification, models appear to "know when they do not know."
            In statistical terms, the SNR acts as a label-free proxy for epistemic uncertainty and, consequently, for task difficulty.
          </p>
          <div style="text-align:center; margin: 24px 0;">
            <img src="QWEN-MATH-1.5B_violin_maj100_SNR.png" alt="SNR distribution for Qwen-Math-1.5B." style="width: 48%; margin-right: 1%;">
            <img src="QWEN-MATH-7B_violin_maj100_SNR.png" alt="SNR distribution for Qwen-Math-7B." style="width: 48%; margin-left: 1%;">
            <figcaption style="color:#6b7280; font-size: 0.9rem; margin-top: 8px;">
              Distribution of the estimated SNR when using the MMC stopping rule with \( \varepsilon = 0.1 \) and \( N_{\text{budget}}=100 \). Results are obtained after applying test-time training with SNR-based rewards.
            </figcaption>
          </div>
        </div>
      </div>
    </div>
  </div>
</section>

<section class="section">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column is-full-width">
        <h3 class="title is-4">Conclusion</h3>

        <div class="content has-text-justified">
          <p>
            <strong>Our results unify several strands of recent work on reliable inference in LLMs, namely self-consistency,
            adaptive compute allocation, and test-time reinforcement learning (TTRL), under a common
            statistical perspective.</strong> Through this lens, majority voting emerges naturally as a means of estimating the mode of the terminal distribution.
            The validity of the majority vote as an estimate of the mode can be certified by finite-sample and asymptotic bounds. The Martingale Majority Certificate (MMC)
            extends this view by providing an operational test-time algorithm that determines, from model
            rollouts alone, when a response is statistically self-consistent.
          </p>
          <p>
            Furthermore, <strong>we shed light on the underlying mechanism by which TTRL and related post-training
            approaches improve reasoning reliability: KL-regularised optimisation corresponds to an
            exponential tilting of the terminal law, sharpening it around its mode and increasing the
            signal-to-noise ratio (SNR) of the margin variable.</strong> This insight explains empirical observations of
            enhanced consistency after test-time adaptation, and motivates new label-free objectives such as
            our SNR- and entropy-based rewards, which explicitly target the trade-off between sharpness and
            bias. Unlike prior work that tunes temperature or per-token distributions, our formulation operates
            on the terminal marginal, preserving exploration during reasoning while promoting confidence in the
            final answer.
          </p>
        </div>
      </div>
    </div>
  </div>
</section>

<section class="section" id="BibTeX">
  <div class="container is-max-desktop content">
    <h2 class="title">BibTeX</h2>
    <pre><code>@article{corderoencinar2025certified,
  author  = {Paula Cordero-Encinar and Andrew B. Duncan},
  title   = {Certified Self-Consistency: Statistical Guarantees and Test-Time Training for Reliable Reasoning in LLMs},
  journal = {arXiv:2510.17472},
  year    = {2025}
}</code></pre>
  </div>
</section>


<footer class="footer">
  <div class="container">
    <div class="content has-text-centered">
      <a class="icon-link external-link" href="https://arxiv.org/pdf/2510.17472">
        <i class="fas fa-file-pdf"></i>
      </a>
      <a class="icon-link external-link" href="https://github.com/paulaoak/certified_self_consistency">
        <i class="fab fa-github"></i>
      </a>
    </div>
      <div class="column is-8">
        <div class="content">
          <p>
            This website template is borrowed from <a href="https://nerfies.github.io/">Nerfies</a>,
            licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">Creative
            Commons Attribution-ShareAlike 4.0 International License</a>.
          </p>
        </div>
      </div>
    </div>