From 99ef1497aab71f21b6b2565248fce7cb901265ac Mon Sep 17 00:00:00 2001
From: Matt Johnston <matt@ucc.asn.au>
Date: Tue, 4 Oct 2022 23:43:36 +0800
Subject: [PATCH] Add asm to capture samples 26x more precisely

---
 examples/raw.rs |   9 +--
 src/cap.rs      | 147 ++++++++++++++++++++++++++++++++++++++++++++----
 src/rng.rs      |  11 +++-
 3 files changed, 150 insertions(+), 17 deletions(-)

diff --git a/examples/raw.rs b/examples/raw.rs
index 7478e53..3eac6e8 100644
--- a/examples/raw.rs
+++ b/examples/raw.rs
@@ -23,8 +23,9 @@ async fn main(_spawner: Spawner) {
     let mut cp = cortex_m::peripheral::Peripherals::take().unwrap();
 
     let mut gpio = (Flex::new(p.PIN_10), 10);
-    loop {
-        caprand::cap_rand(&mut gpio.0, gpio.1, &mut cp.SYST, 10_000,
-            |v| info!("{}", v)).unwrap();
-    }
+    caprand::cap_rand(&mut gpio.0, gpio.1, &mut cp.SYST,
+        |v, overshoot| {
+            info!("{} {}", v, overshoot);
+            true
+    }).unwrap();
 }
diff --git a/src/cap.rs b/src/cap.rs
index 4976de2..dccb433 100644
--- a/src/cap.rs
+++ b/src/cap.rs
@@ -4,6 +4,8 @@ use log::{debug, info, warn, error};
 #[cfg(feature = "defmt")]
 use defmt::{error, debug, info, panic};
 
+use core::arch::asm;
+
 use cortex_m::peripheral::SYST;
 use embassy_rp::gpio::{Flex, Pin, Pull};
 use embassy_rp::pac;
@@ -61,7 +63,7 @@ GPIO13
 // on reading.
 const LOW_OVER: u32 = 1000;
 // Power of two for faster modulo
-const HIGH_OVER: u32 = LOW_OVER + 16384;
+const HIGH_OVER: u32 = LOW_OVER + 1024;
 
 // Assume worst case from rp2040 datasheet.
 // 3.3v vdd, 2v logical high voltage, 50kohm pullup, 0.01uF capacitor, 125Mhz clock.
@@ -100,16 +102,127 @@ impl<'t> SyTi<'t> {
     }
 }
 
+
+/// Busy-wait until bit `pin_num` of the bank 0 GPIO input register reads low.
+/// Equivalent to `while gpioN.is_high() {}` but with known 4 cycle loop time
+/// (vs 26 cycles for the `while` loop at time of writing). A `SyTi` is started
+/// just before the loop and the result of its `done()` is returned.
+fn time_fall(pin_num: usize, syst: &mut SYST) -> Result<u32, ()> {
+    // bank 0 single cycle IO in; the asm below only reads through this pointer
+    let gpio_in = pac::SIO.gpio_in(0).ptr();
+    let mask = 1u32 << pin_num;
+    let t = SyTi::new(syst);
+    unsafe {
+        asm!(
+            "222:",
+            // read gpio_in register, 1 cycle
+            "ldr {tmp}, [{gpio_in}]",
+            // AND with the desired pin bit, 1 cycle (also sets the flags for `bne`)
+            "ands {tmp}, {mask}",
+            // loop while the pin bit is still set, 2 cycles
+            "bne 222b",
+            tmp = out(reg) _,
+            mask = in(reg) mask,
+            gpio_in = in(reg) gpio_in,
+            options(nostack, readonly),
+        );
+    }
+    t.done()
+}
+
+/// Time the pin's fall with an unrolled sampling loop.
+///
+/// Like `time_fall()`, this spins until bit `pin_num` of the bank 0 GPIO input
+/// register reads low, but each pass of the loop captures six back-to-back reads
+/// and only tests the last one, so the falling edge can be located within the pass.
+///
+/// Returns `(ticks, precise)`:
+/// - `ticks` is the `SyTi` measurement plus `pos`, the index of the first captured
+///   sample of the final pass in which the pin bit was clear.
+/// - `precise` is `pos != 0`, i.e. false when the pin was already low at the first
+///   sample of the final pass.
+///
+/// Any error from `SyTi::done()` is passed through.
+fn time_fall_unroll(pin_num: usize, syst: &mut SYST) -> Result<(u32, bool), ()> {
+    // bank 0 single cycle IO in
+    let gpio_in = pac::SIO.gpio_in(0).ptr();
+    let mask = 1u32 << pin_num;
+    let t = SyTi::new(syst);
+    // Samples from the final pass of the loop, written by the asm below.
+    let x0: u32;
+    let x1: u32;
+    let x2: u32;
+    let x3: u32;
+    let x4: u32;
+
+    // SAFETY: the asm only loads from `gpio_in` (assumed to be the valid SIO GPIO_IN
+    // address handed out by the PAC) and writes no memory. r7 is copied to r10 at the
+    // top of each pass and restored after the loop; r10 is declared as clobbered.
+    unsafe {
+        asm!(
+            "222:",
+            "mov r10, r7",
+            // read gpio_in register, 1 cycle
+            "ldr {x0}, [{gpio_in}]",
+            "ldr {x1}, [{gpio_in}]",
+            "ldr {x2}, [{gpio_in}]",
+            "ldr {x3}, [{gpio_in}]",
+            "ldr {x4}, [{gpio_in}]",
+            // NOTE(review): x4 is loaded twice, so the fifth read is overwritten by
+            // the sixth. Presumably a stand-in for a sixth output register - confirm.
+            "ldr {x4}, [{gpio_in}]",
+
+            // Only test the most recent sample. there's a slight chance
+            // of missing a glitchy short trigger in an earlier sample,
+            // but we'll catch up later if we miss it.
+            "ands {x4}, {mask}",
+            // Loop if bit set, 2 cycles
+            "bne 222b",
+            "mov r7, r10",
+
+            mask = in(reg) mask,
+            gpio_in = in(reg) gpio_in,
+            x0 = out(reg) x0,
+            x1 = out(reg) x1,
+            x2 = out(reg) x2,
+            x3 = out(reg) x3,
+            x4 = out(reg) x4,
+            out("r10") _,
+            options(nostack, readonly),
+        );
+    }
+
+    let tick = t.done()?;
+
+    // NOTE(review): the loop only exits with x4 masked to zero, so `5` is never produced.
+    let pos = if x0 & mask == 0 {
+        0
+    } else if x1 & mask == 0 {
+        1
+    } else if x2 & mask == 0 {
+        2
+    } else if x3 & mask == 0 {
+        3
+    } else if x4 & mask == 0 {
+        4
+    } else {
+        5
+    };
+
+    // TODO: pos=0 output should be discarded
+    // debug!("{} {} {} {} pos {}", x0, x1, x2, x3, pos);
+    Ok((tick + pos, pos != 0))
+}
+
 // `f()` is called on each output `u32`.
 pub fn cap_rand<'d, P: Pin, F>(
     pin: &mut Flex<'d, P>,
     pin_num: usize,
     syst: &mut SYST,
-    n_out: usize,
     mut f: F,
 ) -> Result<(), ()>
 where
-    F: FnMut(u32),
+    F: FnMut(u32, u32) -> bool,
 {
     syst.set_clock_source(cortex_m::peripheral::syst::SystClkSource::Core);
     // prescribed sequence for setup
@@ -152,14 +265,14 @@ where
     // The main loop
     let mut overshoot = 1u32;
 
+    let mut warming = WARMUP;
+
     // After warmup we sample twice at each "overshoot" value.
     // One sample is returned as random output, the other is mixed
     // in to the overshoot value.
-    let n_iter = WARMUP + 2 * n_out;
+    for (i, _) in core::iter::repeat(()).enumerate() {
 
-    for i in 0..n_iter {
-        // Pull up until hit logical high
-        let meas = critical_section::with(|_cs| {
+        let (meas, precise) = critical_section::with(|_cs| {
             pin.set_pull(Pull::Up);
             while pin.is_low() {}
             // Keep pulling up for `overshoot` cycles
@@ -167,14 +280,24 @@ where
 
             // Pull down, time how long to reach threshold
             pin.set_pull(Pull::Down);
-            let t = SyTi::new(syst);
-            while pin.is_high() {}
-            t.done()
+            // let t = SyTi::new(syst);
+            // while pin.is_high() {}
+            // t.done()
+            // time_fall(pin_num, syst)
+            let r = time_fall_unroll(pin_num, syst);
+            pin.set_pull(Pull::None);
+            r
         })?;
 
-        if i > WARMUP && (i - WARMUP) % 2 == 0 {
+        if i % 2 == 0 {
             // real output
-            f(meas)
+            if precise && warming == 0 {
+                if !f(meas, overshoot) {
+                    // no more output wanted
+                    break
+                }
+            }
+            warming = warming.saturating_sub(1);
         } else {
             // don't produce output, mix measured sample in
             overshoot = overshoot * 2 + meas;
diff --git a/src/rng.rs b/src/rng.rs
index 02bab82..ba2a3e8 100644
--- a/src/rng.rs
+++ b/src/rng.rs
@@ -75,7 +75,12 @@ impl CapRng {
         syst: &mut SYST,
     ) -> Result<Self, getrandom::Error> {
         let mut h = Sha256::new();
-        crate::cap::cap_rand(pin, pin_num, syst, Self::SEED_SAMPLES, |v| h.update(v.to_be_bytes())).map_err(
+        let mut count = 0;
+        crate::cap::cap_rand(pin, pin_num, syst, |v, _over| {
+            h.update(v.to_be_bytes());
+            count += 1;
+            count < Self::SEED_SAMPLES
+        }).map_err(
             |_| {
                 warn!("Random generation failed");
                 error()
@@ -85,3 +90,7 @@ impl CapRng {
         Ok(Self(ChaCha20Rng::from_seed(seed)))
     }
 }
+
+
+// tests:
+// - f() is called the correct number of times, be exhaustive?
-- 
GitLab