/*
 * This version incorporates a simple assembler for the DMAcpu code 
 * 
 * VGA by Hunter Adams (vha3@cornell.edu)
 * converted to 320x240 with 256 colors by Bruce; bruce.land@cornell.edu
 * 
 * HARDWARE CONNECTIONS
 *  - GPIO 16 ---> VGA Hsync
 *  - GPIO 17 ---> VGA Vsync
 * 
 *  - GPIO 08 ---> 330 ohm resistor ---> VGA Blue lo-bit |__ both wired to 150 ohm to ground 
 *  - GPIO 09 ---> 220 ohm resistor ---> VGA Blue hi_bit |   and to VGA blue
 * 
 *  - GPIO 10 ---> 1000 ohm resistor ---> VGA Green lo-bit |__ three wired to 100 ohm to ground
 *  - GPIO 11 ---> 680 ohm resistor ---> VGA Green mid_bit |   and to VGA Green
 *  - GPIO 12 ---> 330 ohm resistor ---> VGA Green hi_bit  |   
 * 
 *  - GPIO 13 ---> 1000 ohm resistor ---> VGA Red lo-bit |__ three wired to 100 ohm to ground
 *  - GPIO 14 ---> 680 ohm resistor ---> VGA Red mid_bit |   and to VGA red
 *  - GPIO 15 ---> 330 ohm resistor ---> VGA Red hi_bit  |   
 * 
 *  - RP2040 GND ---> VGA GND
 *
 * RESOURCES USED
 *  - PIO state machines 0 to 3 on PIO instance 0
 *  - DMA channels 0, 1, 2, 3 data send to two PIO
 *  - 76.8 kBytes of RAM (for pixel color data)
 * color encoding: bits 7:5 red; 4:2 green; 1:0 blue
 *
 * Protothreads v1.1.1
 * graphics demo thread
 * serial thread to set DMAcpu test values
 * the usual blinky thread for a hearbeat
 */
// ==========================================
// === VGA graphics library
// ==========================================
#include "vga256_graphics.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h> 
#include <math.h>
#include "pico/stdlib.h"
#include "hardware/pio.h"
#include "hardware/dma.h"

//===========================================
// === DMAcpu
//===========================================
#include "hardware/gpio.h"
#include "hardware/timer.h"
#include "hardware/spi.h"
#include "pico/stdlib.h"
#include "hardware/uart.h"
#include "hardware/gpio.h"
#include "hardware/structs/iobank0.h"
#include "hardware/regs/rosc.h"

// ==========================================
// === protothreads globals
// ==========================================
#include "hardware/sync.h"
#include "hardware/timer.h"
#include "pico/multicore.h"
#include "string.h"
// protothreads header
#include "pt_cornell_rp2040_v1_1_1.h"
// flag set by the serial thread at the end of each input pass;
// the graphics thread waits for it and then clears it
int new_value = true ;

// ==========================================
// === DMA machine setup
// ==========================================
// === global thread + DMA communication
// Constant operands that the DMA program reads by address.
// Declared volatile because the DMA hardware accesses them
// without the compiler's knowledge.
// a zero-load variable -- don't change
volatile int dma_zero = 0 ;
// a one-load variable -- don't change
volatile int dma_one = 1 ;
// a minus-one load variable (added to decrement a value) -- don't change
volatile int dma_neg_one = -1 ;

// ========================================
// === SPI setup (SPI DAC output)
// ========================================
// SPI pin and port assignments; main() maps these pins to the SPI function
// and the DMA program writes samples to the spi0 data register
#define PIN_CS   5
#define PIN_SCK  6
#define PIN_MOSI 7
#define SPI_PORT spi0

// constant to tell SPI DAC what to do
// OR-ed into each 12-bit sample (through dac_config_mask) before sending
#define DAC_config_chan_A  0b0011000000000000 

// noise state
// rand_seed is the running CRC32 state, rand_sample the scaled noise sample;
// both are updated by the DMA program
volatile int rand_seed, rand_sample ;
// pointer to the ring-oscillator RANDOMBIT register
volatile uint32_t *rnd_reg = (uint32_t *)(ROSC_BASE + ROSC_RANDOMBIT_OFFSET);
// noise gain, 0-16 (the serial thread clamps user input to this range)
volatile int rand_amp = 4 ;

// DDS globals
// Phase accumulator and per-sample phase increment:
//   dds_inc = frequency * 2^32 / Fs
// The start-up increment is computed for the defaults declared below
// (400 Hz at Fs = 40 kHz), i.e. with the same formula the serial thread
// applies when the user enters a new frequency.
unsigned int dds_accum = 0, dds_inc = 400ULL * 4294967296ULL / 40000ULL ;
// mask used with clr_sniff: clears the upper three bytes, keeps bits 7:0
unsigned int clear_high_bytes = 0xffffff00 ;
// default tone frequency (Hz) and sample rate (Hz)
float frequency = 400, Fs = 4e4;
// one cycle of the sine wave; filled in by main()
int sine_table[256] ;
int * sine_table_addr = &sine_table[0] ;
int sine_sample;
//
// times are negative to make subtraction easier
// rise and fall are set at 256 sample times.
int repeat_time = -10000;
int duration = -2000 ;
int current_time = 0 ;
int rise_time = -256 ;
int fall_time = -1744; // duration - rise_time
//
// envelope amplitude: current value and the hold (maximum) level
int current_amp, max_amp=256 ;

// ========================================
// === DMAcpu setup  
// =======================================
//  DMA control block size in words:
#define length_of_block 4
// the main DMA block program list
#define max_blocks 200
int DMA_blocks[max_blocks * length_of_block];
// and the addresses for each block
int * block_addr_array[max_blocks] ;
// return address used by jump_link / jump_return
int * link_addr ;

// counter for the current block to create
int N = 0;
// /dev/null - a data sink
int bit_bucket ;
// registers for intermediate results
int sniff_temp1, sniff_temp2 ;

// make it easier to change channels
// so that the machine will work with other software, like video
#define fetch_chan 9
#define execute_chan 10
#define fix_chan 11

// address of current block
#define current_block_addr (&DMA_blocks[length_of_block*(N)])
// address of the next block
#define next_block_addr (&DMA_blocks[length_of_block*(N+1)])
// address of the  block 2 ahead
#define next_block2_addr (&DMA_blocks[length_of_block*(N+2)])
// address of the  block 3 ahead
#define next_block3_addr (&DMA_blocks[length_of_block*(N+3)])
// address of arbitrary block
#define block_addr(N)    (&DMA_blocks[length_of_block*(N)])

// macro to build a DMA block
// inputs: read addr, write addr, count, ctrl word (4 bytes each)
// puts a 16 byte channel control block image in the DMA_blocks array at the
// current block count, N, then increments the block count N
// read_addr is put in array location N*length_of_block,
// write address is put in array location N*length_of_block+1,
// count in N*length_of_block+2 and ctrl in N*length_of_block+3.
//
// A block is only stored while N < max_blocks, so an over-long program can
// no longer write past the end of DMA_blocks. N is still incremented, which
// lets the caller detect the overflow afterwards (N > max_blocks).
//
// NOTE: the address arguments are deliberately NOT parenthesized. The (int)
// cast binds before any '+' in the argument, so an argument such as
// 'next_block2_addr+8' is a BYTE offset from the block address (this is
// used to reach the count word of a later block). Do not add parentheses.
#define build_block(read_addr, write_addr, count, ctrl) \
    do { \
        if (N < max_blocks) { \
            DMA_blocks[length_of_block*N] = (int)read_addr; \
            DMA_blocks[length_of_block*N+1] = (int)write_addr; \
            DMA_blocks[length_of_block*N+2] = count; \
            DMA_blocks[length_of_block*N+3] = ctrl; \
        } \
        N++ ; \
    } while(0)

// macros for defining the DMA CTRL word bits
// --This duplicates some of the SDK --
// but i like having it here
// Every macro argument is parenthesized so that a compound expression
// (e.g. 'a | b') is masked as a whole before it is shifted into its field.
// data_width is 0==byte 1==short 2==int
#define DMA_DATA_WIDTH(data_width) (((data_width) & 0x03)<<2) 
// give this channel more access if several channels are on
#define DMA_HIGH_PRI  (1<<1) 
// turn on the channel
#define DMA_EN  1  
// increment or keep constant the write and read addresses
// useful for peripheral write/read
#define DMA_WR_INC  (1<<5) 
#define DMA_RD_INC  (1<<4) 
// readable names for the increment options used by move() and add_sniff()
#define array_read  DMA_RD_INC
#define array_write DMA_WR_INC
#define var_read  0
#define var_write 0
// enable this channel for sniffer
#define SNIFF_EN (1<<23)
// reverse order of bytes when channel is transmitting
#define BSWAP (1<<22)
//turn off channel done IRQ
#define DMA_IRQ_QUIET  (1<<21)
// More trigger info page 114
// 0x0 to 0x3a == select DREQ n as TREQ from table above
// 0x3b == Select Timer 0 as TREQ
// 0x3c == Select Timer 1 as TREQ
// 0x3d == Select Timer 2 as TREQ (Optional)
// 0x3e == Select Timer 3 as TREQ (Optional)
// 0x3f == Permanent request, for unpaced transfers.
// bits 15:20 trigger request source
#define DMA_TREQ(trigger_source)  (((trigger_source) & 0x3f)<<15) 
// When this channel completes, it will trigger the channel
// indicated by CHAIN_TO. Disable by setting CHAIN_TO = (this channel).
// bits 11:14 next channel #
#define DMA_CHAIN_TO(next_ch) (((next_ch) & 0x0f)<<11) 
// control word shared by most blocks: unpaced 32-bit transfer, no IRQ,
// chained to the fix channel so the fetch channel is re-armed afterwards
#define STANDARD_CTRL (DMA_CHAIN_TO(fix_chan) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_IRQ_QUIET | DMA_EN)

// DMA sniffer atomic write operations 
// used to simulate accumulator logic operations
// sniffer can also compute add, CRC, parity, and bit-inversion
// The *_SET / *_XOR / *_CLR addresses below are fixed offsets from
// 0x50000000 that the DMA program uses as write targets to set, toggle or
// clear bits of the sniffer data and control registers.
#define sniffer_add 0x0f
#define sniffer_crc32 0x00
#define DMA_SNIFF_DATA_SET   (0x2438 + 0x50000000)
#define DMA_SNIFF_DATA_XOR   (0x1438 + 0x50000000)  
#define DMA_SNIFF_DATA_CLR   (0x3438 + 0x50000000)
#define DMA_SNIFF_CTRL_SET   (0x2434 + 0x50000000)
#define DMA_SNIFF_CTRL_XOR   (0x1434 + 0x50000000)  
#define DMA_SNIFF_CTRL_CLR   (0x3434 + 0x50000000)
// logic:
// load A then set using B as mask implements A OR  B
// load B then CLR bits using NOT(A) as a mask implements A AND B
// load A then XOR bits using B as mask implements A XOR B
// load A then XOR bits with 0xffffffff implements NOT A

// default will be add -- BUT function can be controlled by DMA machine
// these two sniffer control options invert or reverse the bits when WRITING
// sniffer to some location
#define OUT_INV (1<<11)
#define OUT_REV (1<<10)
// calc field is 4 bits 5:8: all bits set is add, zero is CRC32
#define CALC_ADD (0xf<<5)
#define CALC_CRC (0x0<<5)
// The masks below live in RAM because the DMA program reads them by address.
int sniff_inv_mask = OUT_INV ;
int sniff_rev_mask = OUT_REV ;
int sniff_calc_mask = 0xf << 5 ;
// invert all bits when using DMA_SNIFF_DATA_XOR
int sniff_xor_inv_mask = 0xffffffff ;
// clear all but a few bits to generate offsets for jumping
// after a BSWAP puts the sign bits into bits 7:4
// (used with clr_sniff, so the ZERO bits of each mask are the bits kept)
int sniff_offset16_mask = 0xffffffef ;
int sniff_offset32_mask = 0xffffffdf ;
int sniff_offset48_mask = 0xffffffcf ;
// keep the low 12 bits / low 11 bits of a sample
int sniff_dac_data_mask = 0xfffff000 ;
int sniff_dac_data_mask2 = 0xfffff800 ;
// keep only bit 0
int sniff_sign_mask =     0xfffffffe ;
int dac_config_mask = DAC_config_chan_A;

// GPIO control-register images written by gpio_out() to
// iobank0_hw->io[n].ctrl to force the pin high or low
// NOTE(review): 0x3300 / 0x3200 look like output-enable override plus
// output override high / low -- confirm against the GPIO CTRL register layout
int pin_hi = 0x3300 ; // drive pin high
int pin_lo = 0x3200 ; // drive pin low

// The address of the execute channel's first register (its read address)
// NOTE(review): uses DMA_CH1_READ_ADDR_OFFSET as the per-channel register
// stride -- confirm this equals the channel spacing in the SDK headers
int DMA_execute_addr = (DMA_BASE + execute_chan * DMA_CH1_READ_ADDR_OFFSET) ;
// and point to it (the fix channel reads this to re-aim the fetch channel)
int * DMA_execute_addr_ptr = &DMA_execute_addr ;

// base address of DMA block list being defined
// every unique jump target must have a named address like this
 int * DMA_start_addr = &DMA_blocks[0] ;

// =======================================================
// Start the DMAcpu: configure the fix and fetch channels and
// trigger the first fetch.
// The fetch/execute wiring should not normally be modified.
// =======================================================

// Build the configuration shared by the fix and fetch channels:
// unpaced 32-bit transfers, IRQ quiet, enabled, chained to 'chain_to'.
// 'incr' selects whether BOTH the read and write addresses increment.
static dma_channel_config dmacpu_channel_config(unsigned int chan, bool incr, unsigned int chain_to) {
    dma_channel_config cfg = dma_channel_get_default_config(chan);
    // unpaced, full speed transfer
    channel_config_set_dreq(&cfg, DREQ_FORCE);
    channel_config_set_chain_to(&cfg, chain_to);
    channel_config_set_enable(&cfg, true);
    channel_config_set_irq_quiet(&cfg, true);
    channel_config_set_write_increment(&cfg, incr);
    channel_config_set_read_increment(&cfg, incr);
    channel_config_set_transfer_data_size(&cfg, DMA_SIZE_32);
    return cfg;
}

void DMA_machine_start(void) {
    // table of block start addresses, used as jump targets by the program
    for (int blk = 0; blk < max_blocks; ++blk) {
        block_addr_array[blk] = DMA_blocks + 4 * blk;
    }

    // === fix channel
    // After each executed block it copies the execute channel's register
    // address (held in DMA_execute_addr) back into the fetch channel's
    // write address, then chains to the fetch channel.
    dma_channel_config fix_cfg = dmacpu_channel_config(fix_chan, false, fetch_chan);
    dma_channel_configure(fix_chan, &fix_cfg,
        &dma_hw->ch[fetch_chan].write_addr,  // destination: fetch write address
        DMA_execute_addr_ptr,                // source: address of execute channel
        1,                                   // one word per transfer
        false);                              // configured only, not started

    // === fetch channel (program counter and fetch unit)
    // Copies one four-word control block from the program list into the
    // execute channel's registers and chains to the execute channel.
    // Configuring it with the trigger set starts the DMAcpu.
    // NOTE: this channel could instead be paced by a timer or other TREQ.
    dma_channel_config fetch_cfg = dmacpu_channel_config(fetch_chan, true, execute_chan);
    dma_channel_configure(fetch_chan, &fetch_cfg,
        &dma_hw->ch[execute_chan].read_addr, // destination: execute channel registers
        DMA_start_addr,                      // source: start of the DMA block list
        4,                                   // four words per DMA block
        true);                               // trigger: the machine starts running
}

// ===============================================================
// === Macros to make DMA programming seem more like assembler ===
// ===============================================================
// Each macro appends one or more blocks to the program with build_block.
// Arguments named var_name/source/destination/mask/state have their ADDRESS
// taken (&arg), so they must be variables (lvalues), not constants.
// Most of these macros already end in ';', and some call sites rely on that.
//
// produce output at a gpio pin
// 'state' must evaluate to a valid gpio control word
#define gpio_out(gpio_num, state) build_block(&state,  &iobank0_hw->io[gpio_num].ctrl, 1, STANDARD_CTRL);

// =============================
// no operation: copies the bit bucket onto itself
#define nop() build_block(&bit_bucket, &bit_bucket,  1, STANDARD_CTRL);
            
// =============================
// load and store sniff_data (the DMA ALU accumulator)
// load a variable to sniff_data
#define load_sniff(var_name) build_block(&var_name, &dma_hw->sniff_data, 1, STANDARD_CTRL) ;
// store sniff_data
#define store_sniff(var_name) build_block(&dma_hw->sniff_data, &var_name, 1, STANDARD_CTRL) ;

// =============================
// store sniff_data with bit-reversal
// this is fairly special purpose and is used infrequently
// three blocks: set the output-reverse bit, store, clear the bit again
#define store_sniff_bitrev(var_name) do{ \
    build_block(&sniff_rev_mask,  DMA_SNIFF_CTRL_SET, 1, STANDARD_CTRL);\
    build_block(&dma_hw->sniff_data, &var_name, 1, STANDARD_CTRL) ;\
    build_block(&sniff_rev_mask,  DMA_SNIFF_CTRL_CLR, 1, STANDARD_CTRL);\
} while(0) ;

// =============================
// move data from memory to memory.  This is pretty much just a raw DMA move
// 'count' is number of moves to make.
// 'width' must be DMA_SIZE_32 or DMA_SIZE_16 or DMA_SIZE_8
// if you are moving a whole array the 'var_name' should be 'array_name[0]'
// source_array, dest_array must have values var_read, var_write (for scalar)
// or array_read, array_write
#define move(source, destination, count, width,  source_array, dest_array) build_block(&source, &destination, count, \
            (DMA_CHAIN_TO(fix_chan) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(width) | DMA_IRQ_QUIET | DMA_EN | source_array | dest_array));
// move and reverse byte order -- dependent on width!
#define move_bswap(source, destination, count, width) build_block(&source, &destination, count, \
            (DMA_CHAIN_TO(fix_chan) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(width) | DMA_IRQ_QUIET | DMA_EN | BSWAP));

// =============================
// set sniffer functions
// set sniffer to CRC32 function (clears the calc field)
#define set_sniff_CRC32() build_block(&sniff_calc_mask,  DMA_SNIFF_CTRL_CLR, 1, STANDARD_CTRL);
// set sniff to add function (sets every bit of the calc field)
#define set_sniff_add() build_block(&sniff_calc_mask,  DMA_SNIFF_CTRL_SET, 1, STANDARD_CTRL);

// =============================
// function is determined by most recent set function
// both macros run a sniffed transfer of var_name into the bit bucket
// do the actual CRC32 
#define crc32_sniff(var_name,count) build_block(&var_name, &bit_bucket, count, STANDARD_CTRL | SNIFF_EN) ;
// add one or several times from one var or an array
// 'array' must have value var_read or array_read
#define add_sniff(var_name, count, array) build_block(&var_name, &bit_bucket, count, STANDARD_CTRL | SNIFF_EN | array) ;

// =============================
// 2's complement negative
// compute A-B as negate B and add A
// two blocks: invert every bit of sniff_data, then add one
#define neg_sniff() do{ \
    build_block(&sniff_xor_inv_mask,  DMA_SNIFF_DATA_XOR, 1, STANDARD_CTRL);\
    build_block(&dma_one,  &bit_bucket, 1, STANDARD_CTRL | SNIFF_EN);\
} while(0)

// =============================
// direct bit manipulation of sniff data
// mask must be an int variable containing a 32-bit value
// clr_sniff clears, and set_sniff sets, the bits given by the mask
#define clr_sniff(mask) build_block(&mask,  DMA_SNIFF_DATA_CLR, 1, STANDARD_CTRL);
#define set_sniff(mask) build_block(&mask,  DMA_SNIFF_DATA_SET, 1, STANDARD_CTRL);
// computes XOR of sniff_data with mask
#define xor_sniff(mask) build_block(&mask,  DMA_SNIFF_DATA_XOR, 1, STANDARD_CTRL);

// =============================
// logical inverse (NOT) of sniff_data
#define inv_sniff() xor_sniff(sniff_xor_inv_mask) ;

// =============================
// -OR- var_name into sniff
#define or_sniff(var_name) set_sniff(var_name) ;

// =============================
// use DeMorgan's law (a AND b) = NOT(NOT(a) OR NOT(b))
// invert the value in the sniff_data reg, then store it temporarily
// load the other variable and invert it
// -or- in the temp, then invert
// (overwrites sniff_temp1)
#define and_sniff(var_name) do{\
    inv_sniff();\
    store_sniff(sniff_temp1);\
    load_sniff(var_name);\
    inv_sniff();\
    or_sniff(sniff_temp1); \
    inv_sniff();\
} while(0);

// =============================
// shift left
// adds sniff_data to itself (a sniffed read of the sniff data register)
#define shift_left_sniff() build_block(&dma_hw->sniff_data, &bit_bucket, 1, STANDARD_CTRL | SNIFF_EN) ;

// =============================
// logical shift right
// bitreverse, shift left, bitreverse
// just plain ugly code.
// trigger a bitreverse by storing, then load it back, double it (shift left)
// store again with bitreverse to restore original order, then reload
// (overwrites sniff_temp1)
#define logical_shift_right_sniff() do{\
    store_sniff_bitrev(sniff_temp1) ;\
    load_sniff(sniff_temp1);\
    add_sniff(sniff_temp1, 1, var_read); \
    store_sniff_bitrev(sniff_temp1);\
    load_sniff(sniff_temp1);\
} while(0) ;

// same as above, but the reversed value is added 15 more times
// (x16 in the reversed domain), giving a shift right by 4
#define logical_shift_right_4_sniff() do{\
    store_sniff_bitrev(sniff_temp1) ;\
    load_sniff(sniff_temp1);\
    add_sniff(sniff_temp1, 15, var_read); \
    store_sniff_bitrev(sniff_temp1);\
    load_sniff(sniff_temp1);\
} while(0) ;

// same again with 255 extra adds (x256), giving a shift right by 8
#define logical_shift_right_8_sniff() do{\
    store_sniff_bitrev(sniff_temp1) ;\
    load_sniff(sniff_temp1);\
    add_sniff(sniff_temp1, 255, var_read); \
    store_sniff_bitrev(sniff_temp1);\
    load_sniff(sniff_temp1);\
} while(0) ;

// =============================
// arithmetic shift right
// even uglier code
// do the above, but -or- in only bit0 (the sign bit of the reversed number)
// (overwrites sniff_temp1 and sniff_temp2)
#define arith_shift_right_sniff() do{\
    store_sniff_bitrev(sniff_temp1) ;\
    load_sniff(sniff_temp1);\
    add_sniff(sniff_temp1, 1, var_read); \
    store_sniff(sniff_temp2); \
    load_sniff(sniff_temp1);\
    clr_sniff(sniff_sign_mask); \
    or_sniff(sniff_temp2);\
    store_sniff_bitrev(sniff_temp1);\
    load_sniff(sniff_temp1);\
} while(0) ;

// =============================
// multiply ONLY by relatively small, CONSTANT, positive ints.
// e.g. 4 or 20, but NOT 1000000 and NOT 0!
// the mult is done by iterative adds: the value is saved in sniff_temp1 and
// then added to the accumulator (constant-1) more times
#define mult_sniff(constant) do{\
    store_sniff(sniff_temp1); \
    add_sniff(sniff_temp1, constant-1, var_read);\
} while(0) ;

// =============================
// multiply by a small positive int held in a variable.
// e.g. 0, 4 or 20, but NOT 1000000
// the mult is done by iterative adds!
// Sequence: save x in sniff_temp1 and -x in sniff_temp2, compute var_name+1,
// and write that into the transfer-count word of the final add block.
// The accumulator is then reloaded with -x and x is added (var_name+1)
// times, leaving var_name * x. Starting from -x keeps the count at least 1,
// so var_name == 0 does not crash the macro.
// The '+8' in 'next_block2_addr+8' is applied after build_block's (int)
// cast, so it is a byte offset: the count word of the block two ahead.
// (overwrites sniff_temp1 and sniff_temp2)
#define mult_sniff_var(var_name) do{\
    store_sniff(sniff_temp1); \
    neg_sniff(); \
    store_sniff(sniff_temp2);\
    load_sniff(var_name);\
    add_sniff(dma_one,1,var_read);\
    build_block(&dma_hw->sniff_data, next_block2_addr+8, 1, STANDARD_CTRL); \
    load_sniff(sniff_temp2);\
    add_sniff(sniff_temp1, 1, var_read);\
} while(0) ;

// =============================
// unconditional jump
// 'block_addr' must be an int* variable assigned to an address
// the block copies that address into the fetch channel's read address
#define jump(block_addr) build_block(&block_addr, &dma_hw->ch[fetch_chan].read_addr, 1, STANDARD_CTRL);

// =============================
// conditional jump negative
// jump to two different locations on EITHER negative or positive/zero
// the argument addresses MUST be int* variables with block addresses
// the value compared is in sniff_data, which is byte reversed to put the
// sign bit in bit 7, then masked to either 16 or zero, depending on the sign
// That 0/16 byte offset is added to the address of the first of the two
// trailing jump blocks and written to the fetch read address, so 0 selects
// the not-negative jump and 16 (one block further) selects the negative jump.
// (overwrites sniff_temp1 and the accumulator)
#define jump_neg(block_addr_neg, block_addr_not_neg) do{\
    store_sniff(sniff_temp1) ; \
    build_block(&sniff_temp1,  &dma_hw->sniff_data, 1, STANDARD_CTRL | BSWAP); \
    clr_sniff(sniff_offset16_mask) ; \
    add_sniff(block_addr_array[N+2], 1, var_read)\
    build_block(&dma_hw->sniff_data, &dma_hw->ch[fetch_chan].read_addr, 1, STANDARD_CTRL);\
    jump(block_addr_not_neg) ; \
    jump(block_addr_neg) ; \
} while(0) ; 

// =============================
// jump and link for subroutine 
// saves the address of the block following the jump in link_addr
#define jump_link(function_addr) do{ \
    move(block_addr_array[N+2], link_addr, 1, DMA_SIZE_32, var_read, var_write) ; \
    jump(function_addr) ; \
}  while(0) ;

// return from subroutine: jump to the address saved in link_addr
#define jump_return() do{ \
    jump(link_addr) ; \
} while(0) ;

// =============================
// clock pacer
// assumes that one of the pace timers has been set up
// a dummy transfer whose trigger request is the given timer DREQ
#define pacer(timer_dreq) build_block(&bit_bucket, &bit_bucket,  1, \
            DMA_CHAIN_TO(fix_chan) | DMA_TREQ(timer_dreq) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_IRQ_QUIET | DMA_EN);

// =============================
// label-define macro
// records the address of the next block to be built in 'label_name'
#define label(label_name) do{label_name = current_block_addr;} while(0);

// =============================
// pointer to value
// this requires a copy from one block to the next block's source field:
// the first block writes the pointer held in sniff_data into the read
// address of the following move, which then loads the pointed-to value
// note that 'sniff_temp1' is just a placeholder for the pointer
#define ptr_to_value_sniff() do{\
    build_block(&dma_hw->sniff_data, next_block_addr, 1, STANDARD_CTRL); \
    move(sniff_temp1, dma_hw->sniff_data, 1, DMA_SIZE_32, var_read, var_write); \
} while(0) ;

// ================================================
// === User written DMA program
// ================================================
// Jump-target labels: each is assigned a block address by label() while
// the program is being built.
// every unique jump target must have a named address like this
// program start:
int * DMA_pgm_addr ;
// the next two are used by the conditional jump test:
int * pin_label_addr; 
int * extra_pulse_addr; 
// test jump and link function call
int * test_fun_addr ;
// test for repeat time
int * keep_counting_addr;
int * reset_time_addr;
// test for burst on
int * make_noise_addr;
int * make_silence_addr;
// merge point for output
int * spi_out_addr ;
// amplitude envelope control
int * rise_amp_addr;
int * hold_amp_addr;
int * fall_amp_addr;
int * fall_test_addr;
int * mult_sample_addr;

// Build the DMA program (tone burst with envelope plus noise, sent to the
// SPI DAC) into DMA_blocks. Nothing runs until DMA_machine_start() is called.
void DMA_machine_program(void){
	
    // pacing timer 3: system clock * 1/3100
    // with a 125 MHz clock that is about 40.3 kHz
    // (a divide of 3125 would be exactly 40 kHz, but the frequency match was
    //  better with a slightly smaller divide)
    // other rates: 1250 -> 100 kHz, 625 -> 200 kHz, 250 -> 500 kHz
    // can be user modified, or turned off if desired
    dma_timer_set_fraction ( 3, 1, 3100) ; //3125

    // enable sniffer add operation for DMA channel
    // this locks sniffer to execution channel, but the channel has to also enable on a per-block basis
    // sniffer_add
    dma_sniffer_enable(execute_chan, sniffer_add, true);

    // ================================================
    // define blocks to be executed
    // Build the DMA program
    // ================================================
    // Each of these DMA control blocks will be loaded by
    // the fetch channel into the execute channel, then the data will be moved
    // -- then the fetch channel is fixed and restarted to fetch the next block
    //================= ========== 
    // init block count. this is incremented by the build macro
    N = 0 ;  
    
    // define a jump target address label
    label(DMA_pgm_addr) ;

        // Timer 3 paces the whole loop (one pass per sample)
        pacer(DREQ_DMA_TIMER3) ; 

        // Pulse on
        gpio_out(2, pin_hi) ;

        // accumulator add increment
        load_sniff(dds_accum);
        add_sniff(dds_inc, 1, var_read) ;
        store_sniff(dds_accum);

        // generate new random sample
        load_sniff(rand_seed) ;
        set_sniff_CRC32();
        // The macro takes the address of its argument, so pass the register
        // itself (*rnd_reg): the DMA then reads the ROSC random-bit register.
        // Passing the pointer variable would hash the constant pointer value.
        crc32_sniff(*rnd_reg, 1) ;
        set_sniff_add();
        store_sniff(rand_seed) ;
        // scale noise for DAC
        clr_sniff(sniff_dac_data_mask2) ;
        // set gain between 0 and 16
        mult_sniff_var(rand_amp) ;
        logical_shift_right_4_sniff() ;
        store_sniff(rand_sample) ;

        // test for duration current_time - duration
        load_sniff(current_time) ;
        add_sniff(duration, 1, var_read) ;
        jump_neg(make_noise_addr, make_silence_addr);
        //
        label(make_noise_addr) ;
            // form pointer to next sine-table entry
            move_bswap(dds_accum, dma_hw->sniff_data, 1, DMA_SIZE_32);
            clr_sniff(clear_high_bytes) ;
            mult_sniff(4) ;
            add_sniff(sine_table_addr, 1, var_read) ;
            
            // sniff-data holds a POINTER to the sine value
            // need to convert that to a value
            ptr_to_value_sniff() ;
            // add in the noise sample
            add_sniff(rand_sample, 1, var_read);
            store_sniff(sine_sample) ;

            // envelope: rise - hold - fall
            // current_time - rise_time
            load_sniff(current_time) ;
            add_sniff(rise_time, 1, var_read) ;
            jump_neg(rise_amp_addr, fall_test_addr);
            //
            // during rise add 1 to amp (max 256)
            label(rise_amp_addr) ;
                load_sniff(current_amp);
                add_sniff(dma_one, 1, var_read);
                store_sniff(current_amp);
                jump(mult_sample_addr);
             
            label(fall_test_addr);
                // test for falling versus holding
                load_sniff(current_time);
                add_sniff(fall_time, 1, var_read);
                jump_neg(hold_amp_addr, fall_amp_addr);
            //
            // during fall subtract one from amplitude
            label(fall_amp_addr);
                load_sniff(current_amp);
                add_sniff(dma_neg_one, 1, var_read);
                store_sniff(current_amp);
                jump(mult_sample_addr);
            //
            // during hold, set amp to maximum (256)
            label(hold_amp_addr);
                load_sniff(max_amp);
                store_sniff(current_amp);
            //
            // AM modulate sine sample with the current amplitude
            // mult by amp, divide by 256 to make 0 to 1 fraction
            label(mult_sample_addr) ;
                load_sniff(sine_sample) ;
                mult_sniff_var(current_amp);
                logical_shift_right_8_sniff();
                jump(spi_out_addr);
        //
        // if time is above duration cutoff, make the amp zero
        label(make_silence_addr);
            load_sniff(dma_zero);
            store_sniff(current_amp);
        //
        label(spi_out_addr) ;
        // =or= in the DAC control word
        or_sniff(dac_config_mask) ;
        // transfer 16 bits to spi0 data reg
        move(dma_hw->sniff_data, spi0_hw->dr, 1, DMA_SIZE_16, var_read, var_write);

        // inc time, but reset on repeat interval
        load_sniff(current_time);
        add_sniff(dma_one,1, var_read);
        store_sniff(current_time);
        // test for end
        // current_time - repeat_time
        add_sniff(repeat_time, 1, var_read);
        jump_neg(keep_counting_addr, reset_time_addr);
        label(reset_time_addr);
            load_sniff(dma_zero);
            store_sniff(current_time);
        label(keep_counting_addr) ;
        // signal end of sample
        gpio_out(2, pin_lo) ;
    // ===  unconditional jump to start of program
    jump(DMA_pgm_addr) ;

    // === END OF DMA PROGRAM ===
    //  just for bookkeeping and debugging
    printf("num block=%d\n\r", N-1);
    // N counts every block requested; more than max_blocks means the
    // program did not fit in DMA_blocks and must not be trusted
    if (N > max_blocks) {
        printf("DMA program too long: %d blocks, max %d\n\r", N, max_blocks);
    }
}


// ==================================================
// === graphics demo -- RUNNING on core 0
// ==================================================
// Draws a static title bar once, then idles, waking each time the serial
// thread raises new_value.
static PT_THREAD (protothread_graphics(struct pt *pt)) {
    PT_BEGIN(pt);
    // the protothreads interval timer
    PT_INTERVAL_INIT() ;

    // background: fillRect takes (x, y, width, height), so the full
    // 320x240 screen needs width 320 and height 240
    fillRect(0, 0, 320, 240, BLACK);

    // Draw some filled rectangles
    fillRect(0, 0, 76, 10, BLUE); // blue box
    fillRect(100, 0, 150, 10, WHITE); // white box
    //fillRect(200, 0, 76, 10, GREEN); // green box

    // Write some text
    setTextColor(WHITE) ;
    setCursor(10, 1) ;
    setTextSize(1) ;
    writeString("ECE 4760") ;

    setTextColor(BLACK) ;
    setCursor(102, 1) ;
    setTextSize(1) ;
    writeString("VGA and DMAcpu ") ;

    while(true) {
        // new_value set by serial thread
        PT_YIELD_UNTIL(pt, new_value==true) ;
        new_value = false ;
        // nothing is redrawn at present
   }
   PT_END(pt);
} // graphics thread

// ==================================================
// === heartbeat thread on core 0
// ==================================================
// Blinks the LED on GPIO 25: the pin is toggled every 0.1 second.
static PT_THREAD (protothread_toggle25(struct pt *pt))
{
    // GPIO number of the LED
    enum { led_gpio = 25 };

    PT_BEGIN(pt);
    // static so the state survives across yields
    static bool led_on = false ;

    // configure the LED pin as an output and switch it on
    gpio_init(led_gpio) ;
    gpio_set_dir(led_gpio, GPIO_OUT) ;
    gpio_put(led_gpio, true);

    // data structure for interval timer
    PT_INTERVAL_INIT() ;

    for(;;) {
        // yield time 0.1 second
        PT_YIELD_INTERVAL(100000) ;

        // toggle the LED
        led_on = !led_on ;
        gpio_put(led_gpio, led_on);
        // NEVER exit the loop
    }
    PT_END(pt);
} // blink thread


// ==================================================
// === user's serial input thread on core 0
// ==================================================
// serial_read and serial_write do not block any thread
// except this one
//
// Prompts for "frequency duration repeat noise-gain" and applies the values
// to the globals read by the DMA program. A line that does not contain all
// four fields, or whose frequency is outside [0, Fs), is ignored and the
// prompt is repeated.
static int r=7, g=7, b=3 ;
static PT_THREAD (protothread_serial(struct pt *pt))
{
    PT_BEGIN(pt);
        // Parsed fields. They are written by sscanf and consumed before the
        // next yield, so they do not need to be static.
        float freq ;
        int dur, rep, amp ;
      //
      while(1) {
        // print prompt
        sprintf(pt_serial_out_buffer, "input frequency, duration, repeat, noise gain(0-16): ");
        // spawn a thread to do the non-blocking write
        serial_write ;

        // spawn a thread to do the non-blocking serial read
         serial_read ;

        // only act on a complete, sane line: otherwise the fields would be
        // uninitialized, and the phase-increment conversion out of range
        if (sscanf(pt_serial_in_buffer, "%f %d %d %d", &freq, &dur, &rep, &amp) == 4
            && freq >= 0 && freq < Fs) {
            // clamp the gain BEFORE publishing it: the DMA program uses
            // rand_amp as a repeat count while it is running
            if(amp > 16) amp = 16 ;
            if(amp < 0) amp = 0 ;

            // dds_inc is unsigned: frequencies of Fs/2 and above would not
            // fit in a signed int
            dds_inc = (unsigned int) (freq * 4294967296 / Fs) ;
            // times are stored negated (see the DMA program)
            repeat_time = -rep;
            duration = -dur ;
            current_time = 0 ;
            // the fall starts 256 samples before the end of the burst
            fall_time = duration + 256 ; 
            rand_amp = amp ;

            new_value = true ;
        }
        // NEVER exit while
      } // END WHILE(1)
  PT_END(pt);
} // serial thread


// ==========================================
// === Ring Osc random bit RNG
// configure the ring oscillator for higher speed
// ==========================================

void rosc_setup(void){
  // divider of one (used for the frequency measurement)
  *(volatile uint32_t *)(ROSC_BASE + ROSC_DIV_OFFSET) = ROSC_DIV_VALUE_PASS + 1 ;

  // speed up the ROSC so there are more cycles between reads
  // (don't use ROSC_CTRL_FREQ_RANGE_VALUE_TOOHIGH)
  // author's note: measured at 241 MHz with these settings
  *(volatile uint32_t *)(ROSC_BASE + ROSC_CTRL_OFFSET) = ROSC_CTRL_FREQ_RANGE_VALUE_HIGH ;// | ROSC_CTRL_ENABLE_VALUE_ENABLE;

  // password in the upper half-word, all drive-strength bits set in the lower
  *(volatile uint32_t *)(ROSC_BASE + ROSC_FREQA_OFFSET) = (ROSC_FREQA_PASSWD_VALUE_PASS<<16) | 0xffff ;
  *(volatile uint32_t *)(ROSC_BASE + ROSC_FREQB_OFFSET) = (ROSC_FREQB_PASSWD_VALUE_PASS<<16) | 0xffff ;
}

// ==========================================================
// basic random number gen with Von Neumann extractor
// and a small delay in the extractor
// AND one pass through rand() function
// https://en.wikipedia.org/wiki/Randomness_extractor
//
// Collects 32 extracted bits from the ROSC random-bit register, uses them
// to seed the C library generator with srand(), and returns one rand() value.
// ==========================================================

uint32_t rand_rosc_VN(void){
    volatile uint32_t *random_bit_reg = (volatile uint32_t *)(ROSC_BASE + ROSC_RANDOMBIT_OFFSET);
    // unsigned and zero-initialized: the accumulator is shifted left 32 times
    uint32_t random = 0 ;

    for(int k=0; k<32; k++){
      uint32_t random_bit1, random_bit2 ;
      // von Neumann bit extractor: sample pairs until the two bits differ
      do {
        random_bit1 = 0x00000001 & (*random_bit_reg);
        //  a small delay decorrelates the bits
        __asm__ volatile("nop") ; __asm__ volatile("nop") ;
        __asm__ volatile("nop") ; __asm__ volatile("nop") ;
        __asm__ volatile("nop") ; __asm__ volatile("nop") ;
        __asm__ volatile("nop") ; __asm__ volatile("nop") ;
        random_bit2 = 0x00000001 & (*random_bit_reg);
      } while(random_bit1 == random_bit2);
      // the two are different, use the first one to build the 32 bit sample
      random = (random << 1) | random_bit1 ;
    }
    srand(random) ;
    return (uint32_t)rand() ;
}


// ========================================
// === core 1 main
// ========================================
// Entry point for core 1. No threads are added here at present
// (candidates: protothread_toggle_gpio4, protothread_serial), so it only
// starts the scheduler.
void core1_main(void){
  // === initialize the scheduler ==========
  pt_schedule_start ;
  // NEVER exits
}

// ========================================
// === core 0 main
// ========================================
int main(){
    // set the clock
    //set_sys_clock_khz(250000, true); // 171us
    // start the serial i/o
    stdio_init_all() ;
    // announce the threader version on system reset
    printf("\n\rProtothreads RP2040 v1.11 two-core\n\r");

    // Initialize the VGA screen
    initVGA() ;

    // Initialize SPI channel (channel, baud rate set to 20MHz)
    // connected to spi DAC
    spi_init(SPI_PORT, 20000000) ;
    // Format (channel, data bits per transfer, polarity, phase, order)
    spi_set_format(SPI_PORT, 16, 0, 0, 0);
    // Map SPI signals to GPIO ports
    //gpio_set_function(PIN_MISO, GPIO_FUNC_SPI);
    gpio_set_function(PIN_SCK, GPIO_FUNC_SPI);
    gpio_set_function(PIN_MOSI, GPIO_FUNC_SPI);
    gpio_set_function(PIN_CS, GPIO_FUNC_SPI) ;

    // dds table 12 bit values
    // must or in control word in DMAcpu
    for(int i=0; i<256; i++) {
        // sine table is in naural +1/-1 range
        sine_table[i] = (int)(1024 + 1000 * sin(2*3.1416*i/256))  ;
    }

    //start the ROSC and speed it up
    rosc_setup();

    // seed the CRC computation
    rand_seed = rand_rosc_VN() ;


    // define the DMA program
    DMA_machine_program() ;

    // turn on the DMA coprocessor   
    // runs automomously after this
    DMA_machine_start() ;
     
  // start core 1 threads
  //multicore_reset_core1();
  //multicore_launch_core1(&core1_main);

  // === config threads ========================
  // for core 0
  pt_add_thread(protothread_graphics);
  pt_add_thread(protothread_toggle25);
  pt_add_thread(protothread_serial) ;
  //
  // === initalize the scheduler ===============
  pt_schedule_start ;
  // NEVER exits
  // ===========================================
} // end main