 /* 

 --- DMA machine (DMAcpu) --- DMA-CpU
 This is a fetch-execute cpu where the 'fetch' function is done by one DMA block, which
 loads DMA control block images from RAM into another 'execution' DMA block. The 'program' which is
 loaded consists of a carefully crafted series of memory copy commands which together
 act as a general purpose computer. The design is made easier by several transport-
 triggered actions in the DMA subsystem. These include an adder in the 'channel sniffer'
 and atomic SET/CLEAR/XOR write functions on all SFRs, including the sniffer.

 No ARM cpu resources are used when the DMAcpu is running. Other DMA channels can
 be used as needed, including to build another copy of the DMAcpu.

 This example implements white noise generation, followed by
 low pass filtering with a 1-pole IIR filter

*/

#include "hardware/gpio.h"
#include "hardware/timer.h"
#include "hardware/spi.h"
#include "pico/stdlib.h"
#include "hardware/uart.h"
#include "stdio.h"
#include <string.h>
#include <pico/multicore.h>
#include "hardware/sync.h"
#include "hardware/dma.h"
#include "hardware/gpio.h"
#include "math.h"
#include "hardware/structs/iobank0.h"
#include "hardware/regs/rosc.h"

// ==========================================
// === protothreads setup
// ==========================================
// protothreads header
#include "pt_cornell_rp2040_v1.1.h"

// === global thread + DMA communication
// These variables are read and/or written by DMA transfers built in
// DMA_machine_program(), so they must be volatile: the compiler cannot
// see the DMA modifications.
// a zero-load constant -- don't change (used to clear the sniffer data register)
volatile int dma_zero = 0 ;
// a one-load constant -- don't change (added through the sniffer to form a two's complement)
volatile int dma_one = 1 ;
// a minus-one constant; not referenced by the DMA program visible in this file
volatile int dma_neg_one = -1 ;

// noise / low-pass filter working variables
// running shift register of random bits
volatile int dma_noise_temp = 0;
// previous filter output; overwritten with the new output on every pass
volatile int dma_lp_old_out = 0;
// previous filter output after the right shift
volatile int dma_lp_old_out_shifted = 0;
// new filter input (the 12-bit white-noise sample)
volatile int dma_lp_input = 0;
// new filter input after the right shift
volatile int dma_lp_input_shifted = 0;
// scratch word used to hold the bit-reversed sniffer contents while shifting
volatile int dma_sniff_temp = 0 ;

// value typed by the user in the serial thread (initially 2).
// No block of the DMA program visible in this file reads it; the blocks
// that would are commented out.
volatile int dma_flag = 2 ;

// ========================================
// === SPI setup for the external DAC
// ========================================
// main() initializes this port and maps these pins; the DMA program
// writes the finished 16-bit DAC words to the port's data register.
#define SPI_PORT spi0
#define PIN_CS   5
#define PIN_SCK  6
#define PIN_MOSI 7

// Constants that tell the SPI DAC what to do.
// Each one occupies the top four bits of the 16-bit word and is
// prepended (OR-ed) onto a 12-bit sample.
// A-channel, 1x, active  (binary 0011 in bits 15:12)
#define DAC_config_chan_A (0x3 << 12)
// B-channel, 1x, active  (binary 1011 in bits 15:12)
#define DAC_config_chan_B (0xB << 12)

// ==========================================
// === DMA machine setup
// ==========================================
// Size of one DMA control block image, in 32-bit words
// (read address, write address, transfer count, control word).
#define length_of_block 4
// Capacity of the DMA program, in control blocks.
#define max_blocks 100
// The DMA program itself: up to max_blocks control block images stored
// back to back, each length_of_block words long.
int DMA_blocks[max_blocks * length_of_block];
// Base address of the DMA block list. Kept in a variable so that a DMA
// transfer can read it from memory (fetch start address and jump-to-start).
int *DMA_blocks_addr = &DMA_blocks[0];

// Index of the next control block to create; advanced by build_block().
int N = 0;
// /dev/null - a data sink for transfers whose only purpose is to pass
// data through the sniffer.
int bit_bucket;

// DMA channel assignments. Kept symbolic to make it easier to change
// channels so that the machine will work with other software, like video.
//   fetch_chan   : copies the next control block image into the execute channel
//   execute_chan : performs the transfer described by that control block
//   fix_chan     : restores the fetch channel's write address after each block
#define fetch_chan 5
#define execute_chan 3
#define fix_chan 4

// Addresses of control blocks inside DMA_blocks. All of them are derived
// from length_of_block so they stay consistent with the array layout.
// address of the current block (the one build_block() will fill next)
#define current_block_addr (&DMA_blocks[length_of_block*(N)])
// address of the next block
#define next_block_addr (&DMA_blocks[length_of_block*(N+1)])
// address of the block 2 ahead
#define next_block2_addr (&DMA_blocks[length_of_block*(N+2)])
// address of an arbitrary block, by block index
#define block_addr(index) (&DMA_blocks[length_of_block*(index)])
// Macro to append one DMA control block to the program.
// inputs: read addr, write addr, count, ctrl word (4 bytes each)
// Stores a 16 byte channel control block image in the DMA_blocks array at
// the current block count, N, then increments the block count N:
//   DMA_blocks[N*length_of_block + 0] = read address
//   DMA_blocks[N*length_of_block + 1] = write address
//   DMA_blocks[N*length_of_block + 2] = transfer count
//   DMA_blocks[N*length_of_block + 3] = control word
// Every argument is parenthesized in the expansion so that the (int) cast
// and the assignment apply to the WHOLE argument expression. Without the
// parentheses an argument such as `next_block_addr+2` would be cast to int
// first and then have 2 (bytes, not words) added to it.
// NOTE: there is no bounds check; the caller must keep N below max_blocks.
#define build_block(read_addr, write_addr, count, ctrl) \
    do { \
        DMA_blocks[length_of_block*N]     = (int)(read_addr); \
        DMA_blocks[length_of_block*N + 1] = (int)(write_addr); \
        DMA_blocks[length_of_block*N + 2] = (count); \
        DMA_blocks[length_of_block*N + 3] = (ctrl); \
        N++ ; \
    } while(0)

// Macros for defining the DMA CTRL word bits.
// -- This duplicates some of the SDK --
// but it is convenient to have the fields spelled out here.
// Macro arguments are parenthesized so that compound expressions
// (for example `a | b`) are masked and shifted as a whole.

// data_width is 0==byte 1==short 2==int  (bits 3:2)
#define DMA_DATA_WIDTH(data_width) (((data_width) & 0x03)<<2)
// give this channel more access if several channels are on
#define DMA_HIGH_PRI  (1<<1)
// turn on the channel
#define DMA_EN  1
// increment or keep constant the write and read addresses;
// keeping an address constant is useful for peripheral write/read
#define DMA_WR_INC  (1<<5)
#define DMA_RD_INC  (1<<4)
// enable this channel for the sniffer
#define SNIFF_EN (1<<23)
// reverse order of bytes when channel is transmitting
#define BSWAP (1<<22)
// turn off channel-done IRQ
#define DMA_IRQ_QUIET  (1<<21)
// More trigger info: page 114
// 0x0 to 0x3a == select DREQ n as TREQ from table above
// 0x3b == Select Timer 0 as TREQ
// 0x3c == Select Timer 1 as TREQ
// 0x3d == Select Timer 2 as TREQ (Optional)
// 0x3e == Select Timer 3 as TREQ (Optional)
// 0x3f == Permanent request, for unpaced transfers.
// bits 20:15 trigger request source
#define DMA_TREQ(trigger_source)  (((trigger_source) & 0x3f)<<15)
// When this channel completes, it will trigger the channel
// indicated by CHAIN_TO. Disable by setting CHAIN_TO = (this channel).
// bits 14:11 next channel #
#define DMA_CHAIN_TO(next_ch) (((next_ch) & 0x0f)<<11)
// The control word used by most program blocks: one unpaced 32-bit
// transfer with no address increment, no IRQ, chaining to the fix channel.
#define STANDARD_CTRL (DMA_CHAIN_TO(fix_chan) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_IRQ_QUIET | DMA_EN)

// DMA sniffer: calculation selectors, atomic-write register aliases and
// the mask words the DMA program feeds to them.
// The sniffer is used to simulate an accumulator with logic operations;
// it can also compute add, CRC, parity, and bit-inversion.
#define sniffer_add 0x0f
#define sniffer_crc32 0x00

// Atomic-write aliases of the two sniffer registers. Each address is
//   DMA base (0x50000000) + alias (0x1000 XOR, 0x2000 SET, 0x3000 CLR)
//   + register offset (0x434 sniffer control, 0x438 sniffer data).
#define DMA_SNIFF_CTRL_XOR   (0x50000000 + 0x1000 + 0x434)
#define DMA_SNIFF_CTRL_SET   (0x50000000 + 0x2000 + 0x434)
#define DMA_SNIFF_CTRL_CLR   (0x50000000 + 0x3000 + 0x434)
#define DMA_SNIFF_DATA_XOR   (0x50000000 + 0x1000 + 0x438)
#define DMA_SNIFF_DATA_SET   (0x50000000 + 0x2000 + 0x438)
#define DMA_SNIFF_DATA_CLR   (0x50000000 + 0x3000 + 0x438)
// logic recipes built from the aliases:
//   A OR  B : load A, then SET using B as the mask
//   A AND B : load B, then CLR using NOT(A) as the mask
//   A XOR B : load A, then XOR using B as the mask
//   NOT A   : load A, then XOR with 0xffffffff

// Sniffer control fields. The default calculation is add, BUT the function
// can be changed by the DMA machine itself while it runs.
// These two options invert or reverse the bits when WRITING the sniffer
// contents out to some location.
#define OUT_INV (1<<11)
#define OUT_REV (1<<10)
// calc field is 4 bits, 8:5: all bits set is add, zero is CRC32
#define CALC_ADD (0xf<<5)
#define CALC_CRC (0x0<<5)

// Mask words kept in memory so DMA blocks can read them.
int sniff_inv_mask = OUT_INV ;
int sniff_rev_mask = OUT_REV ;
// every bit of the calc field (identical to CALC_ADD)
int sniff_calc_mask = CALC_ADD ;
// all ones: inverts every bit when written to DMA_SNIFF_DATA_XOR
int sniff_xor_mask = ~0 ;
// Clear all but a few bits to generate offsets for jumping,
// after a BSWAP puts the sign bits into bits 7:4.
// keeps only bit 4 (16)
int sniff_offset16_mask = ~0x10 ;
// keeps only bit 5 (32)
int sniff_offset32_mask = ~0x20 ;
// keeps only bits 5:4 (48)
int sniff_offset48_mask = ~0x30 ;
// everything above the low 12 bits; used with the CLR alias to limit
// the sniffer data to a 12-bit DAC sample
int sniff_dac_data_mask = ~0xfff ;
// DAC configuration words kept in memory so that a DMA block can OR them
// into the sniffer data register before the sample is sent to the DAC.
int dac_config_mask_A = DAC_config_chan_A;
int dac_config_mask_B = DAC_config_chan_B;

// Words the DMA program writes to the GPIO 2 control register
// (iobank0_hw->io[2].ctrl) to raise and lower the pin.
// NOTE(review): the two values differ only in bit 8; presumably this is the
// output-override field of the control register (drive high vs drive low)
// with the output-enable override also set -- confirm against the datasheet.
int pin_on = 0x3300 ; // pin high
int pin_off = 0x3200 ; // pin low

// The address of the execute channel's read-address register, i.e. the
// first word of that channel's register block.
// NOTE(review): scaling the channel number by DMA_CH1_READ_ADDR_OFFSET
// presumably relies on that offset being equal to the per-channel register
// stride -- confirm.
int DMA_execute_addr = (DMA_BASE + execute_chan * DMA_CH1_READ_ADDR_OFFSET) ;
// Pointer to the variable above; the fix channel reads through it to
// reload the fetch channel's write address.
int * DMA_execute_addr_ptr = &DMA_execute_addr ;

// pointers to specific program locations for jump tests
// note that these are evaluated by the DMA blocks at runtime
// based on assignments at compile time.
// Each is used to uniquely specify the target of a jump operation.
// None of them is assigned or read by the code visible in this file.
int * jump_skip_zero_addr ;
int * jump_compare_zero_addr ;
int * jump_compare_end_addr ;

// ==========================================
// === Ring Osc random bit RNG
// setup for higher speed oscillator
// used for random bit
// ==========================================
// Address of the ring oscillator's random-bit register; the DMA program
// reads it as its noise source.
volatile uint32_t *rnd_reg = (uint32_t *)(ROSC_BASE + ROSC_RANDOMBIT_OFFSET);

// Configure the ring oscillator for a higher speed so that more
// oscillator cycles elapse between reads of the random bit.
// Takes no arguments and returns nothing; it only writes four ROSC
// registers, in this order: divider, control, FREQA, FREQB.
void rosc_setup(void){
  // value placed in the low 16 bits of both frequency registers
  const uint32_t freq_bits = 0xffff ;

  // divider of one (the divider value is added to the PASS constant);
  // chosen for frequency measurement
  *(volatile uint32_t *)(ROSC_BASE + ROSC_DIV_OFFSET) = ROSC_DIV_VALUE_PASS + 1 ;

  // high frequency range (don't use ROSC_CTRL_FREQ_RANGE_VALUE_TOOHIGH).
  // The enable value is deliberately not OR-ed in here.
  // Measured at 241 MHz with these settings.
  *(volatile uint32_t *)(ROSC_BASE + ROSC_CTRL_OFFSET) = ROSC_CTRL_FREQ_RANGE_VALUE_HIGH ;

  // both frequency registers: password in the upper half-word so the
  // write is accepted, all ones in the lower half-word
  *(volatile uint32_t *)(ROSC_BASE + ROSC_FREQA_OFFSET) =
      (ROSC_FREQA_PASSWD_VALUE_PASS<<16) | freq_bits ;
  *(volatile uint32_t *)(ROSC_BASE + ROSC_FREQB_OFFSET) =
      (ROSC_FREQB_PASSWD_VALUE_PASS<<16) | freq_bits ;
}
// =======================================================
// set up the fetch/execute, to start the machine
// set machine (can be user modified)
// =======================================================
void DMA_machine_start(void) {
    // Configures the pacing timer, the sniffer, the fix channel and the
    // fetch channel, then triggers the fetch channel. From that point the
    // DMA program in DMA_blocks runs without further help from the CPU.

    // Pacing timer 3 runs at 1/2500 of its clock.
    // Reference values for a 125 MHz clock (can be user modified, or the
    // pacing turned off if desired):
    //   1250 -> 100 KHz, 625 -> 200 KHz, 250 -> 500 KHz, 2500 -> 50 KHz
    dma_timer_set_fraction(3, 1, 2500) ;

    // Attach the sniffer to the execute channel with the add calculation.
    // Individual program blocks still opt in by setting SNIFF_EN in their
    // own control word.
    dma_sniffer_enable(execute_chan, sniffer_add, true) ;

    // ======================================================
    // execution machine -- fetch/execute state machine
    // it is very unlikely that you should modify this
    // ======================================================

    // === fix channel
    // After every executed block this channel copies the address held in
    // DMA_execute_addr into the fetch channel's write-address register, so
    // the next fetch again lands on the execute channel's registers.
    // It then chains to the fetch channel.
    dma_channel_config fix_cfg = dma_channel_get_default_config(fix_chan) ;
    // unpaced, full speed transfer
    channel_config_set_dreq(&fix_cfg, DREQ_FORCE) ;
    channel_config_set_chain_to(&fix_cfg, fetch_chan) ;
    channel_config_set_transfer_data_size(&fix_cfg, DMA_SIZE_32) ;
    channel_config_set_read_increment(&fix_cfg, false) ;
    channel_config_set_write_increment(&fix_cfg, false) ;
    channel_config_set_irq_quiet(&fix_cfg, true) ;
    channel_config_set_enable(&fix_cfg, true) ;
    dma_channel_configure(
        fix_chan, &fix_cfg,
        &dma_hw->ch[fetch_chan].write_addr, // destination: fetch channel write address
        DMA_execute_addr_ptr,               // source: variable holding the execute channel address
        1,                                  // one word per transfer
        false) ;                            // configure only, do not start yet

    // === fetch channel
    // This is the program counter and fetch unit: it copies one
    // four-word control block image from DMA_blocks into the execute
    // channel's registers, then chains to the execute channel.
    // NOTE that this channel could be paced by a timer or other TREQ
    //    for DDS, or other operations.
    dma_channel_config fetch_cfg = dma_channel_get_default_config(fetch_chan) ;
    // unpaced, full speed transfer
    channel_config_set_dreq(&fetch_cfg, DREQ_FORCE) ;
    channel_config_set_chain_to(&fetch_cfg, execute_chan) ;
    channel_config_set_transfer_data_size(&fetch_cfg, DMA_SIZE_32) ;
    channel_config_set_read_increment(&fetch_cfg, true) ;
    channel_config_set_write_increment(&fetch_cfg, true) ;
    channel_config_set_irq_quiet(&fetch_cfg, true) ;
    channel_config_set_enable(&fetch_cfg, true) ;
    // Triggering this channel is what starts the DMAcpu machine!
    dma_channel_configure(
        fetch_chan, &fetch_cfg,
        &dma_hw->ch[execute_chan].read_addr, // destination: start of the execute channel registers
        DMA_blocks_addr,                     // source: start of the DMA blocks list
        4,                                   // four words per DMA control block
        true) ;                              // start now: the machine begins running
}

// ================================================
// === User written DMA program
// ================================================
void DMA_machine_program(void){
    // Builds the DMA program. Every build_block() call below appends one
    // control block image to DMA_blocks and advances the block counter N;
    // nothing is transferred here. The order of the calls IS the program.
    // restart the block count so the program is built from DMA_blocks[0]
    N = 0 ;  
    // ================================================
    // define blocks to be executed
    // ================================================
    // build_block(read_addr, write_addr, count, ctrl)
    // ================================================
    // Each of these DMA control blocks will be loaded by
    // the fetch channel into the execute channel, then the data will be moved
    // -- then the fetch channel is fixed and restarted to fetch the next block.
    // STANDARD_CTRL does not increment the read or write address, so a count
    // greater than one transfers the same source word repeatedly.
    //================= ========== 
    // start-of-pass marker: write pin_on to the GPIO 2 control register
    build_block(&pin_on,  &iobank0_hw->io[2].ctrl, 1, STANDARD_CTRL);
    // then write pin_off to the same register
    build_block(&pin_off,  &iobank0_hw->io[2].ctrl, 1, STANDARD_CTRL);

    // === noise source ===
    // set the sniffer calc field -> add
    build_block(&sniff_calc_mask,  DMA_SNIFF_CTRL_SET, 1, STANDARD_CTRL);
    // load the ROSC random-bit register into the sniffer data register
    build_block(rnd_reg,  &dma_hw->sniff_data, 1, STANDARD_CTRL);
    // pass dma_noise_temp through the sniffer twice: the sniffer now holds
    // (new random value) + 2*dma_noise_temp, i.e. a one-place shift register
     build_block(&dma_noise_temp, &bit_bucket, 2, STANDARD_CTRL | SNIFF_EN) ;
     // store the shifted value back to the variable
    build_block( &dma_hw->sniff_data, &dma_noise_temp, 1, STANDARD_CTRL);

    // take the shifted random bits (still in the sniffer) as a CRC seed:
    // clear the sniffer calc field -> CRC32
    build_block(&sniff_calc_mask,  DMA_SNIFF_CTRL_CLR, 1, STANDARD_CTRL);
    // compute the CRC32 by passing the variable through the sniffer once
    build_block(&dma_noise_temp, &bit_bucket, 1, STANDARD_CTRL | SNIFF_EN) ;

    // set the sniffer calc field back -> add
    build_block(&sniff_calc_mask,  DMA_SNIFF_CTRL_SET, 1, STANDARD_CTRL);
    // limit sniffer data to 12 bits (clear everything above bit 11)
    build_block( &sniff_dac_data_mask, DMA_SNIFF_DATA_CLR, 1, STANDARD_CTRL);
    // and save as the low pass filter input
    build_block(&dma_hw->sniff_data, &dma_lp_input, 1, STANDARD_CTRL) ;

    // === 1-pole low pass filter ===
    // form: output = old_output + [(input-old_output) >> n]
    // steps: shift new input, shift old output, subtract the two,
    // add old output.
    // A right shift is done by reading the sniffer bit-reversed, shifting
    // the reversed value LEFT by repeated addition, and reading it out
    // bit-reversed again.

    // Number of times the reversed value is added into the cleared sniffer.
    // use ONLY: 1, 2, 4, 8, 16, 32
    // NOTE(review): adding a value k times multiplies it by k, so a count of
    // 4 looks like a shift of 2 bit positions (divide by 4), not 4 bits --
    // confirm the intended filter coefficient.
    #define shift_bits 4
    // -- shift the new input (the sniffer still contains it) --
    // turn on bit reversal of sniffer reads
    build_block(&sniff_rev_mask,  DMA_SNIFF_CTRL_SET, 1, STANDARD_CTRL);
    // store the bit-reversed input in the scratch variable
    build_block( &dma_hw->sniff_data, &dma_sniff_temp, 1, STANDARD_CTRL);
    // clear the sniffer data register
    build_block(&dma_zero,  &dma_hw->sniff_data, 1, STANDARD_CTRL); // the reversed value is added back in by the next block
    // add the reversed value shift_bits times (left shift on REVERSED data).
    // (A run-time count taken from dma_flag was tried here and is disabled.)
    build_block(&dma_sniff_temp, &bit_bucket, shift_bits, STANDARD_CTRL | SNIFF_EN) ;
    // read out with reversal still on: original bit order, shifted right
    build_block( &dma_hw->sniff_data, &dma_lp_input_shifted, 1, STANDARD_CTRL);
     // !!turn off reverse flag!!
     build_block(&sniff_rev_mask,  DMA_SNIFF_CTRL_CLR, 1, STANDARD_CTRL);

     // -- shift the old output the same way --
    // turn on bit reversal of sniffer reads
    build_block(&sniff_rev_mask,  DMA_SNIFF_CTRL_SET, 1, STANDARD_CTRL);
    // load the old output into the sniffer
    build_block(&dma_lp_old_out, &dma_hw->sniff_data, 1, STANDARD_CTRL) ;
    // store its bit-reversed form in the scratch variable
    build_block( &dma_hw->sniff_data, &dma_sniff_temp, 1, STANDARD_CTRL);
    // clear the sniffer data register
    build_block(&dma_zero,  &dma_hw->sniff_data, 1, STANDARD_CTRL);
    // add the reversed value shift_bits times (left shift on REVERSED data)
    build_block(&dma_sniff_temp, &bit_bucket, shift_bits, STANDARD_CTRL | SNIFF_EN) ;
    // read out with reversal still on: original bit order, shifted right
    build_block( &dma_hw->sniff_data, &dma_lp_old_out_shifted, 1, STANDARD_CTRL);
     // !!turn off reverse flag!!
     build_block(&sniff_rev_mask,  DMA_SNIFF_CTRL_CLR, 1, STANDARD_CTRL);

    // -- subtract: shifted input - shifted old output --
    // load the shifted old output into the sniffer data register
    build_block(&dma_lp_old_out_shifted,  &dma_hw->sniff_data, 1, STANDARD_CTRL);
    // invert all sniffer data bits using the XOR alias
    build_block(&sniff_xor_mask,  DMA_SNIFF_DATA_XOR, 1, STANDARD_CTRL);
    // add 1 to form the two's complement (i.e. -shifted old output)
     build_block(&dma_one,  &bit_bucket, 1, STANDARD_CTRL | SNIFF_EN);
    // add the shifted input by passing it through the sniffer
     build_block(&dma_lp_input_shifted,  &bit_bucket, 1, STANDARD_CTRL | SNIFF_EN);
    // the sniffer now contains (input >> n) - (old_output >> n)
    // add the (unshifted) old output
    build_block(&dma_lp_old_out, &bit_bucket, 1, STANDARD_CTRL | SNIFF_EN) ;
    // store the result as the new filter output
    build_block(&dma_hw->sniff_data, &dma_lp_old_out, 1, STANDARD_CTRL) ;

    // === output to the DAC ===
    // OR the channel A DAC control word into the filtered sample
    build_block( &dac_config_mask_A, DMA_SNIFF_DATA_SET, 1, STANDARD_CTRL);
    // send filtered noise to DAC channel A: a single 16-bit transfer to the
    // SPI data register
    build_block(&dma_hw->sniff_data,  &spi0_hw->dr, 1, 
        (DMA_CHAIN_TO(fix_chan) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_16) | DMA_IRQ_QUIET | DMA_EN));

    // now send the unfiltered white noise to channel B:
    // reload the saved filter input into the sniffer
    build_block( &dma_lp_input, &dma_hw->sniff_data, 1, STANDARD_CTRL) ;
    // OR in the channel B DAC control word
    build_block( &dac_config_mask_B, DMA_SNIFF_DATA_SET, 1, STANDARD_CTRL);
    // send to DAC (16-bit transfer to the SPI data register)
    build_block(&dma_hw->sniff_data,  &spi0_hw->dr, 1, 
        (DMA_CHAIN_TO(fix_chan) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_16) | DMA_IRQ_QUIET | DMA_EN));

    // end-of-pass marker
    // === set pin
    build_block(&pin_on, &iobank0_hw->io[2].ctrl, 1, STANDARD_CTRL);
    // === clear the pin
    // (this block was once the target of a conditional jump on dma_flag;
    //  no jump to it exists in the program as built here)
    build_block(&pin_off,  &iobank0_hw->io[2].ctrl, 1, STANDARD_CTRL);
     
    //================= ======= 

    // === unconditional jump to start of program
    // push the DMA_blocks[0] address (read from the DMA_blocks_addr variable)
    // into the program counter (fetch channel read pointer).
    // !!NOTE that this block throttles the machine to the frequency of Timer 3 !!
    // The timer ratio is set in DMA_machine_start() (1/2500 there, which its
    // comments list as 50 KHz).
    // To run at full DMA speed, change to DMA_TREQ(DREQ_FORCE)
    build_block(&DMA_blocks_addr, &dma_hw->ch[fetch_chan].read_addr, 1, 
        DMA_CHAIN_TO(fix_chan) | DMA_TREQ( DREQ_DMA_TIMER3) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_IRQ_QUIET | DMA_EN) ;

    // === END OF DMA PROGRAM ===
    //  
}

// ==================================================
// === user's serial input thread
// ==================================================
// serial_read and serial_write do not block any thread
// except this one
// Serial input thread: repeatedly prompts the user, reads one line and,
// when it starts with an integer, stores that integer in dma_flag.
// pt: protothread control structure supplied by the scheduler.
// Never exits its loop.
static PT_THREAD (protothread_serial(struct pt *pt))
{
    PT_BEGIN(pt);
      // Plain (non-volatile) landing spot for sscanf. dma_flag is volatile,
      // and sscanf must not write to a volatile object through an ordinary
      // int pointer. Declared static like the other protothread locals in
      // this file.
      static int requested_value ;
      //
      while(1) {
        // print prompt
        sprintf(pt_serial_out_buffer, "shift 2, 4, 8 ,16, 32: ");
        // spawn a thread to do the non-blocking write
        serial_write ;

        // spawn a thread to do the non-blocking serial read
        serial_read ;
        // convert input string to number; dma_flag is only updated when a
        // number was actually parsed, otherwise it keeps its previous value
        if (sscanf(pt_serial_in_buffer, "%d ", &requested_value) == 1) {
            dma_flag = requested_value ;
        }

        PT_YIELD_usec(50) ;
        // NEVER exit while
      } // END WHILE(1)
  PT_END(pt);
} // serial thread

// ==================================================
// === toggle25 thread 
// ==================================================
// the on-board LED blinks
// Blink thread: configures GPIO 25 (the on-board LED) as an output,
// turns it on, then inverts it after every yield.
// pt: protothread control structure supplied by the scheduler.
// Never exits its loop.
static PT_THREAD (protothread_toggle25(struct pt *pt))
{
    PT_BEGIN(pt);
    // static: the state must survive across yields
    static bool led_is_on = false ;
    // GPIO number of the on-board LED
    enum { onboard_led_gpio = 25 } ;

    // set up the LED pin as an output, initially on
    gpio_init(onboard_led_gpio) ;
    gpio_set_dir(onboard_led_gpio, GPIO_OUT) ;
    gpio_put(onboard_led_gpio, true) ;

    for (;;) {
        // yield for 10000 microseconds between toggles
        // NOTE(review): an earlier comment said 0.1 second, which would be
        // 100000 microseconds -- confirm the intended blink rate.
        PT_YIELD_usec(10000) ;
        // invert the LED state and drive the pin with it
        led_is_on = !led_is_on ;
        gpio_put(onboard_led_gpio, led_is_on) ;
        // NEVER exit this loop
    }
  PT_END(pt);
} // blink thread


// ========================================
// === core 1 main -- started in main below
// ========================================
// Entry point for core 1. No threads are added for core 1; the function
// only starts the protothreads scheduler, which never returns.
// Declared with an explicit (void) parameter list so the definition is a
// proper prototype.
void core1_main(void){

  //  === add threads  ====================
  // for core 1
  // none
  // === initialize the scheduler =========
  pt_schedule_start ;
  // NEVER exits
  // ======================================
}

// ========================================
// === core 0 main
// ========================================
// Core 0 entry point: brings up serial I/O and the SPI DAC port, speeds up
// the ring oscillator, builds and starts the DMA program, then runs the
// protothreads scheduler on core 0. Never returns.
int main(void){
  // start the serial i/o
  stdio_init_all() ;
  // announce the threader version on system reset
  printf("\n\rProtothreads RP2040 v1.1 two-core\n\r");

  // Initialize SPI channel (channel, baud rate set to 20MHz)
  // connected to the SPI DAC
  spi_init(SPI_PORT, 20000000) ;
  // Format (channel, data bits per transfer, polarity, phase, order).
  // The last argument is 1, the value of SPI_MSB_FIRST in the SDK's
  // spi_order_t. The SDK documents MSB-first as the only supported order;
  // passing 0 (SPI_LSB_FIRST) is rejected when its parameter checks are
  // enabled.
  spi_set_format(SPI_PORT, 16, 0, 0, 1);
  // Map SPI signals to GPIO ports (MISO is not mapped: nothing is read back)
  gpio_set_function(PIN_SCK, GPIO_FUNC_SPI);
  gpio_set_function(PIN_MOSI, GPIO_FUNC_SPI);
  gpio_set_function(PIN_CS, GPIO_FUNC_SPI) ;

  // start the ROSC and speed it up
  rosc_setup();

  // define the DMA program
  DMA_machine_program() ;

  // turn on the DMA coprocessor
  // runs autonomously after this
  DMA_machine_start() ;

  // Core 1 is not launched in this program; core1_main() is available
  // should threads be needed there.

  // === config threads ========================
  // for core 0
  pt_add_thread(protothread_serial);
  pt_add_thread(protothread_toggle25);
  //
  // === initialize the scheduler ==============
  pt_schedule_start ;
  // NEVER exits
  // ===========================================
} // end main
///////////
// end ////
///////////