 /* 

 --- DMA machine (DMAcpu) --- DMA-CpU
 This is a fetch-execute cpu where the 'fetch' function is done by one DMA block, which
 loads DMA control block images from RAM into another 'execution' DMA block. The 'program' which is
 loaded consists of a carefully crafted series of memory copy commands which together
 act as a general purpose computer. The design is made easier by several transport-
 triggered actions in the DMA subsystem. These include an adder in the 'channel sniffer'
 and atomic SET/CLEAR/XOR write functions on all SFRs, including the sniffer.

 No ARM cpu resources are used when the DMAcpu is running. Other DMA channels can
 be used as needed, including to build another copy of the DMAcpu.

 This example implements add, subtract, shift-left, shift-right, and a couple of different
 ways of generating a conditional jump.

# ====================================
# === Register write functions =======
# Each peripheral register block is allocated 4kB of address space,
# with registers accessed using one of 4 methods,selected by
# address decode.
# Addr + 0x0000 : normal read write access
# Addr + 0x1000 : atomic XOR on write
# Addr + 0x2000 : atomic bitmask set on write
# Addr + 0x3000 : atomic bitmask clear on write

======================================
DMA channels can only get to GPIO through PAD  override registers
//force enable output and output 1
iobank0_hw->io[2].ctrl = 0x3300 ;
// force enable and output zero
iobank0_hw->io[2].ctrl = 0x3200 ;

=== DAC ===
GPIO 5 (pin 7) Chip select -- spi hardware runs this without user code
GPIO 6 (pin 9) SCK/spi0_sclk
GPIO 7 (pin 10) MOSI/spi0_tx
3.3v (pin 36) -> VCC on DAC 
GND (pin 3)  -> GND on DAC

*/

#include "hardware/gpio.h"
#include "hardware/timer.h"
#include "hardware/spi.h"
#include "pico/stdlib.h"
#include "hardware/uart.h"
#include "stdio.h"
#include <string.h>
#include <pico/multicore.h>
#include "hardware/sync.h"
#include "hardware/dma.h"
#include "hardware/gpio.h"
#include "math.h"
#include "hardware/structs/iobank0.h"

// ==========================================
// === protothreads setup
// ==========================================
// protothreads header
#include "pt_cornell_rp2040_v1.1.h"

// === global thread+DMA communication
// These words are read and/or written by the DMA program behind the
// compiler's back, so they must be volatile.
// a zero load variable (constant source word for DMA loads) -- don't change
volatile int dma_zero = 0 ;
// a one-load variable (constant source word for DMA loads) -- don't change
volatile int dma_one = 1 ;
//  test variables
// A and B are the operands entered by the serial thread
volatile int dma_var_A = 0 ;
volatile int dma_var_B = 0 ;
// C receives A+B from the DMA program
volatile int dma_var_C = 0 ;
// D receives A+A (shift left by one) from the DMA program
volatile int dma_var_D = 0 ;
// E receives A-B from the DMA program
volatile int dma_var_E = 0 ;
// F is scratch/result for the logical shift right of A
volatile int dma_var_F = 0 ;
// G receives the sign test result: 0 if A>=0, 1 if A<0
// (initial value 2 matches neither, until the DMA program overwrites it)
volatile int dma_var_G = 2 ;

// signal from user serial thread; the DMA program uses it (range 0-3)
// as a block offset for its computed jump
volatile int dma_flag = 0 ;

// ========================================
// === spi setup (not used in this program)
// =======================================
// SPI configuration: GPIO numbers for the DAC connection described in the
// file header. Nothing in this file references them outside of the
// commented-out SPI init code in main().
#define PIN_CS   5
#define PIN_SCK  6
#define PIN_MOSI 7
#define SPI_PORT spi0

// ==========================================
// === DMA machine setup
// ==========================================
//  DMA control block size in words
//  (read addr, write addr, transfer count, ctrl -- see build_block):
#define length_of_block 4
// capacity of the main DMA block program list, in blocks
#define max_blocks 100
// the program memory: block images are stored back to back
int DMA_blocks[max_blocks * length_of_block];
// base address of DMA block list being defined; kept in a variable so a
// DMA block can read it and copy it into the fetch channel's read address
 int * DMA_blocks_addr = &DMA_blocks[0] ;
// counter for the current block to create (index of the next free block)
int N = 0;
// /dev/null -- write target for transfers whose only purpose is to
// pass data through the sniffer
int bit_bucket ;

// ---- Block address helpers ----
// Each block image occupies 4 consecutive ints of DMA_blocks, so block k
// starts at DMA_blocks + 4*k. N is the global "next block to build" counter,
// which means the first three macros change value as the program is built.
//
// Retired label scheme, kept for reference:
//#define branch_label() N 
//#define branch_label_to_addr(label) ( DMA_blocks + (label)*length_of_block )

// block that the next build_block() call will fill
#define current_block_addr (DMA_blocks + 4*(N))
// one block after the current one
#define next_block_addr    (DMA_blocks + 4*((N) + 1))
// two blocks after the current one
#define next_block2_addr   (DMA_blocks + 4*((N) + 2))
// any block, selected by its index in the program
#define block_addr(index)  (DMA_blocks + 4*(index))

// build_block(read_addr, write_addr, count, ctrl)
// Append one DMA control block image to the program being built.
//   read_addr  : source address of the transfer (pointer or integer)
//   write_addr : destination address of the transfer (pointer or integer)
//   count      : number of transfers
//   ctrl       : channel control word (see the DMA_* ctrl macros)
// The four values are stored as ints in DMA_blocks[4*N .. 4*N+3], in that
// order, and then the global block counter N is incremented.
// Every argument is parenthesized before use so that an expression argument
// (e.g. `ptr + 1` or `a | b`) is evaluated as a whole; in particular the
// (int) cast applies to the complete address expression, not just to its
// first operand. No bounds check is made against the size of DMA_blocks.
#define build_block(read_addr, write_addr, count, ctrl) \
    do { \
        DMA_blocks[4*(N)]     = (int)(read_addr); \
        DMA_blocks[4*(N) + 1] = (int)(write_addr); \
        DMA_blocks[4*(N) + 2] = (count); \
        DMA_blocks[4*(N) + 3] = (ctrl); \
        N++ ; \
    } while(0)

// macros for defining the DMA CTRL word bits
// --This duplicates some of the SDK --
// but i like having it here
// All function-like macros parenthesize their argument so that an
// expression argument (e.g. `a | b`) is masked as a whole.
// data_width is 0==byte 1==short 2==int ; field is 2 bits starting at bit 2
#define DMA_DATA_WIDTH(data_width) (((data_width) & 0x03)<<2) 
// give this channel more access if several channels are on
#define DMA_HIGH_PRI  (1<<1) 
// turn on the channel
#define DMA_EN  1  
// increment (flag set) or keep constant (flag absent) the write and read addr
// constant addresses are useful for peripheral write/read
#define DMA_WR_INC  (1<<5) 
#define DMA_RD_INC  (1<<4) 
// enable this channel for sniffer
#define SNIFF_EN (1<<23)
// reverse order of bytes when channel is transmitting
#define BSWAP (1<<22)
//turn off channel done IRQ
#define DMA_IRQ_QUIET  (1<<21)
// More trigger info page 114 (of the chip datasheet, presumably)
// 0x0 to 0x3a == select DREQ n as TREQ from table above
// 0x3b == Select Timer 0 as TREQ
// 0x3c == Select Timer 1 as TREQ
// 0x3d == Select Timer 2 as TREQ (Optional)
// 0x3e == Select Timer 3 as TREQ (Optional)
// 0x3f == Permanent request, for unpaced transfers.
// bits 15:20 trigger request source (6-bit field)
#define DMA_TREQ(trigger_source)  (((trigger_source) & 0x3f)<<15) 
// When this channel completes, it will trigger the channel
// indicated by CHAIN_TO. Disable by setting CHAIN_TO = (this channel).
// bits 11:14 next channel # (4-bit field)
#define DMA_CHAIN_TO(next_ch) (((next_ch) & 0x0f)<<11) 

// ---- DMA sniffer: calc modes, alias addresses and mask words ----
// The sniffer data register doubles as the accumulator of the DMA machine.

// calc-mode selectors in the form passed to dma_sniffer_enable()
#define sniffer_add   0xf
#define sniffer_crc32 0x0

// Atomic-write aliases of the two sniffer registers, spelled out as
//   0x50000000 + alias offset + register offset
// with alias offsets 0x1000 = XOR, 0x2000 = SET, 0x3000 = CLR (see the
// register write function table in the file header) and register offsets
// 0x434 for the control register, 0x438 for the data register.
#define DMA_SNIFF_DATA_XOR   (0x50000000 + 0x1000 + 0x438)
#define DMA_SNIFF_DATA_SET   (0x50000000 + 0x2000 + 0x438)
#define DMA_SNIFF_DATA_CLR   (0x50000000 + 0x3000 + 0x438)
#define DMA_SNIFF_CTRL_XOR   (0x50000000 + 0x1000 + 0x434)
#define DMA_SNIFF_CTRL_SET   (0x50000000 + 0x2000 + 0x434)
#define DMA_SNIFF_CTRL_CLR   (0x50000000 + 0x3000 + 0x434)
// Accumulator logic built from those aliases:
//   A OR  B : load A, then SET  with B as the mask
//   A AND B : load B, then CLR  with NOT(A) as the mask
//   A XOR B : load A, then XOR  with B as the mask
//   NOT A   : load A, then XOR  with 0xffffffff

// Sniffer control bits. The calc mode defaults to add, but the DMA program
// itself can rewrite it. OUT_REV / OUT_INV reverse / invert the bits of the
// sniffer value when it is WRITTEN out to some location.
#define OUT_REV (1 << 10)
#define OUT_INV (1 << 11)
// calc field is 4 bits, 5:8 -- all ones selects add, zero selects CRC32
#define CALC_CRC (0x0 << 5)
#define CALC_ADD (0xf << 5)

// Mask words living in RAM so that DMA blocks can use them as read sources
int sniff_inv_mask = OUT_INV ;
int sniff_rev_mask = OUT_REV ;
int sniff_calc_mask = CALC_ADD ;
// all ones: XOR-ing the sniffer data with this inverts every bit
int sniff_xor_mask = ~0 ;
// CLR masks that leave only bit 4 (16), bit 5 (32), or bits 4 and 5 (48)
// standing, used to turn a value into a jump offset
int sniff_offset16_mask = ~0x10 ;
int sniff_offset32_mask = ~0x20 ;
int sniff_offset48_mask = ~0x30 ;

// Global constants used as DMA read sources by the program.
// gpio 2: words written to iobank0_hw->io[2].ctrl by the DMA program
// (see the file header: 0x3300 forces output enable and output 1,
//  0x3200 forces output enable and output 0)
int pin_on = 0x3300 ; // force GPIO 2 high
int pin_off = 0x3200 ; // force GPIO 2 low
// The address of the DMA channel 1 read-address register, held as data so
// that channel 2 can copy it into channel 0's write address
int DMA1_addr = (DMA_BASE + DMA_CH1_READ_ADDR_OFFSET) ;
//int DMA1_addr = dma_hw->ch[1].read_addr ;
int * DMA1_addr_ptr = &DMA1_addr ;

// pointers for jump tests
// These are assigned while DMA_machine() builds the program (each is set to
// the address of a target block) and are then read by the DMA blocks at
// runtime. Each is used to uniquely specify the target of a jump operation.
// base target of the dma_flag computed jump (flag==0 case)
int * jump_skip_zero_addr ;
// target of the sign test when the value is zero or positive
int * jump_compare_zero_addr ;
// block following the sign test stanza
int * jump_compare_end_addr ;

// Build the DMAcpu program in DMA_blocks[] and start the machine.
//
// Each build_block() call appends one 4-word control block image.
// At run time:
//   - channel 0 (fetch / program counter) copies the next 4-word image into
//     channel 1's registers, starting at ch[1].read_addr;
//   - channel 1 executes that block, then chains to channel 2;
//   - channel 2 copies DMA1_addr into ch[0].write_addr (undoing channel 0's
//     write increment) and chains back to channel 0.
// A jump is a block that writes a new block address to ch[0].read_addr.
//
// Takes no parameters and returns nothing. Uses DMA channels 0, 1 and 2,
// DMA timer 3, the sniffer (bound to channel 1) and the GPIO 2 ctrl register.
// The order of the build_block() calls is the program order; the jump label
// pointers capture current_block_addr, so statements must not be reordered.
void DMA_machine(void){
    // init block count
    N = 0 ; 
    // pacing timer 100 KHz set denom to 125000000/100000 = 1250
    // (assumes a 125 MHz system clock)
    // 200 KHz  is 625 counter divide
    // 500 KHz is 250 
    dma_timer_set_fraction ( 3, 1, 1250) ;

    // enable sniffer add operation for DMA channel 1 
    // this locks sniffer to chan 1, but DMA1 has to also enable on a per-block basis
    // (via the SNIFF_EN bit in the block's ctrl word)
    dma_sniffer_enable(1, sniffer_add, true);
	
    // ================================================
    // define blocks to be executed
    // Build the DMA program
    // ================================================
    // build_block(read_addr, write_addr, count, ctrl)
    // ================================================
    // Each of these DMA control blocks will be loaded by
    // the DMA0 fetch channel into DMA1, then the data will be moved
    // -then the DMA0 channel is fixed and restarted to fetch the next block
    //================= ========== 
    // Pulse on GPIO 2: force high, then force low
    build_block(&pin_on,  &iobank0_hw->io[2].ctrl, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN) ;
    build_block(&pin_off,  &iobank0_hw->io[2].ctrl, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN) ;

    //=================  ==========
    // === add two variables by transport-triggered operation in sniff reg
    // set the sniffer calc field to add (atomic SET of the calc bits)
    build_block(&sniff_calc_mask,  DMA_SNIFF_CTRL_SET, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN) ;
    //
    // === load var A to sniff data reg: dma_hw->sniff_data
    build_block(&dma_var_A,  &dma_hw->sniff_data, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN) ;
    // == pass var B thru the sniffer to the bit_bucket (sniffer now holds A+B)
     build_block(&dma_var_B,  &bit_bucket, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN | SNIFF_EN) ;
    // == store the sniff data reg back to var C = A+B
    build_block( &dma_hw->sniff_data, &dma_var_C, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN) ;

    //================= ============ 
    // === subtract two variables (A-B) by transport-triggered operation in sniff reg
    // dma_sniffer_  already set to add
    // Compute A-B as A + (NOT(B) + 1)
    // === load B var to sniff data reg: dma_hw->sniff_data
    build_block(&dma_var_B,  &dma_hw->sniff_data, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN) ;
    // invert sniffer data bits using sniffer xor write
    build_block(&sniff_xor_mask,  DMA_SNIFF_DATA_XOR, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN) ;
    // add 1 to form two's complement (i.e. -B)
     build_block(&dma_one,  &bit_bucket, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN | SNIFF_EN) ;
    // == add A by passing A var thru the sniffer to the bit_bucket
     build_block(&dma_var_A,  &bit_bucket, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN | SNIFF_EN) ;
    // == store the sniff data reg back to var_E = A-B
    build_block( &dma_hw->sniff_data, &dma_var_E, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN) ;
    
    //================= ============= 
    // Set up a 'SKIP NEG' which will skip some number of blocks if A is negative
    // register absolute value MUST be less than 2^28
    // load var to sniff_data with BSWAP (sign bit now in the low byte)
    build_block(&dma_var_A,  &dma_hw->sniff_data, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN | BSWAP) ;
    // if abs(var)<pow(2,28) then the upper 4 bits of the low byte are all the same, and are the sign
    // clear all bits except bit 5 (sniff_offset32_mask), for a 2 block skip
    // (bit 4 / sniff_offset16_mask would give a one block skip)
    // this generates a zero if input is positive, and 32 (two 16-byte blocks) if negative
    build_block(&sniff_offset32_mask,  DMA_SNIFF_DATA_CLR, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN) ;
    // add the address of the zero/positive target (read at runtime from
    // jump_compare_zero_addr) to the offset in the sniffer
    build_block(&jump_compare_zero_addr,  &bit_bucket, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN | SNIFF_EN) ;
    // move sniffer data to read addr of DMA0 to force jump to  new location
    // sniffer contains zero_jump_address + possible offset to the negative target
    // push block address to DMA0 block    &dma_hw->sniff_data
    build_block(&dma_hw->sniff_data, &dma_hw->ch[0].read_addr, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_IRQ_QUIET | DMA_EN) ;

    // TARGET for zero offset (positive or zero): write a zero to G, then jump OVER the negative target
    jump_compare_zero_addr = current_block_addr ;
    build_block(&dma_zero, &dma_var_G, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_IRQ_QUIET | DMA_EN) ;   
    // now jump over the negative target  
    build_block(&jump_compare_end_addr, &dma_hw->ch[0].read_addr, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_IRQ_QUIET | DMA_EN) ;  

    // TARGET for skip (negative)- two block offset from jump_compare_zero_addr
    // writes a one to G
    build_block(&dma_one, &dma_var_G, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_IRQ_QUIET | DMA_EN) ;
    // -- just keep going
    // nop (bit_bucket copied onto itself) to jump to from zero target stanza
    jump_compare_end_addr = current_block_addr ;
    build_block(&bit_bucket, &bit_bucket, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_IRQ_QUIET | DMA_EN) ;
    // ========================
   
    // === shift left by transport-triggered operation in sniff reg (A+A)
    // === load var A to sniff data reg: dma_hw->sniff_data
    build_block(&dma_var_A,  &dma_hw->sniff_data, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN) ;
    // == pass var A thru the sniffer again to the bit_bucket  &bit_bucket
     build_block(&dma_var_A, &bit_bucket, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN | SNIFF_EN) ;
    // == store the sniff data reg back to var D
    build_block( &dma_hw->sniff_data, &dma_var_D, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN) ;

    //========================== 
    // === logical shift right by transport-triggered operation in sniff reg
    //     reverse bit order; shift left; reverse bit order
    // === load var A to sniff data reg: dma_hw->sniff_data
    build_block(&dma_var_A,  &dma_hw->sniff_data, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN) ;
    // reverse bit order by reading back to F with bitrev turned on
    build_block(&sniff_rev_mask,  DMA_SNIFF_CTRL_SET, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN) ;
    build_block( &dma_hw->sniff_data, &dma_var_F, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN) ;
    // read bitrev F back into sniffer
    build_block(&dma_var_F,  &dma_hw->sniff_data, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN ) ;
    // == pass var F thru the sniffer to the bit_bucket  to double the bit-reversed data
    build_block(&dma_var_F, &bit_bucket, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN | SNIFF_EN) ;
    // reverse bit order back to original order, but shifted
    // (the reverse flag is still set while the sniffer is read here)
    // == store the sniff data reg back to var F
    build_block( &dma_hw->sniff_data, &dma_var_F, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN) ;
     // !!turn off reverse flag!! so later stanzas read the sniffer unreversed
     build_block(&sniff_rev_mask,  DMA_SNIFF_CTRL_CLR, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN) ;

    //============================== 
    // === conditional skip based on small integer values
    // the dma_flag variable can take only 0 - 3 values
    // == clear the sniffer (load zero) 
    build_block(&dma_zero, &dma_hw->sniff_data, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN ) ;
    // mult dma_flag by 16 (the byte size of one block): transfer count is 16
    // with no read increment, so dma_flag is added into the sniffer 16 times
    build_block(&dma_flag, &bit_bucket, 16, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN | SNIFF_EN) ;
    // form target block address by adding jump_skip_zero_addr;
    // see below for where jump_skip_zero_addr is assigned
    build_block( &jump_skip_zero_addr,  &bit_bucket, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN | SNIFF_EN) ;
    // move sniffer data to read addr of DMA0 to force next read from new location
    // sniffer contains zero_jump_address + 16*dma_flag
    // push block address to DMA0 block    
    build_block(&dma_hw->sniff_data, &dma_hw->ch[0].read_addr, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_IRQ_QUIET | DMA_EN) ;
    
    // === TARGET if dma_flag == 0
    // (execution falls through from each target into the blocks after it)
    jump_skip_zero_addr = current_block_addr ;
    // === set pin
    build_block(&pin_on, &iobank0_hw->io[2].ctrl, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN) ;    
    // TARGET if dma_flag == 1
    // === set pin
    build_block(&pin_on, &iobank0_hw->io[2].ctrl, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN) ; 
    // TARGET if dma_flag==2
    // == === clear the pin
    build_block(&pin_off,  &iobank0_hw->io[2].ctrl, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN) ;
    //================= ======= 

    // = ===  unconditional jump to start of program
    // (this is also the block reached when dma_flag == 3)
    // push the DMA_blocks[0] address into the program counter (DMA0 read pointer)
    // !!NOTE that this block throttles the machine to the frequency of Timer 3 !!
    // Timer 3 is set at the top of this function with fraction 1/1250
    // (100 KHz per the comment there). The original note says this set of
    // blocks takes about 2.5 uSec to execute, so the channels are used about
    // 25% of the time.
    // To run at full DMA speed, change to DREQ_FORCE
    build_block(&DMA_blocks_addr, &dma_hw->ch[0].read_addr, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_DMA_TIMER3) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_IRQ_QUIET | DMA_EN) ;

    // === END OF DMA PROGRAM ===
    //
    // ======================================================
    // execution machine -- no part of program below here!
    // ======================================================
    // === DMA2 module to reset write address of dma0
    // always set to  DMA1 read-address register (the start of channel 1's
    // register block, where the next block image must be written)
    dma_channel_config c2 = dma_channel_get_default_config(2);
    channel_config_set_transfer_data_size(&c2, DMA_SIZE_32);
    channel_config_set_read_increment(&c2, false);
    channel_config_set_write_increment(&c2, false);
    channel_config_set_irq_quiet(&c2, true);
    channel_config_set_enable(&c2, true); 
    // when done, restart the fetch channel
    channel_config_set_chain_to(&c2, 0) ;
    // unpaced, full speed transfer
    channel_config_set_dreq(&c2,  DREQ_FORCE);
    //
    dma_channel_configure(2, &c2, 
        &dma_hw->ch[0].write_addr , // reset the chan 0 to write to channel 1
        DMA1_addr_ptr ,  // read_addr, pointer to address of channel 1
        1, // one words per DMA block
        false) ; // not triggered here; the program blocks chain to channel 2

    // === The DMA0 fetch module
    // this is the program counter and fetch unit
    // when this code is executed it starts the DMAcpu machine!
    // NOTE that this module could be paced by a timer or other TREQ 
    //    for DDS, or other operations
   dma_channel_config c0 = dma_channel_get_default_config(0);
    channel_config_set_transfer_data_size(&c0, DMA_SIZE_32);
    // read increments through the program; write increments through
    // channel 1's four control registers
    channel_config_set_read_increment(&c0, true);
    channel_config_set_write_increment(&c0, true);
    channel_config_set_irq_quiet(&c0, true);
    channel_config_set_enable(&c0, true); 
    channel_config_set_chain_to(&c0, 1) ;
    // unpaced, full speed transfer
    channel_config_set_dreq(&c0,  DREQ_FORCE);
    //
    dma_channel_configure(0, &c0, 
        &dma_hw->ch[1].read_addr , // write to dma channel 1
        DMA_blocks_addr ,  // read_addr, DMA blocks list
        4, // four words per DMA block
        true) ; // triggered to start machine running
}

// ==================================================
// === user's serial input thread
// ==================================================
// serial_read and serial_write do not block any thread
// except this one
// User serial thread: prompts for "A B flag", hands the values to the DMA
// machine, waits briefly, then prints the results the DMA program computed.
//   pt : protothread control structure supplied by the scheduler
// Never returns from its loop.
//
// The DMA program runs continuously and uses dma_flag as a block offset for
// a computed jump, so dma_flag must never hold an out-of-range value, not
// even momentarily. Input is therefore parsed into staging variables,
// range-checked, and only then copied to the shared DMA variables.
static PT_THREAD (protothread_serial(struct pt *pt))
{
    PT_BEGIN(pt);
      // staging copies of the user input
      // (static, as is usual for protothread locals)
      static int in_A, in_B, in_flag ;
      //
      while(1) {
        // print prompt
        sprintf(pt_serial_out_buffer, "enter A, B, flag: ");
        // spawn a thread to do the non-blocking write
        serial_write ;

        // spawn a thread to do the non-blocking serial read
         serial_read ;
        // start from the current values so that any field the user
        // left out keeps its previous value
        in_A = dma_var_A ;
        in_B = dma_var_B ;
        in_flag = dma_flag ;
        // convert input string to numbers
        sscanf(pt_serial_in_buffer,"%d %d %d", &in_A, &in_B, &in_flag) ;
        // flag selects one of 4 jump targets; anything else becomes 0
        if((in_flag<0) || (in_flag>3)) in_flag = 0 ;
        // publish the validated values to the DMA machine
        dma_var_A = in_A ;
        dma_var_B = in_B ;
        dma_flag = in_flag ;

        // give the DMA program time to run with the new inputs
       PT_YIELD_usec(50) ;
        printf("A+B=%d  A-B=%d  (A<<1)=%d  (A>>1)=%d  (A<0)=%d\n\r", 
            dma_var_C, dma_var_E, dma_var_D, dma_var_F, dma_var_G) ;
        // NEVER exit while
      } // END WHILE(1)
  PT_END(pt);
} // serial thread

// ==================================================
// === toggle25 thread 
// ==================================================
// the on-board LED blinks
// Heartbeat thread: configures GPIO 25 as an output, drives it high, then
// inverts it every 0.1 second forever.
//   pt : protothread control structure supplied by the scheduler
static PT_THREAD (protothread_toggle25(struct pt *pt))
{
    // GPIO number of the on-board LED
    enum { led_gpio = 25 } ;

    PT_BEGIN(pt);
    // level most recently chosen by the loop (static: must survive yields)
    static bool led_level = false ;

    // one-time pin setup; the LED starts out lit
    gpio_init(led_gpio) ;
    gpio_set_dir(led_gpio, GPIO_OUT) ;
    gpio_put(led_gpio, true);

    for (;;) {
        // sleep for 100000 usec
        PT_YIELD_usec(100000) ;
        // flip the remembered level and drive the pin with it
        led_level = !led_level ;
        gpio_put(led_gpio, led_level);
    } // loop never ends
  PT_END(pt);
} // blink thread


// ========================================
// === core 1 main -- started in main below
// ========================================
// Entry point for core 1. No threads are added for this core; it only
// starts the protothreads scheduler. Not currently launched: the
// multicore_launch_core1() call in main() is commented out.
// Declared with (void), matching DMA_machine(void) in this file.
void core1_main(void){ 
  
  //  === add threads  ====================
  // for core 1
  // none
  // === initialize the scheduler ==========
  pt_schedule_start ;
  // NEVER exits
  // ======================================
}

// ========================================
// === core 0 main
// ========================================
// Program entry on core 0: bring up stdio, build and start the DMA machine,
// register the serial and LED threads, then hand control to the
// protothreads scheduler (which does not return).
// Declared with (void), matching DMA_machine(void) in this file.
int main(void){
  // start the serial i/o
  stdio_init_all() ;
  // announce the threader version on system reset
  printf("\n\rProtothreads RP2040 v1.1 two-core\n\r");

/*
// Initialize SPI channel (channel, baud rate set to 20MHz)
// connected to spi DAC
spi_init(SPI_PORT, 20000000) ;
// Format (channel, data bits per transfer, polarity, phase, order)
spi_set_format(SPI_PORT, 16, 0, 0, 0);
// Map SPI signals to GPIO ports
//gpio_set_function(PIN_MISO, GPIO_FUNC_SPI);
gpio_set_function(PIN_SCK, GPIO_FUNC_SPI);
gpio_set_function(PIN_MOSI, GPIO_FUNC_SPI);
gpio_set_function(PIN_CS, GPIO_FUNC_SPI) ;
*/
  
  // turn on the DMA coprocessor; from here on the DMA program runs
  // continuously in the background
  DMA_machine() ;

  // start core 1 threads (disabled)
  //multicore_reset_core1();
  //multicore_launch_core1(&core1_main);

  // === config threads ========================
  // for core 0
  pt_add_thread(protothread_serial);
  pt_add_thread(protothread_toggle25);
  //
  // === initialize the scheduler ===============
  pt_schedule_start ;
  // NEVER exits
  // ===========================================
} // end main
///////////
// end ////
///////////