 /*
 This version executes timed output of sine wave to SPI DAC
 to make DDS. 
 Algorithm:
 dds_accum += dds_inc (32 bits)
 high byte of dds_accum is index into sine table
 Use DMA BSWAP to move it to low byte, then clear upper bytes
 double the index (table entries are 2 bytes) and add pointer to sine_table
 2-byte transfer from sine table to SPI_data
 where:
 dds_inc = Fout * pow(2,32 )/ Fs
 with Fs = 2e5 here.

 --- DMA machine (DMAcpu) --- DMA-CpU
 This is a fetch-execute cpu where the 'fetch' function is done by one DMA block, which
 loads DMA control block images from RAM into another 'execution' DMA block. The 'program' which is
 loaded consists of a carefully crafted series of memory copy commands which together
 act as a general purpose computer. The design is made easier by several transport-
 triggered actions in the DMA subsystem. These include an adder in the 'channel sniffer'
 and atomic SET/CLEAR/XOR write functions on all SFRs, including the sniffer.

 No ARM cpu resources are used when the DMAcpu is running. Other DMA channels can
 be used as needed, including to build another copy of the DMAcpu.

# ====================================
# === Register write functions =======
# Each peripheral register block is allocated 4kB of address space,
# with registers accessed using one of 4 methods,selected by
# address decode.
# Addr + 0x0000 : normal read write access
# Addr + 0x1000 : atomic XOR on write
# Addr + 0x2000 : atomic bitmask set on write
# Addr + 0x3000 : atomic bitmask clear on write

======================================
DMA channels can only get to GPIO through PAD  override registers
//force enable output and output 1
iobank0_hw->io[2].ctrl = 0x3300 ;
// force enable and output zero
iobank0_hw->io[2].ctrl = 0x3200 ;

=== DAC ===
GPIO 5 (pin 7) Chip select -- spi hardware runs this without user code
GPIO 6 (pin 9) SCK/spi0_sclk
GPIO 7 (pin 10) MOSI/spi0_tx
3.3v (pin 36) -> VCC on DAC 
GND (pin 3)  -> GND on DAC

*/

#include "hardware/gpio.h"
#include "hardware/timer.h"
#include "hardware/spi.h"
#include "pico/stdlib.h"
#include "hardware/uart.h"
#include "stdio.h"
#include <string.h>
#include <pico/multicore.h>
#include "hardware/sync.h"
#include "hardware/dma.h"
#include "hardware/gpio.h"
#include "math.h"
#include "hardware/structs/iobank0.h"

// ==========================================
// === protothreads setup
// ==========================================
// protothreads header
#include "pt_cornell_rp2040_v1.1.h"

// === state shared between the cpu threads and the DMA engine
// volatile because the DMA hardware can modify memory
// constant zero source word -- never modify
volatile int dma_var_0 = 0 ;
// constant one source word -- never modify
volatile int dma_var_1 = 1 ;
// flag written by the cpu
volatile int dma_flag = 0 ;

// --- DDS state ---
// 32-bit phase accumulator
unsigned int dds_accum = 0 ;
// phase step added per sample: Fout * 2^32 / Fs
// startup value is for 400 Hz at a 200 kHz sample rate
unsigned int dds_inc = 400ULL * 4294967296ULL / 200000ULL ;
// mask covering the three upper bytes, used to clear them in the sniffer
unsigned int clear_high_bytes = 0xffffff00u ;
// requested output frequency
float frequency = 400 ;
// sample rate
float Fs = 2e5 ;
// 256-entry sine lookup table
short sine_table[256] ;
// base address of the table, held in a variable
short * sine_table_addr = sine_table ;
// control bits that tell the SPI DAC what to do;
// prepended to each 12-bit sample
#define DAC_config_chan_A 0x3000

// ========================================
// === spi setup
// =======================================
// SPI configuration: GPIO numbers for the link to the DAC
// chip select -- the spi hardware runs this without user code
#define PIN_CS   5
// serial clock (spi0 SCK)
#define PIN_SCK  6
// data out to the DAC (spi0 TX)
#define PIN_MOSI 7
// SPI peripheral instance connected to the DAC
#define SPI_PORT spi0

// ==========================================
// === DMA machine setup
// ==========================================
// number of 32-bit words in one DMA control block image
#define length_of_block 4
// capacity of the DMA program, in blocks
#define max_blocks 100
// the DMA program: control block images stored back to back
int DMA_blocks[max_blocks * length_of_block];
// address of the first block image, held in a variable
int * DMA_blocks_addr = DMA_blocks ;
// index of the next block image to be created
int N = 0;
// /dev/null -- write target for data that is not wanted
int bit_bucket ;

// Name a jump target: evaluates to the index of the block that will be
// built next,  e.g. label1=branch_label() ;
#define branch_label() N 
// convert a saved label (a block index) to the address of that block image
#define branch_label_to_addr(label) ( DMA_blocks + (label)*length_of_block )
// address of the image for an arbitrary block index
#define block_addr(index)    (&DMA_blocks[length_of_block*(index)])
// address of the block about to be built
#define current_block_addr block_addr(N)
// addresses of the blocks 1, 2, 3 and 4 positions further on
#define next_block_addr  block_addr(N+1)
#define next_block2_addr block_addr(N+2)
#define next_block3_addr block_addr(N+3)
#define next_block4_addr block_addr(N+4)

// Macro to append one DMA control block image to the program.
// inputs: read addr, write addr, count, ctrl word (4 bytes each)
// Stores the four words, in that order, in the DMA_blocks array at
// DMA_blocks[4*N] .. DMA_blocks[4*N+3] (4 words == one block image),
// then increments the block count N.
// Each argument is parenthesized before it is cast/stored, so a compound
// argument such as (table + 1) is converted as a whole rather than having
// the cast bind to its first operand only.
// Arguments that mention N (e.g. next_block_addr) are evaluated before N is
// incremented. No bounds check is made: keep N below max_blocks.
#define build_block(read_addr, write_addr, count, ctrl) \
    do { \
        DMA_blocks[4*N]   = (int)(read_addr); \
        DMA_blocks[4*N+1] = (int)(write_addr); \
        DMA_blocks[4*N+2] = (int)(count); \
        DMA_blocks[4*N+3] = (int)(ctrl); \
        N++ ; \
    } while(0)

// Macros for building the DMA channel control word, one per bit field.
// This duplicates some of the SDK 
// but i like having it here
// Every macro argument is wrapped in parentheses so that a compound
// argument such as (a | b) is masked and shifted as a single value.
// data_width is 0==byte 1==short 2==int
#define DMA_DATA_WIDTH(data_width) (((data_width) & 0x03)<<2) 
// give this channel more access if several channels are on
#define DMA_HIGH_PRI  (1<<1) 
// turn on the channel
#define DMA_EN  1  
// increment (rather than keep constant) the write and read address;
// keeping an address constant is useful for peripheral write/read
#define DMA_WR_INC  (1<<5) 
#define DMA_RD_INC  (1<<4) 
// enable this channel for sniffer
#define SNIFF_EN (1<<23)
// reverse order of bytes when channel is transmitting
#define BSWAP (1<<22)
// turn off channel done IRQ
#define DMA_IRQ_QUIET  (1<<21)
// More trigger info page 114
// 0x0 to 0x3a == select DREQ n as TREQ from table above
// 0x3b == Select Timer 0 as TREQ
// 0x3c == Select Timer 1 as TREQ
// 0x3d == Select Timer 2 as TREQ (Optional)
// 0x3e == Select Timer 3 as TREQ (Optional)
// 0x3f == Permanent request, for unpaced transfers.
// bits 15:20 trigger request source
#define DMA_TREQ(trigger_source)  (((trigger_source) & 0x3f)<<15) 
// When this channel completes, it will trigger the channel
// indicated by CHAIN_TO. Disable by setting CHAIN_TO = (this channel).
// bits 11:14 next channel #
#define DMA_CHAIN_TO(next_ch) (((next_ch) & 0x0f)<<11) 

// DMA sniffer atomic write operations 
// used to simulate accumulator logic operations
// and add and CRC32
// sniffer can also compute CRC, parity, and bit-inversion
// sniffer calculation mode selectors
#define sniffer_add 0x0f
#define sniffer_crc32 0x00
// atomic-access aliases of the sniffer data register:
// DMA base 0x50000000 + register offset 0x438, plus
// 0x1000 (XOR), 0x2000 (bitmask set) or 0x3000 (bitmask clear)
#define DMA_SNIFF_DATA_SET   (0x2438 + 0x50000000)
#define DMA_SNIFF_DATA_XOR   (0x1438 + 0x50000000)  
#define DMA_SNIFF_DATA_CLR   (0x3438 + 0x50000000)
// logic:
// load A then set using B as mask implements A OR  B
// load B then CLR bits using NOT(A) as a mask implements A AND B
// load A then XOR bits using B as mask implements A XOR B
// load A then XOR bits with 0xffffffff implements NOT A

// Global constants used as source words by the DMA program.
// Values for the gpio 2 control register (iobank0_hw->io[2].ctrl):
// 0x3300 forces the output enabled and high, 0x3200 forces it enabled and low
int pin_on = 0x3300 ; // source word of the "set the pin" block
int pin_off = 0x3200 ; // source word of the "clear the pin" block
// The address of the DMA channel 1 read-address register
int DMA1_addr = (DMA_BASE + DMA_CH1_READ_ADDR_OFFSET) ;
//int DMA1_addr = dma_hw->ch[1].read_addr ;
// pointer to the variable above; DMA channel 2 uses it as its read address
int * DMA1_addr_ptr = &DMA1_addr ;

// Build the DMA program in the DMA_blocks array, configure the helper
// channels, and start the DMA machine.
//
// Channel roles, as configured below:
//   DMA0 -- fetch unit / program counter: copies one 4-word block image from
//           DMA_blocks into the channel 1 registers, starting at read_addr
//   DMA1 -- execution unit: runs the block it was loaded with; every block's
//           control word chains to channel 2
//   DMA2 -- writes the channel 1 register address back into the channel 0
//           write address, then chains to channel 0 for the next fetch
// Takes no arguments and returns nothing. Channel 0 is triggered by the last
// call in this function, so the machine is running when it returns.
void DMA_machine(void){
    // init block count
    N = 0 ; 

    // pacing timer: rate is system clock * 1/625
    // 100 KHz: set denom to 125000000/100000 = 1250
    // 200 KHz: 625 (used here). DDS takes 1.3 uSec every 5 uSec, so 25%
    // 500 KHz: 250 will run at this speed but accuracy drops a little
    dma_timer_set_fraction ( 3, 1, 625) ;

    // enable sniffer add operation for DMA channel 1 
    // -- done once, when this function runs
    // this locks the sniffer to chan 1, but DMA1 has to enable it on a
    // per-block basis (SNIFF_EN in the block's control word)
    dma_sniffer_enable(1, sniffer_add, true);

    // ================================================
    // define blocks to be executed
    // Build the DMA program
    // ================================================
    // build_block(read_addr, write_addr, count, ctrl)
    // ================================================
    // Each of these DMA control blocks will be loaded by
    // the DMA0 fetch channel into DMA1, then the data will be moved
    // -then the DMA0 channel is fixed and restarted to fetch the next block
    // NOTE: the order of the build_block calls IS the program order

    // Pulse on
    // == === set the pin (gpio 2 control register) --
    build_block(&pin_on, &iobank0_hw->io[2].ctrl, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN) ;

    // === add  dds_accum and dds_inc by transport-triggered operation in sniff reg
    // first load dds_accum into the sniffer data register
    build_block(&dds_accum,  &dma_hw->sniff_data, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN) ;
    // pass dds_inc thru the sniffer to the bit_bucket
    // (SNIFF_EN is set, so the sniffer adds the word going past)
     build_block(&dds_inc,  &bit_bucket, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN | SNIFF_EN) ;
    // store the sniff data reg back to dds_accum
    build_block( &dma_hw->sniff_data, &dds_accum, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN) ;

    // form pointer to current sine table entry
    // reload dds_accum BUT byte reversed! see BSWAP
    // (this moves the high byte, the table index, to the low byte)
    build_block(&dds_accum,  &dma_hw->sniff_data, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN | BSWAP) ;
    // clear high bytes -- leave low byte alone
    // (written through the atomic bitmask-clear alias of the sniffer data reg)
     build_block(&clear_high_bytes, DMA_SNIFF_DATA_CLR, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN ) ;
    // mult by 2 for 'short' array pointer by adding sniffer to itself
    build_block( &dma_hw->sniff_data, &bit_bucket, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN | SNIFF_EN) ;
    // add to sine table base address
    build_block(&sine_table_addr,  &bit_bucket, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN | SNIFF_EN) ;
   
    // move table entry addr to next block read addr 
    // (self-modifying: overwrites word 0 of the block image built next)
    build_block(&dma_hw->sniff_data,  next_block_addr, 1,
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN ) ;
        
    // move sine table to spi -- spi0_hw->dr
    // the read address given here is only a placeholder; the block above
    // replaces it with the current table entry address on every pass
    // NOTE that the CS line is driven automatically by this write
    build_block(sine_table_addr,  &spi0_hw->dr, 1,
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_16) | DMA_EN ) ;
    
    // == === clear the pin
    build_block(&pin_off,  &iobank0_hw->io[2].ctrl, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_EN) ;
        
    // = ===  unconditional jump to start of program
    // push the DMA_blocks[0] address into the program counter (DMA0 read pointer)
    // !!NOTE that this block throttles the machine to the frequency of Timer 3 !!
    // Timer 3 is set above with a 1/625 fraction (200 KHz per the note there),
    // so the program runs once per timer tick.
    // To run at full DMA speed, change to DREQ_FORCE
    build_block(&DMA_blocks_addr, &dma_hw->ch[0].read_addr, 1, 
        DMA_CHAIN_TO(2) | DMA_TREQ( DREQ_DMA_TIMER3) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_IRQ_QUIET | DMA_EN) ;

    // === END OF DMA PROGRAM ===
    //
    // ======================================================
    // execution machine -- no part of program below here!
    // ======================================================
    // === DMA2 module to reset write address of dma0
    // always sets it to the address of the DMA1 read-address register
    // (the value stored in DMA1_addr)
    dma_channel_config c2 = dma_channel_get_default_config(2);
    channel_config_set_transfer_data_size(&c2, DMA_SIZE_32);
    channel_config_set_read_increment(&c2, false);
    channel_config_set_write_increment(&c2, false);
    channel_config_set_irq_quiet(&c2, true);
    channel_config_set_enable(&c2, true); 
    // when done, restart the fetch channel
    channel_config_set_chain_to(&c2, 0) ;
    // unpaced, full speed transfer
    channel_config_set_dreq(&c2,  DREQ_FORCE);
    //
    dma_channel_configure(2, &c2, 
        &dma_hw->ch[0].write_addr , // write_addr: reset chan 0 to write to channel 1
        DMA1_addr_ptr ,  // read_addr, pointer to address of channel 1
        1, // one word per transfer
        false) ; // not triggered here; the blocks run by channel 1 chain to it

    // === The DMA0 fetch module
    // this is the program counter and fetch unit
    // when this code is executed it starts the DMAcpu machine!
    // NOTE that this module could be paced by a timer or other TREQ 
    //    for DDS, or other operations
   dma_channel_config c0 = dma_channel_get_default_config(0);
    channel_config_set_transfer_data_size(&c0, DMA_SIZE_32);
    // both addresses step: 4 consecutive words of a block image go to
    // 4 consecutive channel 1 registers
    channel_config_set_read_increment(&c0, true);
    channel_config_set_write_increment(&c0, true);
    channel_config_set_irq_quiet(&c0, true);
    channel_config_set_enable(&c0, true); 
    channel_config_set_chain_to(&c0, 1) ;
    // unpaced, full speed transfer
    channel_config_set_dreq(&c0,  DREQ_FORCE);
    //
    dma_channel_configure(0, &c0, 
        &dma_hw->ch[1].read_addr , // write to dma channel 1
        DMA_blocks_addr ,  // read_addr, DMA blocks list
        4, // four words per DMA block
        true) ; // triggered to start machine running
}

// ==================================================
// === user's serial input thread
// ==================================================
// Prompts for an output frequency ("Fout: "), reads a line, and converts the
// number to the DDS phase increment  dds_inc = Fout * 2^32 / Fs.
// Input that is not a number, is negative, or is >= Fs is ignored and the
// previous frequency stays in effect (such values cannot be represented in
// the 32-bit increment).
// serial_read and serial_write do not block any thread
// except this one
static PT_THREAD (protothread_serial(struct pt *pt))
{
    PT_BEGIN(pt);
      // candidate value parsed from the input line; kept static, following
      // the usual protothreads convention for thread locals
      static float requested_hz ;
      //
      while(1) {
        // print prompt
        sprintf(pt_serial_out_buffer, "Fout: ");
        // spawn a thread to do the non-blocking write
        serial_write ;

        // spawn a thread to do the non-blocking serial read
        serial_read ;
        // convert input string to number; commit it only if it parsed and
        // the resulting increment fits in 32 bits (NaN fails both compares)
        if (sscanf(pt_serial_in_buffer,"%f ", &requested_hz) == 1
            && requested_hz >= 0.0f && requested_hz < Fs) {
          frequency = requested_hz ;
          // compute the DDS increment for the user's frequency.
          // The whole product is converted, so fractional Hz are kept.
          dds_inc = (unsigned int)((double)frequency * 4294967296.0 / Fs) ;
        }
        // NEVER exit while
      } // END WHILE(1)
  PT_END(pt);
} // serial thread

// ==================================================
// === toggle25 thread 
// ==================================================
// Blinks the on-board LED (gpio 25): the level is inverted after every
// 0.1 second yield.
static PT_THREAD (protothread_toggle25(struct pt *pt))
{
    PT_BEGIN(pt);
    // level most recently chosen for the LED
    static bool led_on = false ;

    // make gpio 25 an output and start with it driven high
    gpio_init(25) ;
    gpio_set_dir(25, GPIO_OUT) ;
    gpio_put(25, true);

    for(;;) {
        // give up the cpu for 100000 uSec
        PT_YIELD_usec(100000) ;
        // flip the stored level and drive the pin with it
        led_on = !led_on ;
        gpio_put(25, led_on);
        // loop never exits
    }
  PT_END(pt);
} // blink thread


// ========================================
// === core 1 main
// ========================================
// Entry point meant for core 1. It adds no threads of its own and only
// starts the scheduler. NOTE: the call that launches it from main() is
// currently commented out.
void core1_main()
{
  // no threads are added for core 1

  // start the scheduler -- does not return
  pt_schedule_start ;
}

// ========================================
// === core 0 main
// ========================================
int main(){
  // start the serial i/o
  stdio_init_all() ;
  // announce the threader version on system reset
  printf("\n\rProtothreads RP2040 v1.1 two-core\n\r");

// Initialize SPI channel (channel, baud rate set to 20MHz)
// connected to spi DAC
spi_init(SPI_PORT, 20000000) ;
// Format (channel, data bits per transfer, polarity, phase, order)
spi_set_format(SPI_PORT, 16, 0, 0, 0);
// Map SPI signals to GPIO ports
//gpio_set_function(PIN_MISO, GPIO_FUNC_SPI);
gpio_set_function(PIN_SCK, GPIO_FUNC_SPI);
gpio_set_function(PIN_MOSI, GPIO_FUNC_SPI);
gpio_set_function(PIN_CS, GPIO_FUNC_SPI) ;

  // dds table 12 bit values
  // with high order bits set to DAC control word
  int i = 0 ;
  for(int i=0; i<256; i++) {
    // sine table is in naural +1/-1 range
    sine_table[i] = (int)(2000 + 2000 * sin(2*3.1416*i/256)) | DAC_config_chan_A ;
  }

  // turn on the coprocessor   
  DMA_machine() ;

  // start core 1 threads
  //multicore_reset_core1();
  //multicore_launch_core1(&core1_main);

  // === config threads ========================
  // for core 0
  pt_add_thread(protothread_serial);
  pt_add_thread(protothread_toggle25);
  //
  // === initalize the scheduler ===============
  pt_schedule_start ;
  // NEVER exits
  // ===========================================
} // end main
///////////
// end ////
///////////