/**
 * This version incorporates the DMAcpu code to compute very fast random numbers
 * 
 * Hunter Adams (vha3@cornell.edu)
 * converted to 320x240 with 256 colors by Bruce; brue.land@cornell.edu
 * 
 * HARDWARE CONNECTIONS
 *  - GPIO 16 ---> VGA Hsync
 *  - GPIO 17 ---> VGA Vsync
 * 
 *  - GPIO 08 ---> 330 ohm resistor ---> VGA Blue lo-bit |__ both wired to 150 ohm to ground 
 *  - GPIO 09 ---> 220 ohm resistor ---> VGA Blue hi_bit |   and to VGA blue
 * 
 *  - GPIO 10 ---> 1000 ohm resistor ---> VGA Green lo-bit |__ three wired to 100 ohm to ground
 *  - GPIO 11 ---> 680 ohm resistor ---> VGA Green mid_bit |   and to VGA Green
 *  - GPIO 12 ---> 330 ohm resistor ---> VGA Green hi_bit  |   
 * 
 *  - GPIO 13 ---> 1000 ohm resistor ---> VGA Red lo-bit |__ three wired to 100 ohm to ground
 *  - GPIO 14 ---> 680 ohm resistor ---> VGA Red mid_bit |   and to VGA red
 *  - GPIO 15 ---> 330 ohm resistor ---> VGA Red hi_bit  |   
 * 
 *  - RP2040 GND ---> VGA GND
 *
 * RESOURCES USED
 *  - PIO state machines 0 to 3 on PIO instance 0
 *  - DMA channels 0, 1, 2, 3 data send to two PIO
 *  - 76.8 kBytes of RAM (for pixel color data)
 * color encoding: bits 7:5 red; 4:2 green; 1:0 blue
 *
 * Protothreads v1.1.1
 * graphics demo thread
 * serial thread to set the distribution parameters
 * the usual blinky thread for a heartbeat
 */
// ==========================================
// === VGA graphics library
// ==========================================
#include "vga256_graphics.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h> 
#include <math.h>
#include "pico/stdlib.h"
#include "hardware/pio.h"
#include "hardware/dma.h"

//===========================================
// === DMAcpu
//===========================================
#include "hardware/gpio.h"
#include "hardware/timer.h"
#include "hardware/spi.h"
#include "pico/stdlib.h"
#include "hardware/uart.h"
#include "hardware/gpio.h"
#include "hardware/structs/iobank0.h"
#include "hardware/regs/rosc.h"

// ==========================================
// === protothreads globals
// ==========================================
#include "hardware/sync.h"
#include "hardware/timer.h"
#include "pico/multicore.h"
#include "string.h"
#include "math.h"
// protothreads header
#include "pt_cornell_rp2040_v1_1_1.h"
// interactive color
// Flag written by the serial thread each time a line of user input is read.
int new_value = false ;

// ==========================================
// === DMA machine setup
// ==========================================
// === global thread + DMA communication
// These are volatile because the DMA engine reads/writes them
// without the compiler's knowledge.
// a constant-zero source word for DMA reads -- don't change
volatile int dma_zero = 0 ;
// a constant-one source word for DMA reads -- don't change
volatile int dma_one = 1 ;
// a constant minus-one source word for DMA reads
volatile int dma_neg_one = -1 ;

// noise vars
volatile int dma_noise_temp = 0 ;
// most recent sniffer value copied out by the DMA program
volatile int dma_rand_out = 0;
volatile int dma_sniff_temp = 0 ;

// signal to user thread: the DMA program writes dma_one here after it
// refreshes dma_rand_out; the consumer clears it
volatile int dma_flag = 0 ;

// ========================================
// === spi setup 
// =======================================
// SPI configuration: GPIO numbers for each signal and the SPI instance
#define PIN_CS   5
#define PIN_SCK  6
#define PIN_MOSI 7
#define SPI_PORT spi0

// constant to tell SPI DAC what to do
// prepend to each 12-bit sample (occupies the top 4 bits of the 16-bit word)
#define DAC_config_chan_A 0b0011000000000000

// ========================================
// === DMAcpu setup 
// =======================================
//  DMA control block size in words:
#define length_of_block 4
// maximum number of control blocks in the main DMA program list
#define max_blocks 100
// storage for the DMA program: max_blocks blocks of length_of_block words
int DMA_blocks[max_blocks * length_of_block];
// base address of the DMA block list; the address of THIS variable is used
// as a DMA read source when the program jumps back to its first block
 int * DMA_blocks_addr = &DMA_blocks[0] ;

// counter for the current block to create (advanced by build_block)
int N = 0;
// /dev/null - a data sink
int bit_bucket ;

// make it easier to change channels
// so that the machine will work with other software, like video
#define fetch_chan 9
#define execute_chan 10
#define fix_chan 11

// address of current block
#define current_block_addr (&DMA_blocks[4*(N)])
// address of the next block
#define next_block_addr (&DMA_blocks[4*(N+1)])
// address of the block 2 ahead
#define next_block2_addr (&DMA_blocks[4*(N+2)])
// address of arbitrary block
// (the macro parameter N shadows the global block counter inside the expansion)
#define block_addr(N)    (&DMA_blocks[4*(N)])

// macro to build a DMA block
// inputs: read addr, write addr, count, ctrl word (4 bytes each)
// Stores a 16 byte channel control block image in the DMA_blocks array at
// the current block count, N, then increments the block count N:
//   DMA_blocks[4*N+0] = read address
//   DMA_blocks[4*N+1] = write address
//   DMA_blocks[4*N+2] = transfer count
//   DMA_blocks[4*N+3] = control word
// Every argument is parenthesized so that expressions such as `ptr + 1`
// or `a | b` are evaluated as a whole before the cast/assignment.
// Addresses are converted through uintptr_t so pointer and integer
// arguments are both accepted.
// NOTE: there is no bounds check against the size of DMA_blocks; the caller
// must not build more blocks than the array can hold.
#define build_block(read_addr, write_addr, count, ctrl) \
    do { \
        DMA_blocks[4*N]   = (int)(uintptr_t)(read_addr); \
        DMA_blocks[4*N+1] = (int)(uintptr_t)(write_addr); \
        DMA_blocks[4*N+2] = (count); \
        DMA_blocks[4*N+3] = (ctrl); \
        N++ ; \
    } while(0)

// macros for defining the DMA CTRL word bits
// --This duplicates some of the SDK --
// but i like having it here
// Function-like macros parenthesize their argument so that an expression
// such as (a | b) is masked as a whole.
// data_width is 0==byte 1==short 2==int (bits 3:2)
#define DMA_DATA_WIDTH(data_width) (((data_width) & 0x03) << 2)
// give this channel more access if several channels are on
#define DMA_HIGH_PRI  (1<<1)
// turn on the channel
#define DMA_EN  1
// increment or keep constant write and read addr
// useful for peripheral write/read
#define DMA_WR_INC  (1<<5)
#define DMA_RD_INC  (1<<4)
// enable this channel for sniffer
#define SNIFF_EN (1<<23)
// reverse order of bytes when channel is transmitting
#define BSWAP (1<<22)
// turn off channel done IRQ
#define DMA_IRQ_QUIET  (1<<21)
// More trigger info page 114
// 0x0 to 0x3a == select DREQ n as TREQ from table above
// 0x3b == Select Timer 0 as TREQ
// 0x3c == Select Timer 1 as TREQ
// 0x3d == Select Timer 2 as TREQ (Optional)
// 0x3e == Select Timer 3 as TREQ (Optional)
// 0x3f == Permanent request, for unpaced transfers.
// bits 20:15 trigger request source
#define DMA_TREQ(trigger_source)  (((trigger_source) & 0x3f) << 15)
// When this channel completes, it will trigger the channel
// indicated by CHAIN_TO. Disable by setting CHAIN_TO = (this channel).
// bits 14:11 next channel #
#define DMA_CHAIN_TO(next_ch) (((next_ch) & 0x0f) << 11)
// default control word for a program block: one unpaced 32-bit transfer,
// no IRQ, chaining to the fix channel so the next block gets fetched
#define STANDARD_CTRL (DMA_CHAIN_TO(fix_chan) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_IRQ_QUIET | DMA_EN)

// DMA sniffer atomic write operations 
// used to simulate accumulator logic operations
// sniffer can also compute add, CRC, parity, and bit-inversion
// calc-mode codes (as passed to dma_sniffer_enable)
#define sniffer_add 0x0f
#define sniffer_crc32 0x00
// Absolute addresses of the sniffer DATA (offset 0x438) and CTRL (offset
// 0x434) registers, through the XOR (+0x1000), SET (+0x2000) and
// CLR (+0x3000) alias regions. 0x50000000 is the RP2040 DMA base address.
#define DMA_SNIFF_DATA_SET   (0x2438 + 0x50000000)
#define DMA_SNIFF_DATA_XOR   (0x1438 + 0x50000000)  
#define DMA_SNIFF_DATA_CLR   (0x3438 + 0x50000000)
#define DMA_SNIFF_CTRL_SET   (0x2434 + 0x50000000)
#define DMA_SNIFF_CTRL_XOR   (0x1434 + 0x50000000)  
#define DMA_SNIFF_CTRL_CLR   (0x3434 + 0x50000000)
// logic:
// load A then set using B as mask implements A OR  B
// load B then CLR bits using NOT(A) as a mask implements A AND B
// load A then XOR bits using B as mask implements A XOR B
// load A then XOR bits with 0xffffffff implements NOT A

// default will be add -- BUT function can be controlled by DMA machine
// these two sniffer control options invert or reverse the bits when WRITING
// sniffer to some location
#define OUT_INV (1<<11)
#define OUT_REV (1<<10)
// calc field is 4 bits 8:5: all bits set is add, zero is CRC32
#define CALC_ADD (0xf<<5)
#define CALC_CRC (0x0<<5)
// The masks below live in RAM so that a DMA block can use them as a read
// source (e.g. writing one to a SET/CLR/XOR alias address above).
int sniff_inv_mask = OUT_INV ;
int sniff_rev_mask = OUT_REV ;
int sniff_calc_mask = 0xf << 5 ;
// invert all bits when using DMA_SNIFF_DATA_XOR
int sniff_xor_mask = 0xffffffff ;
// clear all but a few bits to generate offsets for jumping
// after a BSWAP puts the sign bits into bits 7:4
int sniff_offset16_mask = 0xffffffef ;
int sniff_offset32_mask = 0xffffffdf ;
int sniff_offset48_mask = 0xffffffcf ;
int sniff_dac_data_mask = 0xfffff000 ;
int dac_config_mask = DAC_config_chan_A;

// define gpio2 direct write registers
// NOTE(review): the meaning of these two constants is not evident from this
// file; the original notes relate them to sio_hw->gpio_set / gpio_clr.
int pin_on = 0x3300 ; // sio_hw->gpio_set = pin_on
int pin_off = 0x3200 ; // sio_hw->gpio_clr = pin_off; 0x3200

// The address of the execute channel read-address register.
// NOTE(review): this multiplies the channel number by the channel-1 register
// offset, which is only correct if that offset equals the per-channel
// register stride -- confirm against the datasheet.
int DMA_execute_addr = (DMA_BASE + execute_chan * DMA_CH1_READ_ADDR_OFFSET) ;
// and point to it (used as the read source of the fix channel)
int * DMA_execute_addr_ptr = &DMA_execute_addr ;


// ==========================================
// === Ring Osc random bit RNG
// setup for higher speed oscillator
// ==========================================
// address of the ROSC RANDOMBIT register
volatile uint32_t *rnd_reg = (uint32_t *)(ROSC_BASE + ROSC_RANDOMBIT_OFFSET);

// Reprogram the ring oscillator by direct register writes, in this order:
// divider, control (frequency range), FREQA, FREQB.
// Takes no arguments and returns nothing.
void rosc_setup(void){
  // value placed in the low 16 bits of both FREQA and FREQB
  const uint32_t freq_low_bits = 0xffff ;

  volatile uint32_t * const div_reg   = (uint32_t *)(ROSC_BASE + ROSC_DIV_OFFSET) ;
  volatile uint32_t * const ctrl_reg  = (uint32_t *)(ROSC_BASE + ROSC_CTRL_OFFSET) ;
  volatile uint32_t * const freqa_reg = (uint32_t *)(ROSC_BASE + ROSC_FREQA_OFFSET) ;
  volatile uint32_t * const freqb_reg = (uint32_t *)(ROSC_BASE + ROSC_FREQB_OFFSET) ;

  // divider of one (pass code + 1), used for frequency measurement
  *div_reg = ROSC_DIV_VALUE_PASS + 1 ;

  // speed up the ROSC so more cycles elapse between reads
  // (don't use ROSC_CTRL_FREQ_RANGE_VALUE_TOOHIGH)
  // The original author measured 241 MHz with these settings.
  // Only the frequency-range value is written; no ENABLE code is OR'ed in.
  *ctrl_reg = ROSC_CTRL_FREQ_RANGE_VALUE_HIGH ;

  // password in the upper half-word, all low bits set
  *freqa_reg = (ROSC_FREQA_PASSWD_VALUE_PASS<<16) | freq_low_bits ;
  *freqb_reg = (ROSC_FREQB_PASSWD_VALUE_PASS<<16) | freq_low_bits ;
}

// ==========================================================
// basic random number gen with Von Neumann extractor
// and a small delay in the extractor
// AND one pass through rand() function
// https://en.wikipedia.org/wiki/Randomness_extractor
// ==========================================================
// Collects 32 bits from the ROSC RANDOMBIT register. Each output bit comes
// from a pair of reads that is retried until the two reads differ; the
// first read of the accepted pair is kept. The assembled 32-bit word seeds
// the C library generator with srand(), and the function returns the first
// rand() value after seeding.
// Returns: the rand() result converted to uint32_t.
// Side effect: reseeds the C library random generator.
// Blocks (busy-waits) until 32 differing pairs have been observed.
uint32_t rand_rosc_VN(void){
    volatile uint32_t *random_bit_reg = (uint32_t *)(ROSC_BASE + ROSC_RANDOMBIT_OFFSET);
    // accumulator starts at zero and is unsigned so the shifts are well defined
    uint32_t seed = 0 ;

    for(int k=0; k<32; k++){
      uint32_t first_bit, second_bit ;
      // von Neumann bit extractor
      do {
        first_bit = 0x00000001 & (*random_bit_reg);
        //  a small delay decorrelates the bits
        asm("nop") ; asm("nop") ;
        asm("nop") ; asm("nop") ;
        asm("nop") ; asm("nop") ;
        asm("nop") ; asm("nop") ;
        second_bit = 0x00000001 & (*random_bit_reg);
        // if the two are different, use the first one
      } while(first_bit == second_bit) ;
      // build the 32 bit sample, most significant bit first
      seed = (seed << 1) | first_bit ;
    }
    srand(seed) ;
    return (uint32_t)rand() ;
}

// =======================================================
// set up the fetch/execute, to start the machine
// set machine speed (can be user modified)
// don't mess with this routine, except for timer setting
// =======================================================
// Configures the pacing timer, attaches the sniffer to the execute channel,
// then configures the "fix" channel (untriggered) and finally the "fetch"
// channel (triggered), which starts the machine. The fix channel must be
// configured before the fetch channel is triggered.
void DMA_machine_start(void) {

    // DMA pacing timer 3 is programmed with the fraction 1/50.
    // NOTE(review): earlier notes quoted divisors of 1250 / 625 / 250 for
    // 100 / 200 / 500 kHz at 125 MHz; the value used here differs from those.
    // Can be user modified, or turned off if desired.
    dma_timer_set_fraction ( 3, 1, 50) ;

    // Attach the sniffer to the execute channel in CRC32 mode.
    // Each program block must still set SNIFF_EN in its own control word
    // for its data to be sniffed.
    dma_sniffer_enable(execute_chan, sniffer_crc32, true);

    // ======================================================
    // execution machine -- fetch/execute state machine
    // it is very unlikely that you should modify this
    // ======================================================
    // === fix channel: restores the write address of the fetch channel so
    // that it always points at the execute channel registers, then chains
    // to the fetch channel
    dma_channel_config fix_cfg = dma_channel_get_default_config(fix_chan);
    // unpaced, full speed transfer
    channel_config_set_dreq(&fix_cfg,  DREQ_FORCE);
    channel_config_set_chain_to(&fix_cfg, fetch_chan) ;
    channel_config_set_enable(&fix_cfg, true);
    channel_config_set_irq_quiet(&fix_cfg, true);
    channel_config_set_write_increment(&fix_cfg, false);
    channel_config_set_read_increment(&fix_cfg, false);
    channel_config_set_transfer_data_size(&fix_cfg, DMA_SIZE_32);
    //
    dma_channel_configure(fix_chan, &fix_cfg,
        &dma_hw->ch[fetch_chan].write_addr , // destination: fetch channel write address
        DMA_execute_addr_ptr ,  // source: variable holding the execute channel address
        1, // one word per transfer
        false) ; // not triggered here

    // === The fetch channel
    // this is the program counter and fetch unit
    // when this code is executed it starts the DMAcpu machine!
    // NOTE that this module could be paced by a timer or other TREQ
    //    for DDS, or other operations
    dma_channel_config fetch_cfg = dma_channel_get_default_config(fetch_chan);
    // unpaced, full speed transfer
    channel_config_set_dreq(&fetch_cfg,  DREQ_FORCE);
    channel_config_set_chain_to(&fetch_cfg, execute_chan) ;
    channel_config_set_enable(&fetch_cfg, true);
    channel_config_set_irq_quiet(&fetch_cfg, true);
    channel_config_set_write_increment(&fetch_cfg, true);
    channel_config_set_read_increment(&fetch_cfg, true);
    channel_config_set_transfer_data_size(&fetch_cfg, DMA_SIZE_32);
    //
    dma_channel_configure(fetch_chan, &fetch_cfg,
        &dma_hw->ch[execute_chan].read_addr , // destination: execute channel registers
        DMA_blocks_addr ,  // source: start of DMA blocks list
        4, // four words per DMA block
        true) ; // triggered: the machine starts running
}

// ================================================
// === User written DMA program
// ================================================
// Builds the list of DMA control blocks (the "program") in DMA_blocks,
// starting at block 0. Each control block is loaded by the fetch channel
// into the execute channel, the data is moved, and then the fetch channel
// is fixed and restarted to fetch the next block.
//
// build_block(read_addr, write_addr, count, ctrl)
//
// Program:
//   0: read the ROSC random-bit register into the bit bucket with the
//      sniffer enabled, so the sniffer folds the sample into its data
//   1: copy the sniffer data register to dma_rand_out
//   2: write 1 to dma_flag to signal a new value
//   3: reload the fetch channel read address with the start of the list
//      (unconditional jump to block 0)
void DMA_machine_program(void){
    // init block count
    N = 0 ;

    // Sample the ring-oscillator random bit through the sniffer.
    // rnd_reg already holds the register address, so it is passed as the
    // read address directly; passing &rnd_reg would make the DMA read the
    // pointer variable itself (a constant) instead of the hardware register.
    build_block(rnd_reg, &bit_bucket, 1,  STANDARD_CTRL | SNIFF_EN) ;

    // and save for graphics
    build_block(&dma_hw->sniff_data, &dma_rand_out, 1, STANDARD_CTRL) ;

    // signal the thread that there is a new value
    build_block(&dma_one, &dma_flag, 1, STANDARD_CTRL);

    // === unconditional jump to start of program
    // Here the ADDRESS of DMA_blocks_addr is the read source on purpose: the
    // DMA copies the pointer's value into the fetch channel read address.
    // Runs at full DMA speed with DMA_TREQ(DREQ_FORCE); to pace the loop,
    // change the request source to DREQ_DMA_TIMER3.
    build_block(&DMA_blocks_addr, &dma_hw->ch[fetch_chan].read_addr, 1, 
        DMA_CHAIN_TO(fix_chan) | DMA_TREQ( DREQ_FORCE) | DMA_DATA_WIDTH(DMA_SIZE_32) | DMA_IRQ_QUIET | DMA_EN) ;

    // === END OF DMA PROGRAM ===
}

// Return the latest DMA-generated value ANDed with rand_mask.
// Busy-waits for dma_flag to be raised twice, clearing it each time: the
// first new value is skipped and the value read after the second is used.
//   rand_mask : bit mask applied to dma_rand_out
//   returns   : dma_rand_out & rand_mask
int dma_rand(int rand_mask){
    for (int handshake = 0; handshake < 2; handshake++) {
        // spin until the DMA program signals a fresh value
        while (!dma_flag) { }
        dma_flag = 0 ;
    }
    return dma_rand_out & rand_mask ;
}

// ==================================================
// === graphics demo -- RUNNING on core 0
// ==================================================

// timing scratch variables; only the disabled profiling code refers to them
int start_time, end_time ;
// coordinates of the pixel to plot
short x, y ;

// Protothread that draws a title bar once and then, on every pass, takes two
// values from dma_rand() as an (x, y) coordinate and plots a white pixel
// there if it passes the range check below.
static PT_THREAD (protothread_graphics(struct pt *pt)) {
    PT_BEGIN(pt);
    // the protothreads interval timer
    PT_INTERVAL_INIT() ;

    // background
    fillRect(0, 0, 319, 239, BLACK); // 

    // Draw some filled rectangles
    fillRect(0, 0, 76, 10, BLUE); // blue box
    fillRect(100, 0, 150, 10, WHITE); // white box behind the title text
    //fillRect(200, 0, 76, 10, GREEN); // green box

    // Write some text
    setTextColor(WHITE) ;
    setCursor(10, 1) ;
    setTextSize(1) ;
    writeString("ECE 4760") ;

    setTextColor(BLACK) ;
    setCursor(102, 1) ;
    setTextSize(1) ;
    writeString("VGA and DMAcpu serial corr ") ;


    // pause
    PT_YIELD_usec(10000) ;

    while(true) {
        //start_time = PT_GET_TIME_usec() ;
        
        
            // x: 9 random bits, so 0..511
            x= dma_rand(0x1ff);

            // y: 8 random bits plus 11, so 11..266
            // (the offset of 11 keeps points below the 10-pixel-high title boxes)
            y = dma_rand(0xff) + 11 ;

            // draw the new position; samples outside the limits are discarded
            if (x<319 && y<230)  drawPixel(x, y, WHITE) ;
        
        //end_time = PT_GET_TIME_usec() ;
        /*
        disabled profiling display:
        setTextColor2(BLACK, WHITE) ;
        setCursor(270, 1) ;
        setTextSize(1) ;
        char vid_str[10] ;
        sprintf(vid_str, "%03d", (end_time-start_time)/1000) ;
        writeString(vid_str) ;
    */
        // give up the processor until the next interval
        PT_YIELD_INTERVAL(500); 
   }
   PT_END(pt);
} // graphics thread

// ==================================================
// === toggle25 thread on core 0
// ==================================================
// Heartbeat: configures GPIO 25 (the on-board LED) as an output, drives it
// high, then inverts it once per PT_YIELD_INTERVAL(100000) period.
static PT_THREAD (protothread_toggle25(struct pt *pt))
{
    PT_BEGIN(pt);
    // current LED level; static so it survives across yields
    static bool LED_state = false ;

    // set up LED p25 to blink
    gpio_init(25) ;
    gpio_set_dir(25, GPIO_OUT) ;
    gpio_put(25, true);
    // data structure for interval timer
    PT_INTERVAL_INIT() ;

    // this loop never exits
    for(;;) {
        // yield time 0.1 second
        PT_YIELD_INTERVAL(100000) ;

        // flip the LED on the PICO
        LED_state = !LED_state ;
        gpio_put(25, LED_state);
    }
    PT_END(pt);
} // blink thread


// ==================================================
// === user's serial input thread on core 0
// ==================================================
// serial_read and serial_write do not block any thread
// except this one
static int r=7, g=7, b=3 ;
float h,s,v;
// Protothread that repeatedly prints the prompt "1 to start: ", waits for a
// line of input, and then sets new_value.
// NOTE(review): the number parsed by sscanf is stored into new_value and
// then immediately overwritten with true, so any input line raises the
// flag regardless of what was typed -- confirm this is intended.
static PT_THREAD (protothread_serial(struct pt *pt))
{
    PT_BEGIN(pt);
      //
      while(1) {
        // print prompt
        sprintf(pt_serial_out_buffer, "1 to start: ");
        // spawn a thread to do the non-blocking write
        serial_write ;

        // spawn a thread to do the non-blocking serial read
        serial_read ;
        sscanf(pt_serial_in_buffer, "%d  ", &new_value) ;
        // raise the flag for every received line
        new_value = true ;

        // NEVER exit while
      } // END WHILE(1)
  PT_END(pt);
} // serial thread

// ========================================
// === core 1 main -- started in main below
// ========================================
// Entry point intended for core 1: starts the protothreads scheduler.
// No threads are added here (the pt_add_thread calls are commented out),
// and the launch call in main is currently commented out as well.
void core1_main(){ 
  //
  //  === add threads  ====================
  // for core 1
  //pt_add_thread(protothread_toggle_gpio4) ;
  //pt_add_thread(protothread_serial) ;
  //
  // === initialize the scheduler ==========
  pt_schedule_start ;
  // NEVER exits
  // ======================================
}

// ========================================
// === core 0 main
// ========================================
// Program entry point. In order: starts stdio, initializes the VGA driver,
// sets up the SPI port and its pins, speeds up the ring oscillator, seeds
// the DMA sniffer data register from rand_rosc_VN(), builds and starts the
// DMA program, registers the three core-0 protothreads, and starts the
// scheduler, which never returns.
int main(){
  // set the clock
  //set_sys_clock_khz(250000, true); // 171us
  // start the serial i/o
  stdio_init_all() ;
  // announce the threader version on system reset
  printf("\n\rProtothreads RP2040 v1.11 two-core\n\r");

  // Initialize the VGA screen
  initVGA() ;

  // Initialize SPI channel (channel, baud rate set to 20MHz)
  // connected to spi DAC
  spi_init(SPI_PORT, 20000000) ;
  // Format (channel, data bits per transfer, polarity, phase, order)
  spi_set_format(SPI_PORT, 16, 0, 0, 0);
  // Map SPI signals to GPIO ports
  //gpio_set_function(PIN_MISO, GPIO_FUNC_SPI);
  gpio_set_function(PIN_SCK, GPIO_FUNC_SPI);
  gpio_set_function(PIN_MOSI, GPIO_FUNC_SPI);
  gpio_set_function(PIN_CS, GPIO_FUNC_SPI) ;

  // start the ROSC and speed it up
  rosc_setup();

  // seed the CRC computation
  dma_hw->sniff_data = rand_rosc_VN() ;
  // the register value is a 32-bit unsigned type; cast so the argument
  // matches the %x conversion
  printf("%x\n\r", (unsigned int)dma_hw->sniff_data) ;

  // define the DMA program
  DMA_machine_program() ;

  // turn on the DMA coprocessor
  // runs autonomously after this
  DMA_machine_start() ;

  // start core 1 threads
  //multicore_reset_core1();
  //multicore_launch_core1(&core1_main);

  // === config threads ========================
  // for core 0
  pt_add_thread(protothread_graphics);
  pt_add_thread(protothread_toggle25);
  pt_add_thread(protothread_serial) ;
  //
  // === initialize the scheduler ===============
  pt_schedule_start ;
  // NEVER exits
  // ===========================================
} // end main