Talking about PCIe and Xillybus
Skip to content
by kevin »
Don't touch the data, but send it exactly as it arrives.
by support »
// g++ -g -pedantic -Wall -Werror -Wextra -fsanitize=address host.cpp -o host `pkg-config --cflags --libs opencv`#include <opencv2/core/core.hpp>//#include <opencv2/imgcodecs/imgcodecs.hpp>#include <opencv2/highgui/highgui.hpp>#include <unistd.h>#include <fcntl.h>#include <iostream>#include <fstream> // std::ifstream, std::ofstream#include <string>#include <sys/wait.h>#include <errno.h>#include <cmath>using namespace cv;using namespace std;#define FORK 1#define LOOPBACK 1//define RGB2YUV 1unsigned int image_width;unsigned int image_height;const unsigned int CHNL_NUM = 3;const unsigned int RED_CHNL = 2;const unsigned int GREEN_CHNL = 1;const unsigned int BLUE_CHNL = 0;const unsigned int STREAM_WIDTH = 128; const unsigned int NUM_OF_BITS_PER_BYTE = 8;const unsigned int PIXEL_VALUE_RANGE = 8; // number of bits occupied by [R, G, B] or [Y, U, V] respectively (8-bit unsigned integer for each components) , https://docs.microsoft.com/en-us/windows-hardware/drivers/display/yuv-rgb-data-range-conversionsconst unsigned int NUM_OF_COMPONENTS_IN_A_PIXEL = 3; // input: [R, G, B] output:[Y, U, V]const unsigned int PIXEL_NUM_THAT_FITS_STREAM_WIDTH = 5; // 128-bit stream can at most fits 5 pixels ((PIXEL_NUM_THAT_FITS_STREAM_WIDTH*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE) bits = 120 bits), each pixels contains R, G, B which are encoded in 8 bits for each of the three color componentsstruct RGB_packet{ uint8_t R,G,B;};struct YUV_packet{ uint8_t Y,U,V;};struct YUV_packet* rgb2yuv(struct RGB_packet rgb_input) // convert rgb to yuv{ unsigned char R = rgb_input.R; unsigned char G = rgb_input.G; unsigned char B = rgb_input.B; int Y_temp, U_temp, V_temp; struct YUV_packet *yuv_result = (YUV_packet *)malloc(sizeof(unsigned char) * NUM_OF_COMPONENTS_IN_A_PIXEL); // https://en.wikipedia.org/wiki/YUV#Full_swing_for_BT.601 Y_temp = 77*R + 150*G + 29*B; U_temp = -43*R - 84*G + 127*B; V_temp = 127*R - 106*G - 21*B; Y_temp = (Y_temp + 128) >> 8; U_temp = (U_temp + 128) >> 8; V_temp = 
(V_temp + 128) >> 8; yuv_result->Y = Y_temp; yuv_result->U = U_temp + 128; yuv_result->V = V_temp + 128; return yuv_result;}/* Plain write() may not write all bytes requested in the buffer, so allwrite() loops until all data was indeed written, or exits in case of failure, except for EINTR. The way the EINTR condition is handled is the standard way of making sure the process can be suspended with CTRL-Z and then continue running properly. The function has no return value, because it always succeeds (or exits instead of returning). The function doesn't expect to reach EOF either.*/void allwrite(int fd, unsigned char *buf, unsigned int len) { unsigned int sent = 0; int rc; while (sent < len) { rc = write(fd, buf + sent, len - sent);//fsync(fd); if ((rc < 0) && (errno == EINTR)) continue; if (rc < 0) { perror("allwrite() failed to write"); exit(1); } if (rc == 0) { fprintf(stderr, "Reached write EOF (?!)\n"); exit(1); } sent += rc; } //return sent;}void allread(int fd, unsigned char *buf, unsigned int len) { unsigned int recvd = 0; int rc; while (recvd < len) { if(len < STREAM_WIDTH/NUM_OF_BITS_PER_BYTE) printf("before last read\n"); rc = read(fd, buf + recvd, len - recvd);//fsync(fd); if(len < STREAM_WIDTH/NUM_OF_BITS_PER_BYTE) printf("rc = %d\n", rc); if ((rc < 0) && (errno == EINTR)) continue; if (rc < 0) { perror("allwrite() failed to write"); exit(1); } if (rc == 0) { fprintf(stderr, "Reached write EOF (?!)\n"); exit(1); } recvd += rc; } //return recvd;}int main(int argc, char *argv[]) { int fdr, fdw; uint8_t *wr_buf, *rd_buf; #ifdef FORK int wait_status; // for wait() pid_t pid; #endif struct RGB_packet *tologic; struct YUV_packet *fromlogic; fdr = open("/dev/xillybus_read_128", O_RDONLY); // will change to /dev/xillybus_read_128 fdw = open("/dev/xillybus_write_128", O_WRONLY); // will change to /dev/xillybus_write_128 if ((fdr < 0) || (fdw < 0)) { perror("Failed to open Xillybus device file(s)"); exit(1); } // READ in an image file String imageName( 
"lena512color.tiff" ); // by default if( argc > 1) { imageName = argv[1]; } Mat image; image = imread( imageName, IMREAD_COLOR ); // Read the file if( image.empty() ) // Check for invalid input { cout << "Could not open or find the image" << std::endl ; return -1; } else { image_width = image.size().width; image_height = image.size().height; } namedWindow( "Original Image", WINDOW_AUTOSIZE ); imshow( "Original Image", image ); Mat rgbchannel[CHNL_NUM]; // The actual splitting. split(image, rgbchannel); namedWindow("Red", WINDOW_AUTOSIZE); imshow("Red", rgbchannel[RED_CHNL]); namedWindow("Green", WINDOW_AUTOSIZE); imshow("Green", rgbchannel[GREEN_CHNL]); namedWindow("Blue", WINDOW_AUTOSIZE); imshow("Blue", rgbchannel[BLUE_CHNL]); waitKey(0); // see all three split channels before feeding in the channel data to xillybus/RIFFA for hardware computation vector<RGB_packet> vTo(image_width * image_height); // lena.tiff is sized as 3*512*512 tologic = vTo.data(); if (!tologic) { fprintf(stderr, "Failed to allocate memory\n"); exit(1); } for(unsigned int pixel_index = 0; pixel_index < (image_width * image_height); pixel_index++) { tologic[pixel_index].R = *(rgbchannel[RED_CHNL].data + pixel_index); tologic[pixel_index].G = *(rgbchannel[GREEN_CHNL].data + pixel_index); tologic[pixel_index].B = *(rgbchannel[BLUE_CHNL].data + pixel_index); }#ifdef FORK pid = fork(); if (pid < 0) { perror("Failed to fork()"); exit(1); } if (pid) { close(fdr);#endif unsigned int num_of_pixels_sent = 0; // this is actual pixels number already sent, does not include the empty 8 bits //unsigned int if_index = 0; unsigned int rgb_stream_index = 0; uint8_t rgb_stream[STREAM_WIDTH/NUM_OF_BITS_PER_BYTE + 1]; // could accomodate 5 pixels while (num_of_pixels_sent < image_width * image_height) { if(((image_width * image_height)-num_of_pixels_sent) >= PIXEL_NUM_THAT_FITS_STREAM_WIDTH) { // arrange the five pixels in the format as in https://i.imgur.com/mdJwk7J.png //if_index++; //printf("if_index = 
%d\n\r", if_index); //if(if_index == 3) break; for(rgb_stream_index = 0; (rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1)<((STREAM_WIDTH/NUM_OF_BITS_PER_BYTE)-1); rgb_stream_index=rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL) { rgb_stream[rgb_stream_index] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].R; rgb_stream[rgb_stream_index+1] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].G; rgb_stream[rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].B; } rgb_stream[STREAM_WIDTH/NUM_OF_BITS_PER_BYTE] = '\0'; // however, this NULL character is not sent across write() rgb_stream[(STREAM_WIDTH/NUM_OF_BITS_PER_BYTE)-1] = 0; // remember that the eight most significant bits of the 128-bits stream are ignored by hardware logic /*for(unsigned int j=0; j<(STREAM_WIDTH/NUM_OF_BITS_PER_BYTE); j++) { printf("rgb_stream[%d] = %d\n\r", j, rgb_stream[j]); //break; }*/ wr_buf = rgb_stream; // write() writes wr_buf in first-in-first-out order, so rgb_stream[0] will be written first into fdw as the least significant byte allwrite(fdw, wr_buf, STREAM_WIDTH/NUM_OF_BITS_PER_BYTE); // this write() is 128-bits or 16 bytes which include the empty MSB 8 bits //printf("wr = %d\n", wr); num_of_pixels_sent = num_of_pixels_sent + PIXEL_NUM_THAT_FITS_STREAM_WIDTH; //printf("num_of_pixels_sent = %d\n", num_of_pixels_sent); } else // the remaining pixels do not fill all five pixel slots for a 128-bit stream { break; // just to send bytes which is divisible by "STREAM_WIDTH/NUM_OF_BITS_PER_BYTE" bytes for(rgb_stream_index = 0; (rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1)<(((image_width * image_height)-num_of_pixels_sent)*NUM_OF_COMPONENTS_IN_A_PIXEL); rgb_stream_index=rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL) { rgb_stream[rgb_stream_index] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].R; rgb_stream[rgb_stream_index+1] = 
tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].G; rgb_stream[rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].B; } rgb_stream[((image_width * image_height)-num_of_pixels_sent)*NUM_OF_COMPONENTS_IN_A_PIXEL + 1] = '\0'; // however, this NULL character is not sent across write() rgb_stream[((image_width * image_height)-num_of_pixels_sent)*NUM_OF_COMPONENTS_IN_A_PIXEL] = 0; // remember that the eight most significant bits of the 128-bits stream are ignored by hardware logic /*for(unsigned int j=0; j<(((image_width * image_height)-num_of_pixels_sent)*NUM_OF_COMPONENTS_IN_A_PIXEL+1); j++) { printf("rgb_stream[%d] = %d\n\r", j, rgb_stream[j]); }*/ wr_buf = rgb_stream; // this is a partially filled 128-bit stream (with less than 5 pixels) allwrite(fdw, wr_buf, ((image_width * image_height)-num_of_pixels_sent)*NUM_OF_COMPONENTS_IN_A_PIXEL+1); //printf("wr = %d\n", wr); break; // finish sending all (image_width * image_height) pixels } } write(fdw, NULL, 0); // flush the write stream close(fdw); #ifdef FORK printf("*** Write process enters waiting status .....\n"); pid = wait(&wait_status); printf("*** write process detects read process with pid %d was done ***\n", pid); // most probably write process will be done first, since FPGA computation takes a few clock cyles return 0; } else { close(fdw);#endif vector<YUV_packet> vFrom(image_width * image_height); fromlogic = vFrom.data(); //printf("fromlogic[0].Y = %p \n", &fromlogic[0].Y); if (!fromlogic) { fprintf(stderr, "Failed to allocate memory\n"); exit(1); } unsigned int num_of_pixels_received = 0; // this is actual pixels number already received, does not include the empty 8 bits unsigned int yuv_stream_index = 0; uint8_t yuv_stream[STREAM_WIDTH/NUM_OF_BITS_PER_BYTE + 1]; // could accomodate 5 pixels while (num_of_pixels_received < image_width * image_height) { if(((image_width * image_height)-num_of_pixels_received) 
>= PIXEL_NUM_THAT_FITS_STREAM_WIDTH) { rd_buf = yuv_stream; printf("before read() \n"); allread(fdr, rd_buf, STREAM_WIDTH/NUM_OF_BITS_PER_BYTE); printf("after read() \n"); // For every five pixels (128 bits) received from hardware logic computation, print out the YUV values of all five pixels /*for(yuv_stream_index = 0; yuv_stream_index<STREAM_WIDTH/NUM_OF_BITS_PER_BYTE; yuv_stream_index=yuv_stream_index+1) { printf("yuv_stream[%d] = %d\n", yuv_stream_index, yuv_stream[yuv_stream_index]); }*/ yuv_stream[STREAM_WIDTH/NUM_OF_BITS_PER_BYTE] = '\0'; // this NULL character is only to act as "stop bit" for character array // store the calculated output YUV pixels into fromlogic such that we could reconstruct the image in its original dimension for visual display for(yuv_stream_index = 0; (yuv_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1)<(STREAM_WIDTH/NUM_OF_BITS_PER_BYTE); yuv_stream_index=yuv_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL) { fromlogic[(yuv_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_received].Y = yuv_stream[yuv_stream_index]; fromlogic[(yuv_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_received].U = yuv_stream[yuv_stream_index+1]; fromlogic[(yuv_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_received].V = yuv_stream[yuv_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1]; } num_of_pixels_received = num_of_pixels_received + PIXEL_NUM_THAT_FITS_STREAM_WIDTH; printf("num_of_pixels_received = %d\n\r", num_of_pixels_received); //if(num_of_pixels_received == 40940) break; // just to test if there is actually something being read, or returned from hardware } else // the remaining pixels do not fill all five pixel slots for a 128-bit stream { break; // just to test the rest of received bytes which is divisible by "STREAM_WIDTH/NUM_OF_BITS_PER_BYTE" bytes rd_buf = yuv_stream; printf("before read in else. 
\n"); allread(fdr, rd_buf, ((image_width * image_height)-num_of_pixels_received)*NUM_OF_COMPONENTS_IN_A_PIXEL); // is a partially filled 128-bit stream (with less than 5 pixels) printf("after read in else. \n"); break; // finish receiving all (image_width * image_height) pixels } } printf("before for loop, data integrity check\n"); for (unsigned int i = 0; i < (image_width * image_height)-num_of_pixels_received; i++) // check the perfomance of hardware with respect to software computation { #ifdef LOOPBACK if( (tologic[i].R != fromlogic[i].Y) || (tologic[i].G != fromlogic[i].U) || (tologic[i].B != fromlogic[i].V) ) #elif RGB2YUV uint8_t expected_Y = rgb2yuv(tologic[i])->Y; uint8_t expected_U = rgb2yuv(tologic[i])->U; uint8_t expected_V = rgb2yuv(tologic[i])->V; if( (abs(expected_Y - fromlogic[i].Y) > 1) || (abs(expected_U - fromlogic[i].U) > 1) || (abs(expected_V - fromlogic[i].V) > 1) ) // rgb2yuv conversion hardware tolerance fails by more than 1 compared to software computation #endif { printf("********************************* Attention *************************************\n\r"); printf("R:%d G:%d B:%d \n\r", tologic[i].R, tologic[i].G, tologic[i].B); printf("Y:%d U:%d V:%d \n\r", fromlogic[i].Y, fromlogic[i].U, fromlogic[i].V); #ifdef RGB2YUV printf("expected_Y:%d expected_U:%d expected_V:%d \n\r", expected_Y, expected_U, expected_V); #endif break; // just for troubleshooting //exit(1); } } free(tologic); free(fromlogic); printf("after for loop, data integrity check\n"); close(fdr);#ifdef FORK printf("*** Read process enters waiting status .....\n"); pid = wait(&wait_status); printf("*** read process detects write process with pid %d was done ***\n", pid); // most probably write process will be done first, since FPGA computation takes a few clock cyles return 0; }#endif /*pid = wait(&wait_status); printf("*** Parent detects process %d is done ***\n", pid); printf("*** Parent exits ***\n");*/ exit(0);}
// g++ -g -pedantic -Wall -Werror -Wextra -fsanitize=address test.cpp -o test
//
// Minimal Xillybus smoke test: one process writes a constant byte pattern the
// size of a 512x512 RGB image to /dev/xillybus_write_128 while a second
// process reads the same number of bytes back from /dev/xillybus_read_128.
#define FORK 1

#include <unistd.h>
#include <fcntl.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <sys/wait.h>
#include <errno.h>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

using namespace std;

const unsigned int image_width = 512;
const unsigned int image_height = 512;
const unsigned int NUM_OF_COMPONENTS_IN_A_PIXEL = 3; // input: [R, G, B] output:[Y, U, V]

const unsigned int STREAM_WORD_BYTES = 16; // one 128-bit stream word
const unsigned int TOTAL_BYTES = image_width * image_height * NUM_OF_COMPONENTS_IN_A_PIXEL;

// A 128-bit stream only transfers whole words, so the payload length must be
// a multiple of 16 bytes or the tail would never reach the FPGA.
static_assert(TOTAL_BYTES % STREAM_WORD_BYTES == 0,
              "payload must be a whole number of 128-bit words");

/*
   Loop on write() until all len bytes of buf were written. EINTR is retried;
   any other error, or a zero-byte write, terminates the process.
   Note: with len <= 0 no write() call is made at all, so this function cannot
   be used to request a flush.
*/
void allwrite(int fd, unsigned char *buf, int len)
{
    int sent = 0;

    while (sent < len) {
        const ssize_t rc = write(fd, buf + sent, len - sent);

        if ((rc < 0) && (errno == EINTR))
            continue;

        if (rc < 0) {
            perror("allwrite() failed to write");
            exit(1);
        }

        if (rc == 0) {
            fprintf(stderr, "Reached write EOF (?!)\n");
            exit(1);
        }

        sent += static_cast<int>(rc);
    }
}

/*
   Loop on read() until exactly len bytes were stored in buf. EINTR is
   retried; any other error, or an unexpected EOF, terminates the process.
*/
void allread(int fd, unsigned char *buf, int len)
{
    int recvd = 0;

    while (recvd < len) {
        const ssize_t rc = read(fd, buf + recvd, len - recvd);

        if ((rc < 0) && (errno == EINTR))
            continue;

        if (rc < 0) {
            perror("allread() failed to read");
            exit(1);
        }

        if (rc == 0) {
            fprintf(stderr, "Reached read EOF (?!)\n");
            exit(1);
        }

        recvd += static_cast<int>(rc);
    }
}

int main()
{
    const int fdr = open("/dev/xillybus_read_128", O_RDONLY);
    const int fdw = open("/dev/xillybus_write_128", O_WRONLY);

    if ((fdr < 0) || (fdw < 0)) {
        perror("Failed to open Xillybus device file(s)");
        exit(1);
    }

#ifdef FORK
    pid_t pid = fork();

    if (pid < 0) {
        perror("Failed to fork()");
        exit(1);
    }

    if (pid) {
        // Parent: writer process.
        close(fdr);
#endif

        // Heap storage instead of a ~768 KiB stack array.
        vector<uint8_t> rgb_stream(TOTAL_BYTES, 1); // send all ones to fpga

        allwrite(fdw, rgb_stream.data(), static_cast<int>(TOTAL_BYTES));

        // Flush the write stream. This has to be a direct zero-length write():
        // allwrite() with len == 0 returns without calling write().
        for (;;) {
            const ssize_t rc = write(fdw, nullptr, 0);

            if ((rc < 0) && (errno == EINTR))
                continue;

            if (rc < 0)
                perror("flushing the write stream failed");

            break;
        }

        printf("after allwrite() \n");

        close(fdw);

#ifdef FORK
        printf("*** Write process enters waiting status .....\n");

        int wait_status; // for wait()
        pid = wait(&wait_status);

        printf("*** write process detects read process with pid %d was done ***\n", static_cast<int>(pid));

        return 0;
    }
    else {
        // Child: reader process. It has no children, so it does not wait().
        close(fdw);
#endif

        vector<uint8_t> yuv_stream(TOTAL_BYTES);

        printf("before allread() \n");
        allread(fdr, yuv_stream.data(), static_cast<int>(TOTAL_BYTES));
        printf("after allread() \n");

        close(fdr);

#ifdef FORK
        return 0;
    }
#endif

    return 0;
}
Since you're using a 128-bit wide stream, you have to make sure that the total length of the data you send is divisible by 16 bytes (16 x 8 = 128 bits); otherwise the last, incomplete word won't be sent to the FPGA.
// Top-level module of the Xillybus demo design, modified for a 128-bit
// stream pair (/dev/xillybus_write_128 -> logic -> /dev/xillybus_read_128).
//
// With LOOPBACK defined, words written by the host pass through a single FIFO
// and are read back unchanged. Otherwise the words pass through an input FIFO,
// KERNEL_NUM instances of the `kernel` module (one per pixel slot of a word)
// and an output FIFO. In both variants an ILA core probes the stream signals.
//
// The modules xillybus, fifo_128, fifo_fwft_128, kernel, ila_0 and ila_1 are
// defined outside this file.
`define LOOPBACK 1

module xillydemo(PCIE_PERST_B_LS, PCIE_REFCLK_N, PCIE_REFCLK_P, PCIE_RX_N, PCIE_RX_P, GPIO_LED, PCIE_TX_N, PCIE_TX_P);

  localparam STREAM_WIDTH = 128;

  input PCIE_PERST_B_LS;
  input PCIE_REFCLK_N;
  input PCIE_REFCLK_P;
  input [7:0] PCIE_RX_N;
  input [7:0] PCIE_RX_P;
  output [3:0] GPIO_LED;
  output [7:0] PCIE_TX_N;
  output [7:0] PCIE_TX_P;

  // Clock and quiesce
  wire bus_clk;
  wire quiesce;

  // Memory array
  // NOTE(review): entries are 8 bits wide and there are 32 of them, while the
  // data buses below are STREAM_WIDTH bits and the address is
  // $clog2(STREAM_WIDTH) = 7 bits wide - confirm this is intended.
  reg [7:0] demoarray[0:31];

  // Wires related to /dev/xillybus_mem_128
  wire user_r_mem_128_rden;
  wire user_r_mem_128_empty;
  reg [STREAM_WIDTH-1:0] user_r_mem_128_data;
  wire user_r_mem_128_eof;
  wire user_r_mem_128_open;
  wire user_w_mem_128_wren;
  wire user_w_mem_128_full;
  wire [STREAM_WIDTH-1:0] user_w_mem_128_data;
  wire user_w_mem_128_open;
  wire [$clog2(STREAM_WIDTH)-1:0] user_mem_128_addr;
  wire user_mem_128_addr_update;

  // Wires related to /dev/xillybus_read_128
  wire user_r_read_128_rden;
  wire user_r_read_128_empty;
  wire [STREAM_WIDTH-1:0] user_r_read_128_data;
  wire user_r_read_128_eof;
  wire user_r_read_128_open;

  // Wires related to /dev/xillybus_write_128
  wire user_w_write_128_wren;
  wire user_w_write_128_full;
  wire [STREAM_WIDTH-1:0] user_w_write_128_data;
  wire user_w_write_128_open;

  // Wires related to /dev/xillybus_read_256
  // (declared and connected to the core, but not driven by any logic here)
  wire user_r_read_256_rden;
  wire user_r_read_256_empty;
  wire [(STREAM_WIDTH << 1)-1:0] user_r_read_256_data;
  wire user_r_read_256_eof;
  wire user_r_read_256_open;

  // Wires related to /dev/xillybus_write_256
  // (declared and connected to the core, but not driven by any logic here)
  wire user_w_write_256_wren;
  wire user_w_write_256_full;
  wire [(STREAM_WIDTH << 1)-1:0] user_w_write_256_data;
  wire user_w_write_256_open;

  xillybus xillybus_ins (

    // Ports related to /dev/xillybus_mem_128
    // FPGA to CPU signals:
    .user_r_mem_128_rden(user_r_mem_128_rden),
    .user_r_mem_128_empty(user_r_mem_128_empty),
    .user_r_mem_128_data(user_r_mem_128_data),
    .user_r_mem_128_eof(user_r_mem_128_eof),
    .user_r_mem_128_open(user_r_mem_128_open),

    // CPU to FPGA signals:
    .user_w_mem_128_wren(user_w_mem_128_wren),
    .user_w_mem_128_full(user_w_mem_128_full),
    .user_w_mem_128_data(user_w_mem_128_data),
    .user_w_mem_128_open(user_w_mem_128_open),

    // Address signals:
    .user_mem_128_addr(user_mem_128_addr),
    .user_mem_128_addr_update(user_mem_128_addr_update),

    // Ports related to /dev/xillybus_read_256
    // FPGA to CPU signals:
    .user_r_read_256_rden(user_r_read_256_rden),
    .user_r_read_256_empty(user_r_read_256_empty),
    .user_r_read_256_data(user_r_read_256_data),
    .user_r_read_256_eof(user_r_read_256_eof),
    .user_r_read_256_open(user_r_read_256_open),

    // Ports related to /dev/xillybus_write_256
    // CPU to FPGA signals:
    .user_w_write_256_wren(user_w_write_256_wren),
    .user_w_write_256_full(user_w_write_256_full),
    .user_w_write_256_data(user_w_write_256_data),
    .user_w_write_256_open(user_w_write_256_open),

    // Ports related to /dev/xillybus_read_128
    // FPGA to CPU signals:
    .user_r_read_128_rden(user_r_read_128_rden),
    .user_r_read_128_empty(user_r_read_128_empty),
    .user_r_read_128_data(user_r_read_128_data),
    .user_r_read_128_eof(user_r_read_128_eof),
    .user_r_read_128_open(user_r_read_128_open),

    // Ports related to /dev/xillybus_write_128
    // CPU to FPGA signals:
    .user_w_write_128_wren(user_w_write_128_wren),
    .user_w_write_128_full(user_w_write_128_full),
    .user_w_write_128_data(user_w_write_128_data),
    .user_w_write_128_open(user_w_write_128_open),

    // Signals to top level
    .PCIE_PERST_B_LS(PCIE_PERST_B_LS),
    .PCIE_REFCLK_N(PCIE_REFCLK_N),
    .PCIE_REFCLK_P(PCIE_REFCLK_P),
    .PCIE_RX_N(PCIE_RX_N),
    .PCIE_RX_P(PCIE_RX_P),
    .GPIO_LED(GPIO_LED),
    .PCIE_TX_N(PCIE_TX_N),
    .PCIE_TX_P(PCIE_TX_P),
    .bus_clk(bus_clk),
    .quiesce(quiesce)
  );

  // A simple inferred RAM
  // NOTE(review): the STREAM_WIDTH-bit write data is stored into an 8-bit
  // entry, so only its low 8 bits are kept; on read the 8-bit entry is
  // zero-extended into user_r_mem_128_data.
  always @(posedge bus_clk)
    begin
      if (user_w_mem_128_wren)
        demoarray[user_mem_128_addr] <= user_w_mem_128_data;

      if (user_r_mem_128_rden)
        user_r_mem_128_data <= demoarray[user_mem_128_addr];
    end

  // The memory interface is always ready and never signals EOF.
  assign user_r_mem_128_empty = 0;
  assign user_r_mem_128_eof = 0;
  assign user_w_mem_128_full = 0;

`ifdef LOOPBACK

  wire [$clog2(STREAM_WIDTH)-1:0] data_count_of_loopback_fifo;

  // 128-bit loopback
  // The FIFO is held in reset while both device files are closed.
  fifo_128 fifo_128x128
  (
    .clk(bus_clk),
    .reset(!user_w_write_128_open && !user_r_read_128_open),
    .flush_en(0),
    .value_i(user_w_write_128_data),
    .enqueue_en(user_w_write_128_wren),
    .dequeue_en(user_r_read_128_rden),
    .value_o(user_r_read_128_data),
    .full(user_w_write_128_full),
    .empty(user_r_read_128_empty),
    .count(data_count_of_loopback_fifo)
  );

  assign user_r_read_128_eof = 0;

  localparam TOTAL_NUM_OF_PIXELS = 512*512; // the image is sized as 3*512*512 , width=512 and height=512
  localparam NUM_OF_COMPONENTS_IN_A_PIXEL = 3; // input: [R, G, B] output:[Y, U, V]
  localparam PIXEL_VALUE_RANGE = 8; // number of bits occupied by [R, G, B] or [Y, U, V] respectively (8-bit unsigned integer for each components) , https://docs.microsoft.com/en-us/windows-hardware/drivers/display/yuv-rgb-data-range-conversions

  // to check if xillybus has transmitted all pixels data through the loopback fifo
  // Debug counter only (it feeds the ILA probe9): counts write strobes and
  // saturates at TOTAL_NUM_OF_PIXELS*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE/STREAM_WIDTH.
  reg [$clog2(TOTAL_NUM_OF_PIXELS*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE/STREAM_WIDTH)-1:0] number_of_128_bit_data_passed_through_loopback_fifo = 0; // initially nothing is received

  always@ (posedge bus_clk)
    begin
      if(!user_w_write_128_open && !user_r_read_128_open)
        number_of_128_bit_data_passed_through_loopback_fifo <= 0;

      else if(user_w_write_128_wren && (number_of_128_bit_data_passed_through_loopback_fifo < (TOTAL_NUM_OF_PIXELS*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE/STREAM_WIDTH)))
        number_of_128_bit_data_passed_through_loopback_fifo <= number_of_128_bit_data_passed_through_loopback_fifo + 1; // for every xillybus transaction, input fifo should receive 128 bits, or equivalently 'KERNEL_NUM' pieces of pixels
    end

  // Vivado built-in internal logic analyzer module instantiation
  ila_1 ila(
    .clk(bus_clk),
    .probe0(user_w_write_128_data),
    .probe1(user_r_read_128_data),
    .probe2(data_count_of_loopback_fifo),
    .probe3(user_w_write_128_full),
    .probe4(user_w_write_128_wren),
    .probe5(user_r_read_128_rden),
    .probe6(user_r_read_128_empty),
    .probe7(user_w_write_128_open),
    .probe8(user_r_read_128_open),
    .probe9(number_of_128_bit_data_passed_through_loopback_fifo)
  );

`else

  // Signals for ($floor((STREAM_WIDTH/PIXEL_VALUE_RANGE)/NUM_OF_COMPONENTS_IN_A_PIXEL) = 5) kernels
  // since an image pixel is unsigned 8-bit integer, its component values of [R, G, B] or [Y, U, V] range from 0 to 255.
  // A pixel occupies 3*8=24 bits. Therefore, in each transaction, we could at most put 5 pixels (120 bits) into /dev/xillybus_write_128,
  // computes the relevant kernel equations for 5 pixels, send out 5 pixels again through /dev/xillybus_read_128

  localparam NUM_OF_COMPONENTS_IN_A_PIXEL = 3; // input: [R, G, B] , output:[Y, U, V]
  localparam PIXEL_VALUE_RANGE = 8; // number of bits occupied by [R, G, B] and [Y, U, V] respectively (8-bit unsigned integer for each components) , https://docs.microsoft.com/en-us/windows-hardware/drivers/display/yuv-rgb-data-range-conversions
  localparam KERNEL_NUM = 5; // 5 copies of kernel, each kernel computes equation for [R, G, B] of one single pixel
  localparam TOTAL_NUM_OF_PIXELS = 512*512; // lena.tiff is sized as 3*512*512 , width=512 and height=512

  // Signals for two buffer FIFOs
  localparam FIFO_DEPTH = 16;

  wire [$clog2(FIFO_DEPTH):0] data_count_of_input_fifo; // determines whether all five pixel slots have incoming data or not
  wire [$clog2(FIFO_DEPTH):0] data_count_of_output_fifo;

  //-------------------------------------------kernel----------------------------------------//

  wire [STREAM_WIDTH-1:0] stream_i_V_V_dout; // Read data for 5 pixels or KERNEL_NUM*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE = 120 bits (note that we neglected the most significant 8 bits)
  wire stream_i_V_V_empty; // Empty condition
  wire [KERNEL_NUM*NUM_OF_COMPONENTS_IN_A_PIXEL-1:0] stream_i_V_V_read; // Read enable for each color components of all five pixels, high active

  wire [STREAM_WIDTH-1:0] stream_o_V_V_din; // Write data for 5 pixels or KERNEL_NUM*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE = 120 bits (note that we neglected the most significant 8 bits)
  wire stream_o_V_V_full; // Full condition
  wire [KERNEL_NUM*NUM_OF_COMPONENTS_IN_A_PIXEL-1:0] stream_o_V_V_write; // Write enable for each color components of all five pixels, high active

  wire is_last_few_pixels = (data_count_of_input_fifo < KERNEL_NUM) && (!user_w_write_128_wren); // the remaining pixels do not fill all five pixel slots for a 128-bit stream, AND the input FIFO is not accepting any more pixels (in contrary when it is filling in the input FIFO at the initial time) AND the input FIFO is outputting pixels

  reg [KERNEL_NUM-1:0] ap_start = 0; // initially the HLS kernels are not started
  wire [KERNEL_NUM-1:0] ap_done;
  wire [KERNEL_NUM-1:0] ap_idle;
  wire [KERNEL_NUM-1:0] ap_ready;

  // NOTE(review): in the is_last_few_pixels case the FIFO word count itself is
  // assigned to ap_start, i.e. its binary value is used as the per-kernel
  // start mask - confirm that is the intended encoding.
  always @(posedge bus_clk)
    ap_start <= (is_last_few_pixels) ? (data_count_of_input_fifo) : {KERNEL_NUM{(&stream_i_V_V_read || !stream_i_V_V_empty)}}; // start signals depend on whether all five pixel slots are filled or not

  // -----------------input FIFO ----------------------------------//

  // A word is dequeued only when all 15 per-component read enables are high.
  fifo_fwft_128
  #(
    .WIDTH(STREAM_WIDTH),
    .SIZE(FIFO_DEPTH)
  )
  input_pipe(
    .clk(bus_clk),
    .reset(!user_w_write_128_open && !user_r_read_128_open),
    .flush_en(0),
    .value_i(user_w_write_128_data),
    .enqueue_en(user_w_write_128_wren),
    .dequeue_en(&stream_i_V_V_read),
    .value_o(stream_i_V_V_dout),
    .full(user_w_write_128_full),
    .empty(stream_i_V_V_empty),
    .count(data_count_of_input_fifo)
  );

  // to check if xillybus has transmitted all pixels data to the input_pipe fifo
  reg [$clog2(TOTAL_NUM_OF_PIXELS)-1:0] number_of_pixels_received_by_input_fifo = 0; // initially nothing is received

  always@ (posedge bus_clk)
    begin
      if(!user_w_write_128_open && !user_r_read_128_open)
        number_of_pixels_received_by_input_fifo <= 0;

      else if(user_w_write_128_wren && (number_of_pixels_received_by_input_fifo <= (TOTAL_NUM_OF_PIXELS-KERNEL_NUM)))
        number_of_pixels_received_by_input_fifo <= number_of_pixels_received_by_input_fifo + KERNEL_NUM; // for every xillybus transaction, input fifo should receive 'KERNEL_NUM' pieces of pixels
    end

  // use of generate loop to replicate 5 hardware copies of RGB2YUV computational HLS kernels for 5 different pixels
  // Kernel kn handles bits [kn*24 +: 24] of the word: the lowest byte is the
  // first component, the middle byte the second, the highest byte the third.
  generate
    genvar kn; // to indicate which kernel

    for(kn=0; kn<KERNEL_NUM; kn=kn+1)
      begin
        kernel RGB2YUV_kn
        (
          .ap_clk(bus_clk),
          .ap_rst(!user_w_write_128_open && !user_r_read_128_open),
          .ap_start(ap_start[kn]), // need to confirm ?
          .ap_done(ap_done[kn]),
          .ap_idle(ap_idle[kn]),
          .ap_ready(ap_ready[kn]),

          .stream_i0_V_V_dout(stream_i_V_V_dout[kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + PIXEL_VALUE_RANGE - 1 : kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE]), // input component R with (PIXEL_VALUE_RANGE) bits
          .stream_i0_V_V_empty_n(!stream_i_V_V_empty),
          .stream_i0_V_V_read(stream_i_V_V_read[kn*NUM_OF_COMPONENTS_IN_A_PIXEL]),

          .stream_i1_V_V_dout(stream_i_V_V_dout[kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)*PIXEL_VALUE_RANGE - 1 : kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + PIXEL_VALUE_RANGE]), // input component G with (PIXEL_VALUE_RANGE) bits
          .stream_i1_V_V_empty_n(!stream_i_V_V_empty),
          .stream_i1_V_V_read(stream_i_V_V_read[kn*NUM_OF_COMPONENTS_IN_A_PIXEL + 1]),

          .stream_i2_V_V_dout(stream_i_V_V_dout[kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE - 1 : kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)*PIXEL_VALUE_RANGE]), // input component B with (PIXEL_VALUE_RANGE) bits
          .stream_i2_V_V_empty_n(!stream_i_V_V_empty),
          .stream_i2_V_V_read(stream_i_V_V_read[kn*NUM_OF_COMPONENTS_IN_A_PIXEL + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)]),

          .stream_o0_V_V_din(stream_o_V_V_din[kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + PIXEL_VALUE_RANGE - 1 : kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE]), // output component Y with (PIXEL_VALUE_RANGE) bits
          .stream_o0_V_V_full_n(!stream_o_V_V_full),
          .stream_o0_V_V_write(stream_o_V_V_write[kn*NUM_OF_COMPONENTS_IN_A_PIXEL]),

          .stream_o1_V_V_din(stream_o_V_V_din[kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)*PIXEL_VALUE_RANGE - 1 : kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + PIXEL_VALUE_RANGE]), // output component U with (PIXEL_VALUE_RANGE) bits
          .stream_o1_V_V_full_n(!stream_o_V_V_full),
          .stream_o1_V_V_write(stream_o_V_V_write[kn*NUM_OF_COMPONENTS_IN_A_PIXEL + 1]),

          .stream_o2_V_V_din(stream_o_V_V_din[kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE - 1 : kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)*PIXEL_VALUE_RANGE]), // output component V with (PIXEL_VALUE_RANGE) bits
          .stream_o2_V_V_full_n(!stream_o_V_V_full),
          .stream_o2_V_V_write(stream_o_V_V_write[kn*NUM_OF_COMPONENTS_IN_A_PIXEL + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)])
        );
      end
  endgenerate

  // The bits above the five pixel slots (the top byte) are driven to zero.
  assign stream_o_V_V_din[STREAM_WIDTH-1 : (KERNEL_NUM-1)*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE] = 0; // (note that we neglected the most significant 8 bits)

  //----------------------output FIFO-----------------------------//

  // A word is enqueued only when all 15 per-component write enables are high.
  fifo_128
  #(
    .WIDTH(STREAM_WIDTH),
    .SIZE(FIFO_DEPTH)
  )
  output_pipe
  (
    .clk(bus_clk),
    .reset(!user_w_write_128_open && !user_r_read_128_open),
    .flush_en(0),
    .value_i(stream_o_V_V_din),
    .enqueue_en(&stream_o_V_V_write),
    .dequeue_en(user_r_read_128_rden),
    .value_o(user_r_read_128_data),
    .full(stream_o_V_V_full),
    .empty(user_r_read_128_empty),
    .count(data_count_of_output_fifo)
  );

  // to check if xillybus has transmitted all pixels data from the output_pipe fifo
  reg [$clog2(TOTAL_NUM_OF_PIXELS)-1:0] number_of_pixels_sent_by_output_fifo = 0; // initially nothing is sent

  always@ (posedge bus_clk)
    begin
      if(!user_w_write_128_open && !user_r_read_128_open)
        number_of_pixels_sent_by_output_fifo <= 0;

      else if(user_r_read_128_rden && (number_of_pixels_sent_by_output_fifo <= (TOTAL_NUM_OF_PIXELS-KERNEL_NUM)))
        number_of_pixels_sent_by_output_fifo <= number_of_pixels_sent_by_output_fifo + KERNEL_NUM; // for every xillybus transaction, output fifo should send 'KERNEL_NUM' pieces of pixels
    end

  assign user_r_read_128_eof = 0;

  // Vivado built-in internal logic analyzer module instantiation
  ila_0 ila(
    .clk(bus_clk),
    .probe0(user_w_write_128_data),
    .probe1(stream_i_V_V_dout),
    .probe2(stream_o_V_V_din),
    .probe3(user_r_read_128_data),
    .probe4(stream_i_V_V_read),
    .probe5(stream_o_V_V_write),
    .probe6(data_count_of_input_fifo),
    .probe7(data_count_of_output_fifo),
    .probe8(user_w_write_128_full),
    .probe9(stream_i_V_V_empty),
    .probe10(user_w_write_128_wren),
    .probe11(user_r_read_128_rden),
    .probe12(stream_o_V_V_full),
    .probe13(user_r_read_128_empty),
    .probe14(user_w_write_128_open),
    .probe15(user_r_read_128_open),
    .probe16(ap_start),
    .probe17(ap_done),
    .probe18(ap_idle),
    .probe19(ap_ready),
    .probe20(is_last_few_pixels),
    .probe21(number_of_pixels_received_by_input_fifo),
    .probe22(number_of_pixels_sent_by_output_fifo)
  );

`endif

endmodule
`define LOOPBACK 1

// Top-level Xillybus demo module.
//
// Instantiates the Xillybus IP core and attaches user logic to its streams:
//   * /dev/xillybus_mem_128            : a small inferred RAM
//   * /dev/xillybus_{write,read}_128   : either a plain loopback FIFO (when
//     LOOPBACK is defined) or an input FIFO -> KERNEL_NUM RGB2YUV kernels ->
//     output FIFO pipeline (when it is not).
// An ILA core is attached in both configurations for debugging.
//
// NOTE(review): the /dev/xillybus_{read,write}_256 wires are declared and
// connected to the core but have no user logic driving them in this module.
module xillydemo(PCIE_PERST_B_LS, PCIE_REFCLK_N, PCIE_REFCLK_P, PCIE_RX_N, PCIE_RX_P,
                 GPIO_LED, PCIE_TX_N, PCIE_TX_P);

   localparam STREAM_WIDTH = 128;

   input        PCIE_PERST_B_LS;
   input        PCIE_REFCLK_N;
   input        PCIE_REFCLK_P;
   input  [7:0] PCIE_RX_N;
   input  [7:0] PCIE_RX_P;
   output [3:0] GPIO_LED;
   output [7:0] PCIE_TX_N;
   output [7:0] PCIE_TX_P;

   // Clock and quiesce
   wire bus_clk;
   wire quiesce;

   // Memory array
   // NOTE(review): entries are 8 bits wide while user_w_mem_128_data is
   // STREAM_WIDTH bits wide, and the address bus below is wider than needed
   // for 32 entries -- confirm against the IP core configuration.
   reg [7:0] demoarray[0:31];

   // Wires related to /dev/xillybus_mem_128
   wire                            user_r_mem_128_rden;
   wire                            user_r_mem_128_empty;
   reg  [STREAM_WIDTH-1:0]         user_r_mem_128_data;
   wire                            user_r_mem_128_eof;
   wire                            user_r_mem_128_open;
   wire                            user_w_mem_128_wren;
   wire                            user_w_mem_128_full;
   wire [STREAM_WIDTH-1:0]         user_w_mem_128_data;
   wire                            user_w_mem_128_open;
   wire [$clog2(STREAM_WIDTH)-1:0] user_mem_128_addr;
   wire                            user_mem_128_addr_update;

   // Wires related to /dev/xillybus_read_128
   wire                    user_r_read_128_rden;
   wire                    user_r_read_128_empty;
   wire [STREAM_WIDTH-1:0] user_r_read_128_data;
   wire                    user_r_read_128_eof;
   wire                    user_r_read_128_open;

   // Wires related to /dev/xillybus_write_128
   wire                    user_w_write_128_wren;
   wire                    user_w_write_128_full;
   wire [STREAM_WIDTH-1:0] user_w_write_128_data;
   wire                    user_w_write_128_open;

   // Wires related to /dev/xillybus_read_256
   wire                           user_r_read_256_rden;
   wire                           user_r_read_256_empty;
   wire [(STREAM_WIDTH << 1)-1:0] user_r_read_256_data;
   wire                           user_r_read_256_eof;
   wire                           user_r_read_256_open;

   // Wires related to /dev/xillybus_write_256
   wire                           user_w_write_256_wren;
   wire                           user_w_write_256_full;
   wire [(STREAM_WIDTH << 1)-1:0] user_w_write_256_data;
   wire                           user_w_write_256_open;

   xillybus xillybus_ins (

      // Ports related to /dev/xillybus_mem_128
      // FPGA to CPU signals:
      .user_r_mem_128_rden(user_r_mem_128_rden),
      .user_r_mem_128_empty(user_r_mem_128_empty),
      .user_r_mem_128_data(user_r_mem_128_data),
      .user_r_mem_128_eof(user_r_mem_128_eof),
      .user_r_mem_128_open(user_r_mem_128_open),

      // CPU to FPGA signals:
      .user_w_mem_128_wren(user_w_mem_128_wren),
      .user_w_mem_128_full(user_w_mem_128_full),
      .user_w_mem_128_data(user_w_mem_128_data),
      .user_w_mem_128_open(user_w_mem_128_open),

      // Address signals:
      .user_mem_128_addr(user_mem_128_addr),
      .user_mem_128_addr_update(user_mem_128_addr_update),

      // Ports related to /dev/xillybus_read_256
      // FPGA to CPU signals:
      .user_r_read_256_rden(user_r_read_256_rden),
      .user_r_read_256_empty(user_r_read_256_empty),
      .user_r_read_256_data(user_r_read_256_data),
      .user_r_read_256_eof(user_r_read_256_eof),
      .user_r_read_256_open(user_r_read_256_open),

      // Ports related to /dev/xillybus_write_256
      // CPU to FPGA signals:
      .user_w_write_256_wren(user_w_write_256_wren),
      .user_w_write_256_full(user_w_write_256_full),
      .user_w_write_256_data(user_w_write_256_data),
      .user_w_write_256_open(user_w_write_256_open),

      // Ports related to /dev/xillybus_read_128
      // FPGA to CPU signals:
      .user_r_read_128_rden(user_r_read_128_rden),
      .user_r_read_128_empty(user_r_read_128_empty),
      .user_r_read_128_data(user_r_read_128_data),
      .user_r_read_128_eof(user_r_read_128_eof),
      .user_r_read_128_open(user_r_read_128_open),

      // Ports related to /dev/xillybus_write_128
      // CPU to FPGA signals:
      .user_w_write_128_wren(user_w_write_128_wren),
      .user_w_write_128_full(user_w_write_128_full),
      .user_w_write_128_data(user_w_write_128_data),
      .user_w_write_128_open(user_w_write_128_open),

      // Signals to top level
      .PCIE_PERST_B_LS(PCIE_PERST_B_LS),
      .PCIE_REFCLK_N(PCIE_REFCLK_N),
      .PCIE_REFCLK_P(PCIE_REFCLK_P),
      .PCIE_RX_N(PCIE_RX_N),
      .PCIE_RX_P(PCIE_RX_P),
      .GPIO_LED(GPIO_LED),
      .PCIE_TX_N(PCIE_TX_N),
      .PCIE_TX_P(PCIE_TX_P),
      .bus_clk(bus_clk),
      .quiesce(quiesce)
   );

   // A simple inferred RAM
   always @(posedge bus_clk)
     begin
        if (user_w_mem_128_wren)
          demoarray[user_mem_128_addr] <= user_w_mem_128_data;

        if (user_r_mem_128_rden)
          user_r_mem_128_data <= demoarray[user_mem_128_addr];
     end

   assign user_r_mem_128_empty = 0;
   assign user_r_mem_128_eof   = 0;
   assign user_w_mem_128_full  = 0;

`ifdef LOOPBACK

   wire [$clog2(STREAM_WIDTH)-1:0] data_count_of_loopback_fifo;

   // 128-bit loopback: whatever the host writes is handed straight back.
   // The FIFO is held in reset only while BOTH device files are closed.
   fifo_128 fifo_128x128 (
      .clk(bus_clk),
      .reset(!user_w_write_128_open && !user_r_read_128_open),
      .flush_en(0),
      .value_i(user_w_write_128_data),
      .enqueue_en(user_w_write_128_wren),
      .dequeue_en(user_r_read_128_rden),
      .value_o(user_r_read_128_data),
      .full(user_w_write_128_full),
      .empty(user_r_read_128_empty),
      .count(data_count_of_loopback_fifo)
   );

   assign user_r_read_128_eof = 0;

   localparam TOTAL_NUM_OF_PIXELS          = 512*512; // the image is sized as 3*512*512 , width=512 and height=512
   localparam NUM_OF_COMPONENTS_IN_A_PIXEL = 3;       // input: [R, G, B] output:[Y, U, V]
   localparam PIXEL_VALUE_RANGE            = 8;       // number of bits occupied by each of [R, G, B] / [Y, U, V]
                                                      // https://docs.microsoft.com/en-us/windows-hardware/drivers/display/yuv-rgb-data-range-conversions

   // Number of STREAM_WIDTH-bit words that make up one whole image.
   // Used for BOTH the counter width and its saturation limit so that the two
   // cannot disagree (the limit used to be TOTAL_NUM_OF_PIXELS/STREAM_WIDTH,
   // which mixes pixels with bits and stopped the counter far too early).
   localparam TOTAL_NUM_OF_128_BIT_WORDS =
              TOTAL_NUM_OF_PIXELS*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE/STREAM_WIDTH;

   // Debug counter: how many 128-bit words the host has pushed into the
   // loopback FIFO. Sized with "+1" so the saturation value itself fits.
   reg [$clog2(TOTAL_NUM_OF_128_BIT_WORDS+1)-1:0] number_of_128_bit_data_passed_through_loopback_fifo = 0; // initially nothing is received

   always @(posedge bus_clk)
     begin
        if (!user_w_write_128_open && !user_r_read_128_open)
          number_of_128_bit_data_passed_through_loopback_fifo <= 0;
        else if (user_w_write_128_wren &&
                 (number_of_128_bit_data_passed_through_loopback_fifo < TOTAL_NUM_OF_128_BIT_WORDS))
          // every write strobe delivers one 128-bit word
          number_of_128_bit_data_passed_through_loopback_fifo <=
              number_of_128_bit_data_passed_through_loopback_fifo + 1;
     end

   // Vivado built-in internal logic analyzer module instantiation
   ila_1 ila(
      .clk(bus_clk),
      .probe0(user_w_write_128_data),
      .probe1(user_r_read_128_data),
      .probe2(data_count_of_loopback_fifo),
      .probe3(user_w_write_128_full),
      .probe4(user_w_write_128_wren),
      .probe5(user_r_read_128_rden),
      .probe6(user_r_read_128_empty),
      .probe7(user_w_write_128_open),
      .probe8(user_r_read_128_open),
      .probe9(number_of_128_bit_data_passed_through_loopback_fifo)
   );

`else

   // Signals for ($floor((STREAM_WIDTH/PIXEL_VALUE_RANGE)/NUM_OF_COMPONENTS_IN_A_PIXEL) = 5) kernels
   // Since an image pixel is an unsigned 8-bit integer, its component values
   // of [R, G, B] or [Y, U, V] range from 0 to 255.
   // A pixel occupies 3*8=24 bits. Therefore, in each transaction, we can at
   // most put 5 pixels (120 bits) into /dev/xillybus_write_128, compute the
   // kernel equations for 5 pixels, and send 5 pixels out again through
   // /dev/xillybus_read_128.

   localparam NUM_OF_COMPONENTS_IN_A_PIXEL = 3;       // input: [R, G, B] , output:[Y, U, V]
   localparam PIXEL_VALUE_RANGE            = 8;       // number of bits occupied by each of [R, G, B] / [Y, U, V]
                                                      // https://docs.microsoft.com/en-us/windows-hardware/drivers/display/yuv-rgb-data-range-conversions
   localparam KERNEL_NUM                   = 5;       // 5 copies of kernel, each kernel computes equation for [R, G, B] of one single pixel
   localparam TOTAL_NUM_OF_PIXELS          = 512*512; // lena.tiff is sized as 3*512*512 , width=512 and height=512

   // Signals for two buffer FIFOs
   localparam FIFO_DEPTH = 16;

   wire [$clog2(FIFO_DEPTH):0] data_count_of_input_fifo;  // determines whether all five pixel slots have incoming data or not
   wire [$clog2(FIFO_DEPTH):0] data_count_of_output_fifo;

   //-------------------------------------------kernel----------------------------------------//

   // Read data for 5 pixels, i.e. KERNEL_NUM*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE
   // = 120 bits (the most significant 8 bits are ignored)
   wire [STREAM_WIDTH-1:0] stream_i_V_V_dout;
   wire                    stream_i_V_V_empty; // Empty condition
   // Read enable for each color component of all five pixels, active high
   wire [KERNEL_NUM*NUM_OF_COMPONENTS_IN_A_PIXEL-1:0] stream_i_V_V_read;

   // Write data for 5 pixels (120 bits; the most significant 8 bits are tied to 0 below)
   wire [STREAM_WIDTH-1:0] stream_o_V_V_din;
   wire                    stream_o_V_V_full;  // Full condition
   // Write enable for each color component of all five pixels, active high
   wire [KERNEL_NUM*NUM_OF_COMPONENTS_IN_A_PIXEL-1:0] stream_o_V_V_write;

   // High when the input FIFO count is below KERNEL_NUM and the host is not
   // currently writing.
   // NOTE(review): this compares a FIFO word count against a pixel count --
   // confirm this is the intended "tail of the image" condition.
   wire is_last_few_pixels = (data_count_of_input_fifo < KERNEL_NUM) && (!user_w_write_128_wren);

   reg  [KERNEL_NUM-1:0] ap_start = 0; // initially the HLS kernels are not started
   wire [KERNEL_NUM-1:0] ap_done;
   wire [KERNEL_NUM-1:0] ap_idle;
   wire [KERNEL_NUM-1:0] ap_ready;

   // start signals depend on whether all five pixel slots are filled or not
   always @(posedge bus_clk)
     ap_start <= (is_last_few_pixels) ? (data_count_of_input_fifo)
                                      : {KERNEL_NUM{(&stream_i_V_V_read || !stream_i_V_V_empty)}};

   // -----------------input FIFO ----------------------------------//
   // Dequeues only when every kernel asserts all of its read strobes.
   fifo_fwft_128 #( .WIDTH(STREAM_WIDTH), .SIZE(FIFO_DEPTH) ) input_pipe(
      .clk(bus_clk),
      .reset(!user_w_write_128_open && !user_r_read_128_open),
      .flush_en(0),
      .value_i(user_w_write_128_data),
      .enqueue_en(user_w_write_128_wren),
      .dequeue_en(&stream_i_V_V_read),
      .value_o(stream_i_V_V_dout),
      .full(user_w_write_128_full),
      .empty(stream_i_V_V_empty),
      .count(data_count_of_input_fifo)
   );

   // to check if xillybus has transmitted all pixels data to the input_pipe fifo
   reg [$clog2(TOTAL_NUM_OF_PIXELS)-1:0] number_of_pixels_received_by_input_fifo = 0; // initially nothing is received

   always @(posedge bus_clk)
     begin
        if (!user_w_write_128_open && !user_r_read_128_open)
          number_of_pixels_received_by_input_fifo <= 0;
        else if (user_w_write_128_wren &&
                 (number_of_pixels_received_by_input_fifo <= (TOTAL_NUM_OF_PIXELS-KERNEL_NUM)))
          // for every xillybus transaction, input fifo should receive 'KERNEL_NUM' pieces of pixels
          number_of_pixels_received_by_input_fifo <= number_of_pixels_received_by_input_fifo + KERNEL_NUM;
     end

   // use of generate loop to replicate 5 hardware copies of RGB2YUV
   // computational HLS kernels for 5 different pixels.
   // Kernel kn uses bits [kn*24 +: 24] of the stream word: the lowest byte
   // is component 0, the middle byte component 1, the highest byte component 2.
   generate
      genvar kn; // to indicate which kernel

      for (kn=0; kn<KERNEL_NUM; kn=kn+1) begin
         kernel RGB2YUV_kn (
            .ap_clk(bus_clk),
            .ap_rst(!user_w_write_128_open && !user_r_read_128_open),
            .ap_start(ap_start[kn]), // need to confirm ?
            .ap_done(ap_done[kn]),
            .ap_idle(ap_idle[kn]),
            .ap_ready(ap_ready[kn]),

            // input component R with (PIXEL_VALUE_RANGE) bits
            .stream_i0_V_V_dout(stream_i_V_V_dout[kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + PIXEL_VALUE_RANGE - 1 : kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE]),
            .stream_i0_V_V_empty_n(!stream_i_V_V_empty),
            .stream_i0_V_V_read(stream_i_V_V_read[kn*NUM_OF_COMPONENTS_IN_A_PIXEL]),

            // input component G with (PIXEL_VALUE_RANGE) bits
            .stream_i1_V_V_dout(stream_i_V_V_dout[kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)*PIXEL_VALUE_RANGE - 1 : kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + PIXEL_VALUE_RANGE]),
            .stream_i1_V_V_empty_n(!stream_i_V_V_empty),
            .stream_i1_V_V_read(stream_i_V_V_read[kn*NUM_OF_COMPONENTS_IN_A_PIXEL + 1]),

            // input component B with (PIXEL_VALUE_RANGE) bits
            .stream_i2_V_V_dout(stream_i_V_V_dout[kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE - 1 : kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)*PIXEL_VALUE_RANGE]),
            .stream_i2_V_V_empty_n(!stream_i_V_V_empty),
            .stream_i2_V_V_read(stream_i_V_V_read[kn*NUM_OF_COMPONENTS_IN_A_PIXEL + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)]),

            // output component Y with (PIXEL_VALUE_RANGE) bits
            .stream_o0_V_V_din(stream_o_V_V_din[kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + PIXEL_VALUE_RANGE - 1 : kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE]),
            .stream_o0_V_V_full_n(!stream_o_V_V_full),
            .stream_o0_V_V_write(stream_o_V_V_write[kn*NUM_OF_COMPONENTS_IN_A_PIXEL]),

            // output component U with (PIXEL_VALUE_RANGE) bits
            .stream_o1_V_V_din(stream_o_V_V_din[kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)*PIXEL_VALUE_RANGE - 1 : kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + PIXEL_VALUE_RANGE]),
            .stream_o1_V_V_full_n(!stream_o_V_V_full),
            .stream_o1_V_V_write(stream_o_V_V_write[kn*NUM_OF_COMPONENTS_IN_A_PIXEL + 1]),

            // output component V with (PIXEL_VALUE_RANGE) bits
            .stream_o2_V_V_din(stream_o_V_V_din[kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE - 1 : kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)*PIXEL_VALUE_RANGE]),
            .stream_o2_V_V_full_n(!stream_o_V_V_full),
            .stream_o2_V_V_write(stream_o_V_V_write[kn*NUM_OF_COMPONENTS_IN_A_PIXEL + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)])
         );
      end
   endgenerate

   // The bits above the last pixel slot (the most significant 8 bits) carry no data.
   assign stream_o_V_V_din[STREAM_WIDTH-1 : (KERNEL_NUM-1)*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE] = 0;

   //----------------------output FIFO-----------------------------//
   // Enqueues only when every kernel asserts all of its write strobes.
   fifo_128 #( .WIDTH(STREAM_WIDTH), .SIZE(FIFO_DEPTH) ) output_pipe (
      .clk(bus_clk),
      .reset(!user_w_write_128_open && !user_r_read_128_open),
      .flush_en(0),
      .value_i(stream_o_V_V_din),
      .enqueue_en(&stream_o_V_V_write),
      .dequeue_en(user_r_read_128_rden),
      .value_o(user_r_read_128_data),
      .full(stream_o_V_V_full),
      .empty(user_r_read_128_empty),
      .count(data_count_of_output_fifo)
   );

   // to check if xillybus has transmitted all pixels data from the output_pipe fifo
   reg [$clog2(TOTAL_NUM_OF_PIXELS)-1:0] number_of_pixels_sent_by_output_fifo = 0; // initially nothing is sent

   always @(posedge bus_clk)
     begin
        if (!user_w_write_128_open && !user_r_read_128_open)
          number_of_pixels_sent_by_output_fifo <= 0;
        else if (user_r_read_128_rden &&
                 (number_of_pixels_sent_by_output_fifo <= (TOTAL_NUM_OF_PIXELS-KERNEL_NUM)))
          // for every xillybus transaction, output fifo should send 'KERNEL_NUM' pieces of pixels
          number_of_pixels_sent_by_output_fifo <= number_of_pixels_sent_by_output_fifo + KERNEL_NUM;
     end

   assign user_r_read_128_eof = 0;

   // Vivado built-in internal logic analyzer module instantiation
   ila_0 ila(
      .clk(bus_clk),
      .probe0(user_w_write_128_data),
      .probe1(stream_i_V_V_dout),
      .probe2(stream_o_V_V_din),
      .probe3(user_r_read_128_data),
      .probe4(stream_i_V_V_read),
      .probe5(stream_o_V_V_write),
      .probe6(data_count_of_input_fifo),
      .probe7(data_count_of_output_fifo),
      .probe8(user_w_write_128_full),
      .probe9(stream_i_V_V_empty),
      .probe10(user_w_write_128_wren),
      .probe11(user_r_read_128_rden),
      .probe12(stream_o_V_V_full),
      .probe13(user_r_read_128_empty),
      .probe14(user_w_write_128_open),
      .probe15(user_r_read_128_open),
      .probe16(ap_start),
      .probe17(ap_done),
      .probe18(ap_idle),
      .probe19(ap_ready),
      .probe20(is_last_few_pixels),
      .probe21(number_of_pixels_received_by_input_fifo),
      .probe22(number_of_pixels_sent_by_output_fifo)
   );

`endif

endmodule
by Guest »
The common solution is to send the data untouched (typically 4 bytes per pixel) and let the FPGA handle the gory details. Any CPU processing is likely to turn into a bottleneck.
idea of using plain disk files and cat / dd for interface with Xillybus.
`define LOOPBACK 1

// Top-level Xillybus demo module.
//
// Instantiates the Xillybus IP core and attaches user logic to its streams:
//   * /dev/xillybus_mem_128            : a small inferred RAM
//   * /dev/xillybus_{write,read}_128   : either a plain loopback FIFO (when
//     LOOPBACK is defined) or an input FIFO -> KERNEL_NUM RGB2YUV kernels ->
//     output FIFO pipeline (when it is not).
// An ILA core is attached in both configurations for debugging.
//
// NOTE(review): the /dev/xillybus_{read,write}_256 wires are declared and
// connected to the core but have no user logic driving them in this module.
module xillydemo(PCIE_PERST_B_LS, PCIE_REFCLK_N, PCIE_REFCLK_P, PCIE_RX_N, PCIE_RX_P,
                 GPIO_LED, PCIE_TX_N, PCIE_TX_P);

   localparam STREAM_WIDTH = 128;

   input        PCIE_PERST_B_LS;
   input        PCIE_REFCLK_N;
   input        PCIE_REFCLK_P;
   input  [7:0] PCIE_RX_N;
   input  [7:0] PCIE_RX_P;
   output [3:0] GPIO_LED;
   output [7:0] PCIE_TX_N;
   output [7:0] PCIE_TX_P;

   // Clock and quiesce
   wire bus_clk;
   wire quiesce;

   // Memory array
   // NOTE(review): entries are 8 bits wide while user_w_mem_128_data is
   // STREAM_WIDTH bits wide, and the address bus below is wider than needed
   // for 32 entries -- confirm against the IP core configuration.
   reg [7:0] demoarray[0:31];

   // Wires related to /dev/xillybus_mem_128
   wire                            user_r_mem_128_rden;
   wire                            user_r_mem_128_empty;
   reg  [STREAM_WIDTH-1:0]         user_r_mem_128_data;
   wire                            user_r_mem_128_eof;
   wire                            user_r_mem_128_open;
   wire                            user_w_mem_128_wren;
   wire                            user_w_mem_128_full;
   wire [STREAM_WIDTH-1:0]         user_w_mem_128_data;
   wire                            user_w_mem_128_open;
   wire [$clog2(STREAM_WIDTH)-1:0] user_mem_128_addr;
   wire                            user_mem_128_addr_update;

   // Wires related to /dev/xillybus_read_128
   wire                    user_r_read_128_rden;
   wire                    user_r_read_128_empty;
   wire [STREAM_WIDTH-1:0] user_r_read_128_data;
   wire                    user_r_read_128_eof;
   wire                    user_r_read_128_open;

   // Wires related to /dev/xillybus_write_128
   wire                    user_w_write_128_wren;
   wire                    user_w_write_128_full;
   wire [STREAM_WIDTH-1:0] user_w_write_128_data;
   wire                    user_w_write_128_open;

   // Wires related to /dev/xillybus_read_256
   wire                           user_r_read_256_rden;
   wire                           user_r_read_256_empty;
   wire [(STREAM_WIDTH << 1)-1:0] user_r_read_256_data;
   wire                           user_r_read_256_eof;
   wire                           user_r_read_256_open;

   // Wires related to /dev/xillybus_write_256
   wire                           user_w_write_256_wren;
   wire                           user_w_write_256_full;
   wire [(STREAM_WIDTH << 1)-1:0] user_w_write_256_data;
   wire                           user_w_write_256_open;

   xillybus xillybus_ins (

      // Ports related to /dev/xillybus_mem_128
      // FPGA to CPU signals:
      .user_r_mem_128_rden(user_r_mem_128_rden),
      .user_r_mem_128_empty(user_r_mem_128_empty),
      .user_r_mem_128_data(user_r_mem_128_data),
      .user_r_mem_128_eof(user_r_mem_128_eof),
      .user_r_mem_128_open(user_r_mem_128_open),

      // CPU to FPGA signals:
      .user_w_mem_128_wren(user_w_mem_128_wren),
      .user_w_mem_128_full(user_w_mem_128_full),
      .user_w_mem_128_data(user_w_mem_128_data),
      .user_w_mem_128_open(user_w_mem_128_open),

      // Address signals:
      .user_mem_128_addr(user_mem_128_addr),
      .user_mem_128_addr_update(user_mem_128_addr_update),

      // Ports related to /dev/xillybus_read_256
      // FPGA to CPU signals:
      .user_r_read_256_rden(user_r_read_256_rden),
      .user_r_read_256_empty(user_r_read_256_empty),
      .user_r_read_256_data(user_r_read_256_data),
      .user_r_read_256_eof(user_r_read_256_eof),
      .user_r_read_256_open(user_r_read_256_open),

      // Ports related to /dev/xillybus_write_256
      // CPU to FPGA signals:
      .user_w_write_256_wren(user_w_write_256_wren),
      .user_w_write_256_full(user_w_write_256_full),
      .user_w_write_256_data(user_w_write_256_data),
      .user_w_write_256_open(user_w_write_256_open),

      // Ports related to /dev/xillybus_read_128
      // FPGA to CPU signals:
      .user_r_read_128_rden(user_r_read_128_rden),
      .user_r_read_128_empty(user_r_read_128_empty),
      .user_r_read_128_data(user_r_read_128_data),
      .user_r_read_128_eof(user_r_read_128_eof),
      .user_r_read_128_open(user_r_read_128_open),

      // Ports related to /dev/xillybus_write_128
      // CPU to FPGA signals:
      .user_w_write_128_wren(user_w_write_128_wren),
      .user_w_write_128_full(user_w_write_128_full),
      .user_w_write_128_data(user_w_write_128_data),
      .user_w_write_128_open(user_w_write_128_open),

      // Signals to top level
      .PCIE_PERST_B_LS(PCIE_PERST_B_LS),
      .PCIE_REFCLK_N(PCIE_REFCLK_N),
      .PCIE_REFCLK_P(PCIE_REFCLK_P),
      .PCIE_RX_N(PCIE_RX_N),
      .PCIE_RX_P(PCIE_RX_P),
      .GPIO_LED(GPIO_LED),
      .PCIE_TX_N(PCIE_TX_N),
      .PCIE_TX_P(PCIE_TX_P),
      .bus_clk(bus_clk),
      .quiesce(quiesce)
   );

   // A simple inferred RAM
   always @(posedge bus_clk)
     begin
        if (user_w_mem_128_wren)
          demoarray[user_mem_128_addr] <= user_w_mem_128_data;

        if (user_r_mem_128_rden)
          user_r_mem_128_data <= demoarray[user_mem_128_addr];
     end

   assign user_r_mem_128_empty = 0;
   assign user_r_mem_128_eof   = 0;
   assign user_w_mem_128_full  = 0;

`ifdef LOOPBACK

   wire [$clog2(STREAM_WIDTH)-1:0] data_count_of_loopback_fifo;

   // 128-bit loopback: whatever the host writes is handed straight back.
   // The FIFO is held in reset only while BOTH device files are closed.
   fifo_128 fifo_128x128 (
      .clk(bus_clk),
      .reset(!user_w_write_128_open && !user_r_read_128_open),
      .flush_en(0),
      .value_i(user_w_write_128_data),
      .enqueue_en(user_w_write_128_wren),
      .dequeue_en(user_r_read_128_rden),
      .value_o(user_r_read_128_data),
      .full(user_w_write_128_full),
      .empty(user_r_read_128_empty),
      .count(data_count_of_loopback_fifo)
   );

   assign user_r_read_128_eof = 0;

   localparam TOTAL_NUM_OF_PIXELS          = 512*512; // the image is sized as 3*512*512 , width=512 and height=512
   localparam NUM_OF_COMPONENTS_IN_A_PIXEL = 3;       // input: [R, G, B] output:[Y, U, V]
   localparam PIXEL_VALUE_RANGE            = 8;       // number of bits occupied by each colour component

   // Number of STREAM_WIDTH-bit words that make up one whole image.
   // The counter used to be sized and limited by TOTAL_NUM_OF_PIXELS/STREAM_WIDTH
   // (= 2048): that mixes pixels with bits, and the resulting 11-bit register
   // could never hold 2048, so the "< 2048" guard was always true and the
   // counter silently wrapped instead of saturating.
   localparam TOTAL_NUM_OF_128_BIT_WORDS =
              TOTAL_NUM_OF_PIXELS*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE/STREAM_WIDTH;

   // Debug counter: how many 128-bit words the host has pushed into the
   // loopback FIFO. Sized with "+1" so the saturation value itself fits.
   // NOTE(review): this register is now 16 bits wide (was 11); the width of
   // probe9 in the ila_1 IP configuration has to match.
   reg [$clog2(TOTAL_NUM_OF_128_BIT_WORDS+1)-1:0] number_of_128_bit_data_passed_through_loopback_fifo = 0; // initially nothing is received

   always @(posedge bus_clk)
     begin
        if (!user_w_write_128_open && !user_r_read_128_open)
          number_of_128_bit_data_passed_through_loopback_fifo <= 0;
        else if (user_w_write_128_wren &&
                 (number_of_128_bit_data_passed_through_loopback_fifo < TOTAL_NUM_OF_128_BIT_WORDS))
          // every write strobe delivers one 128-bit word
          number_of_128_bit_data_passed_through_loopback_fifo <=
              number_of_128_bit_data_passed_through_loopback_fifo + 1;
     end

   // Vivado built-in internal logic analyzer module instantiation
   ila_1 ila(
      .clk(bus_clk),
      .probe0(user_w_write_128_data),
      .probe1(user_r_read_128_data),
      .probe2(data_count_of_loopback_fifo),
      .probe3(user_w_write_128_full),
      .probe4(user_w_write_128_wren),
      .probe5(user_r_read_128_rden),
      .probe6(user_r_read_128_empty),
      .probe7(user_w_write_128_open),
      .probe8(user_r_read_128_open),
      .probe9(number_of_128_bit_data_passed_through_loopback_fifo)
   );

`else

   // Signals for ($floor((STREAM_WIDTH/PIXEL_VALUE_RANGE)/NUM_OF_COMPONENTS_IN_A_PIXEL) = 5) kernels
   // Since an image pixel is an unsigned 8-bit integer, its component values
   // of [R, G, B] or [Y, U, V] range from 0 to 255.
   // A pixel occupies 3*8=24 bits. Therefore, in each transaction, we can at
   // most put 5 pixels (120 bits) into /dev/xillybus_write_128, compute the
   // kernel equations for 5 pixels, and send 5 pixels out again through
   // /dev/xillybus_read_128.

   localparam NUM_OF_COMPONENTS_IN_A_PIXEL = 3;       // input: [R, G, B] , output:[Y, U, V]
   localparam PIXEL_VALUE_RANGE            = 8;       // number of bits occupied by each of [R, G, B] / [Y, U, V]
                                                      // https://docs.microsoft.com/en-us/windows-hardware/drivers/display/yuv-rgb-data-range-conversions
   localparam KERNEL_NUM                   = 5;       // 5 copies of kernel, each kernel computes equation for [R, G, B] of one single pixel
   localparam TOTAL_NUM_OF_PIXELS          = 512*512; // lena.tiff is sized as 3*512*512 , width=512 and height=512

   // Signals for two buffer FIFOs
   localparam FIFO_DEPTH = 16;

   wire [$clog2(FIFO_DEPTH):0] data_count_of_input_fifo;  // determines whether all five pixel slots have incoming data or not
   wire [$clog2(FIFO_DEPTH):0] data_count_of_output_fifo;

   //-------------------------------------------kernel----------------------------------------//

   // Read data for 5 pixels, i.e. KERNEL_NUM*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE
   // = 120 bits (the most significant 8 bits are ignored)
   wire [STREAM_WIDTH-1:0] stream_i_V_V_dout;
   wire                    stream_i_V_V_empty; // Empty condition
   // Read enable for each color component of all five pixels, active high
   wire [KERNEL_NUM*NUM_OF_COMPONENTS_IN_A_PIXEL-1:0] stream_i_V_V_read;

   // Write data for 5 pixels (120 bits; the most significant 8 bits are tied to 0 below)
   wire [STREAM_WIDTH-1:0] stream_o_V_V_din;
   wire                    stream_o_V_V_full;  // Full condition
   // Write enable for each color component of all five pixels, active high
   wire [KERNEL_NUM*NUM_OF_COMPONENTS_IN_A_PIXEL-1:0] stream_o_V_V_write;

   // High when the input FIFO count is below KERNEL_NUM and the host is not
   // currently writing.
   // NOTE(review): this compares a FIFO word count against a pixel count --
   // confirm this is the intended "tail of the image" condition.
   wire is_last_few_pixels = (data_count_of_input_fifo < KERNEL_NUM) && (!user_w_write_128_wren);

   reg  [KERNEL_NUM-1:0] ap_start = 0; // initially the HLS kernels are not started
   wire [KERNEL_NUM-1:0] ap_done;
   wire [KERNEL_NUM-1:0] ap_idle;
   wire [KERNEL_NUM-1:0] ap_ready;

   // start signals depend on whether all five pixel slots are filled or not
   always @(posedge bus_clk)
     ap_start <= (is_last_few_pixels) ? (data_count_of_input_fifo)
                                      : {KERNEL_NUM{(&stream_i_V_V_read || !stream_i_V_V_empty)}};

   // -----------------input FIFO ----------------------------------//
   // Dequeues only when every kernel asserts all of its read strobes.
   fifo_fwft_128 #( .WIDTH(STREAM_WIDTH), .SIZE(FIFO_DEPTH) ) input_pipe(
      .clk(bus_clk),
      .reset(!user_w_write_128_open && !user_r_read_128_open),
      .flush_en(0),
      .value_i(user_w_write_128_data),
      .enqueue_en(user_w_write_128_wren),
      .dequeue_en(&stream_i_V_V_read),
      .value_o(stream_i_V_V_dout),
      .full(user_w_write_128_full),
      .empty(stream_i_V_V_empty),
      .count(data_count_of_input_fifo)
   );

   // to check if xillybus has transmitted all pixels data to the input_pipe fifo
   reg [$clog2(TOTAL_NUM_OF_PIXELS)-1:0] number_of_pixels_received_by_input_fifo = 0; // initially nothing is received

   always @(posedge bus_clk)
     begin
        if (!user_w_write_128_open && !user_r_read_128_open)
          number_of_pixels_received_by_input_fifo <= 0;
        else if (user_w_write_128_wren &&
                 (number_of_pixels_received_by_input_fifo <= (TOTAL_NUM_OF_PIXELS-KERNEL_NUM)))
          // for every xillybus transaction, input fifo should receive 'KERNEL_NUM' pieces of pixels
          number_of_pixels_received_by_input_fifo <= number_of_pixels_received_by_input_fifo + KERNEL_NUM;
     end

   // use of generate loop to replicate 5 hardware copies of RGB2YUV
   // computational HLS kernels for 5 different pixels.
   // Kernel kn uses bits [kn*24 +: 24] of the stream word: the lowest byte
   // is component 0, the middle byte component 1, the highest byte component 2.
   generate
      genvar kn; // to indicate which kernel

      for (kn=0; kn<KERNEL_NUM; kn=kn+1) begin
         kernel RGB2YUV_kn (
            .ap_clk(bus_clk),
            .ap_rst(!user_w_write_128_open && !user_r_read_128_open),
            .ap_start(ap_start[kn]), // need to confirm ?
            .ap_done(ap_done[kn]),
            .ap_idle(ap_idle[kn]),
            .ap_ready(ap_ready[kn]),

            // input component R with (PIXEL_VALUE_RANGE) bits
            .stream_i0_V_V_dout(stream_i_V_V_dout[kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + PIXEL_VALUE_RANGE - 1 : kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE]),
            .stream_i0_V_V_empty_n(!stream_i_V_V_empty),
            .stream_i0_V_V_read(stream_i_V_V_read[kn*NUM_OF_COMPONENTS_IN_A_PIXEL]),

            // input component G with (PIXEL_VALUE_RANGE) bits
            .stream_i1_V_V_dout(stream_i_V_V_dout[kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)*PIXEL_VALUE_RANGE - 1 : kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + PIXEL_VALUE_RANGE]),
            .stream_i1_V_V_empty_n(!stream_i_V_V_empty),
            .stream_i1_V_V_read(stream_i_V_V_read[kn*NUM_OF_COMPONENTS_IN_A_PIXEL + 1]),

            // input component B with (PIXEL_VALUE_RANGE) bits
            .stream_i2_V_V_dout(stream_i_V_V_dout[kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE - 1 : kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)*PIXEL_VALUE_RANGE]),
            .stream_i2_V_V_empty_n(!stream_i_V_V_empty),
            .stream_i2_V_V_read(stream_i_V_V_read[kn*NUM_OF_COMPONENTS_IN_A_PIXEL + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)]),

            // output component Y with (PIXEL_VALUE_RANGE) bits
            .stream_o0_V_V_din(stream_o_V_V_din[kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + PIXEL_VALUE_RANGE - 1 : kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE]),
            .stream_o0_V_V_full_n(!stream_o_V_V_full),
            .stream_o0_V_V_write(stream_o_V_V_write[kn*NUM_OF_COMPONENTS_IN_A_PIXEL]),

            // output component U with (PIXEL_VALUE_RANGE) bits
            .stream_o1_V_V_din(stream_o_V_V_din[kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)*PIXEL_VALUE_RANGE - 1 : kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + PIXEL_VALUE_RANGE]),
            .stream_o1_V_V_full_n(!stream_o_V_V_full),
            .stream_o1_V_V_write(stream_o_V_V_write[kn*NUM_OF_COMPONENTS_IN_A_PIXEL + 1]),

            // output component V with (PIXEL_VALUE_RANGE) bits
            .stream_o2_V_V_din(stream_o_V_V_din[kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE - 1 : kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)*PIXEL_VALUE_RANGE]),
            .stream_o2_V_V_full_n(!stream_o_V_V_full),
            .stream_o2_V_V_write(stream_o_V_V_write[kn*NUM_OF_COMPONENTS_IN_A_PIXEL + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)])
         );
      end
   endgenerate

   // The bits above the last pixel slot (the most significant 8 bits) carry no data.
   assign stream_o_V_V_din[STREAM_WIDTH-1 : (KERNEL_NUM-1)*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE] = 0;

   //----------------------output FIFO-----------------------------//
   // Enqueues only when every kernel asserts all of its write strobes.
   fifo_128 #( .WIDTH(STREAM_WIDTH), .SIZE(FIFO_DEPTH) ) output_pipe (
      .clk(bus_clk),
      .reset(!user_w_write_128_open && !user_r_read_128_open),
      .flush_en(0),
      .value_i(stream_o_V_V_din),
      .enqueue_en(&stream_o_V_V_write),
      .dequeue_en(user_r_read_128_rden),
      .value_o(user_r_read_128_data),
      .full(stream_o_V_V_full),
      .empty(user_r_read_128_empty),
      .count(data_count_of_output_fifo)
   );

   // to check if xillybus has transmitted all pixels data from the output_pipe fifo
   reg [$clog2(TOTAL_NUM_OF_PIXELS)-1:0] number_of_pixels_sent_by_output_fifo = 0; // initially nothing is sent

   always @(posedge bus_clk)
     begin
        if (!user_w_write_128_open && !user_r_read_128_open)
          number_of_pixels_sent_by_output_fifo <= 0;
        else if (user_r_read_128_rden &&
                 (number_of_pixels_sent_by_output_fifo <= (TOTAL_NUM_OF_PIXELS-KERNEL_NUM)))
          // for every xillybus transaction, output fifo should send 'KERNEL_NUM' pieces of pixels
          number_of_pixels_sent_by_output_fifo <= number_of_pixels_sent_by_output_fifo + KERNEL_NUM;
     end

   assign user_r_read_128_eof = 0;

   // Vivado built-in internal logic analyzer module instantiation
   ila_0 ila(
      .clk(bus_clk),
      .probe0(user_w_write_128_data),
      .probe1(stream_i_V_V_dout),
      .probe2(stream_o_V_V_din),
      .probe3(user_r_read_128_data),
      .probe4(stream_i_V_V_read),
      .probe5(stream_o_V_V_write),
      .probe6(data_count_of_input_fifo),
      .probe7(data_count_of_output_fifo),
      .probe8(user_w_write_128_full),
      .probe9(stream_i_V_V_empty),
      .probe10(user_w_write_128_wren),
      .probe11(user_r_read_128_rden),
      .probe12(stream_o_V_V_full),
      .probe13(user_r_read_128_empty),
      .probe14(user_w_write_128_open),
      .probe15(user_r_read_128_open),
      .probe16(ap_start),
      .probe17(ap_done),
      .probe18(ap_idle),
      .probe19(ap_ready),
      .probe20(is_last_few_pixels),
      .probe21(number_of_pixels_received_by_input_fifo),
      .probe22(number_of_pixels_sent_by_output_fifo)
   );

`endif

endmodule
// g++ -g -pedantic -Wall -Werror -Wextra -fsanitize=address test.cpp -o test
//
// Loopback smoke test for a pair of 128-bit Xillybus streams: one process
// pushes a synthetic RGB image into /dev/xillybus_write_128 while another
// pulls the same number of bytes back from /dev/xillybus_read_128.
#include <errno.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#include <fstream>
#include <iostream>
#include <vector>

using namespace std;

const unsigned int image_width = 512;
const unsigned int image_height = 512;
const unsigned int NUM_OF_COMPONENTS_IN_A_PIXEL = 3; // input: [R, G, B] output:[Y, U, V]

/// Writes exactly `len` bytes from `buf` to `fd`, retrying on short writes
/// and on EINTR. Terminates the process on any other error or on a
/// zero-length write result. Does nothing when `len` <= 0.
void allwrite(int fd, unsigned char *buf, int len) {
    int sent = 0;

    while (sent < len) {
        const ssize_t rc = write(fd, buf + sent, len - sent);

        if (rc < 0) {
            if (errno == EINTR)
                continue; // interrupted by a signal: just retry
            perror("allwrite() failed to write");
            exit(1);
        }

        if (rc == 0) {
            fprintf(stderr, "Reached write EOF (?!)\n");
            exit(1);
        }

        sent += static_cast<int>(rc);
    }
}

/// Reads exactly `len` bytes from `fd` into `buf`, retrying on short reads
/// and on EINTR. Terminates the process on any other error or on EOF.
void allread(int fd, unsigned char *buf, int len) {
    int recvd = 0;

    while (recvd < len) {
        const ssize_t rc = read(fd, buf + recvd, len - recvd);

        if (rc < 0) {
            if (errno == EINTR)
                continue; // interrupted by a signal: just retry
            perror("allread() failed to read");
            exit(1);
        }

        if (rc == 0) {
            fprintf(stderr, "Reached read EOF (?!)\n");
            exit(1);
        }

        recvd += static_cast<int>(rc);
    }
}

/// Issues a zero-length write() on `fd`, retrying on EINTR.
/// This must call write() directly: allwrite(fd, NULL, 0) returns without
/// ever invoking write(), so it cannot flush anything.
static void flush_write_stream(int fd) {
    while (write(fd, nullptr, 0) < 0) {
        if (errno == EINTR)
            continue;
        perror("failed to flush the write stream");
        exit(1);
    }
}

int main() {
    const int stream_len =
        static_cast<int>(image_width * image_height * NUM_OF_COMPONENTS_IN_A_PIXEL);

    // Check each open() on its own so perror() reports the right errno.
    const int fdr = open("/dev/xillybus_read_128", O_RDONLY);
    if (fdr < 0) {
        perror("Failed to open /dev/xillybus_read_128");
        exit(1);
    }

    const int fdw = open("/dev/xillybus_write_128", O_WRONLY);
    if (fdw < 0) {
        perror("Failed to open /dev/xillybus_write_128");
        exit(1);
    }

    // Writing the whole image before reading anything back can block forever
    // once the buffering between the two streams is full, so the writer runs
    // in its own process while this one drains the read stream.
    fflush(stdout);
    const pid_t writer = fork();
    if (writer < 0) {
        perror("fork() failed");
        exit(1);
    }

    if (writer == 0) {
        // Child: producer.
        close(fdr);

        // Heap-allocated: two image-sized arrays are too large to put on the stack safely.
        vector<uint8_t> rgb_stream(static_cast<size_t>(stream_len), static_cast<uint8_t>(1)); // send all ones to fpga

        allwrite(fdw, rgb_stream.data(), stream_len);
        flush_write_stream(fdw);
        printf("after allwrite() \n");

        close(fdw);
        return 0;
    }

    // Parent: consumer.
    close(fdw);

    vector<uint8_t> yuv_stream(static_cast<size_t>(stream_len));

    printf("before allread() \n");
    allread(fdr, yuv_stream.data(), stream_len);
    printf("after allread() \n");

    close(fdr);

    int status = 0;
    if (waitpid(writer, &status, 0) < 0) {
        perror("waitpid() failed");
        exit(1);
    }

    return (WIFEXITED(status) && WEXITSTATUS(status) == 0) ? 0 : 1;
}
strace and fork affect the results significantly
// g++ -g -pedantic -Wall -Werror -Wextra -fsanitize=address test.cpp -o test
#include <unistd.h>
#include <fcntl.h>
#include <iostream>
#include <fstream>
#include <errno.h>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <vector>

using namespace std;

const unsigned int image_width = 510;
const unsigned int image_height = 510;
const unsigned int NUM_OF_COMPONENTS_IN_A_PIXEL = 3; // input: [R, G, B] output:[Y, U, V]

// Writes exactly `len` bytes from `buf` to `fd`, retrying on short writes
// and on EINTR. Terminates the process with exit(1) on any other write()
// error or if write() returns 0. With len <= 0 this function does nothing
// (write() is never called), so it cannot be used to request a flush.
void allwrite(int fd, unsigned char *buf, int len) {
  int sent = 0;

  while (sent < len) {
    const ssize_t rc = write(fd, buf + sent, static_cast<size_t>(len - sent));

    if (rc < 0) {
      if (errno == EINTR)
        continue; // interrupted by a signal: retry the same chunk

      perror("allwrite() failed to write");
      exit(1);
    }

    if (rc == 0) {
      fprintf(stderr, "Reached write EOF (?!)\n");
      exit(1);
    }

    sent += static_cast<int>(rc);
  }
}

// Reads exactly `len` bytes from `fd` into `buf`, retrying on short reads
// and on EINTR. Terminates the process with exit(1) on any other read()
// error or if read() returns 0 (EOF) before `len` bytes have arrived.
void allread(int fd, unsigned char *buf, int len) {
  int recvd = 0;

  while (recvd < len) {
    const ssize_t rc = read(fd, buf + recvd, static_cast<size_t>(len - recvd));

    if (rc < 0) {
      if (errno == EINTR)
        continue; // interrupted by a signal: retry the same chunk

      perror("allread() failed to read");
      exit(1);
    }

    if (rc == 0) {
      fprintf(stderr, "Reached read EOF (?!)\n");
      exit(1);
    }

    recvd += static_cast<int>(rc);
  }
}

// Sends one image-sized buffer of constant bytes (all zeroes) to the
// Xillybus write device file, then reads back the same number of bytes from
// the Xillybus read device file.
int main() {
  const int fdr = open("/dev/xillybus_read_128", O_RDONLY);
  const int fdw = open("/dev/xillybus_write_128", O_WRONLY);

  if ((fdr < 0) || (fdw < 0)) {
    perror("Failed to open Xillybus device file(s)");
    exit(1);
  }

  // Total payload in bytes: one byte per colour component of every pixel.
  // NOTE(review): 510*510*3 = 780300 bytes is not a multiple of 16; if the
  // "_128" device files are 128-bit (16-byte) wide streams, the trailing
  // partial word may never be delivered and allread() could block -- confirm.
  const unsigned int stream_size =
      image_width * image_height * NUM_OF_COMPONENTS_IN_A_PIXEL;

  // Heap-allocated so that the two ~762 KiB buffers do not live on the stack.
  std::vector<uint8_t> rgb_stream(stream_size, 0); // send all zeroes to fpga

  allwrite(fdw, rgb_stream.data(), static_cast<int>(stream_size));

  // Flush the write stream. This has to be a direct zero-length write():
  // allwrite(fdw, NULL, 0) returns without ever calling write().
  // NOTE(review): per the Xillybus host programming guide a zero-length
  // write() requests a flush on asynchronous streams -- confirm for this setup.
  ssize_t flush_rc;
  do {
    flush_rc = write(fdw, NULL, 0);
  } while ((flush_rc < 0) && (errno == EINTR));

  if (flush_rc < 0) {
    perror("Failed to flush the write stream");
    exit(1);
  }

  printf("after allwrite() \n");
  close(fdw);

  // Receives the same number of bytes that were sent.
  std::vector<uint8_t> yuv_stream(stream_size);

  printf("before allread() \n");
  allread(fdr, yuv_stream.data(), static_cast<int>(stream_size));
  printf("after allread() \n");

  close(fdr);
  return 0;
}
Top