problem with user_w_write_128_wren

Questions and discussions about the Xillybus IP core and drivers

Re: problem with user_w_write_128_wren

Postby kevin »

The code gets stuck forever at https://gist.github.com/promach/9d185d35a6e6db0da10992a19c36f754#file-host-cpp-L273

Why doesn't the read() function return? Did I use fork() incorrectly?

before read()
after read()
num_of_pixels_received = 415
before read()
after read()
num_of_pixels_received = 420
before read()
after read()
num_of_pixels_received = 425
before read()
after read()
num_of_pixels_received = 430
before read()
after read()
num_of_pixels_received = 435
before read()
after read()
num_of_pixels_received = 440
before read()
*** Write process enters waiting status .....


Code: Select all
// g++ -g -pedantic -Wall -Werror -Wextra -fsanitize=address host.cpp -o host `pkg-config --cflags --libs opencv`

#include <opencv2/core/core.hpp>
//#include <opencv2/imgcodecs/imgcodecs.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <unistd.h>
#include <fcntl.h>
#include <iostream>
#include <fstream>      // std::ifstream, std::ofstream
#include <string>
#include <sys/wait.h>
#include <errno.h>
#include <cmath>

using namespace cv;
using namespace std;

//#define LOOPBACK 1
#define RGB2YUV 1

unsigned int image_width;
unsigned int image_height;

const unsigned int CHNL_NUM = 3;
const unsigned int RED_CHNL = 2;
const unsigned int GREEN_CHNL = 1;
const unsigned int BLUE_CHNL = 0;

const unsigned int STREAM_WIDTH = 128;
const unsigned int NUM_OF_BITS_PER_BYTE = 8;

const unsigned int PIXEL_VALUE_RANGE = 8; // number of bits occupied by [R, G, B] or [Y, U, V] respectively (8-bit unsigned integer for each components) , https://docs.microsoft.com/en-us/windows-hardware/drivers/display/yuv-rgb-data-range-conversions
const unsigned int NUM_OF_COMPONENTS_IN_A_PIXEL = 3; // input: [R, G, B]     output:[Y, U, V]

const unsigned int PIXEL_NUM_THAT_FITS_STREAM_WIDTH = 5;  // 128-bit stream can at most fits 5 pixels ((PIXEL_NUM_THAT_FITS_STREAM_WIDTH*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE) bits = 120 bits), each pixels contains R, G, B which are encoded in 8 bits for each of the three color components

// One input pixel: 8-bit red, green and blue components.
struct RGB_packet{
  uint8_t R,G,B;
};

// One output pixel: 8-bit Y, U and V components.
struct YUV_packet{
  uint8_t Y,U,V;
};


/*
   Software reference for the RGB -> YUV conversion, using the integer
   "full swing for BT.601" coefficients from
   https://en.wikipedia.org/wiki/YUV#Full_swing_for_BT.601

   rgb_input: the pixel to convert.

   Returns a pointer to a single malloc()'ed YUV_packet holding the result.
   The caller owns it and should free() it. The process exits if the
   allocation fails, so the returned pointer is never NULL.
*/
struct YUV_packet* rgb2yuv(struct RGB_packet rgb_input)  // convert rgb to yuv
{
   const int R = rgb_input.R;
   const int G = rgb_input.G;
   const int B = rgb_input.B;

   // Only one packet is returned, so only one packet is allocated.
   struct YUV_packet *yuv_result = (struct YUV_packet *)malloc(sizeof(struct YUV_packet));

   if (!yuv_result) {
      perror("rgb2yuv() failed to allocate memory");
      exit(1);
   }

   // Weighted sums scaled by 256.
   int Y_temp =  77*R + 150*G +  29*B;
   int U_temp = -43*R -  84*G + 127*B;
   int V_temp = 127*R - 106*G -  21*B;

   // Add half of the scale factor for rounding, then divide by 256.
   // U_temp and V_temp may be negative here; this relies on >> being an
   // arithmetic shift for negative ints.
   Y_temp = (Y_temp + 128) >> 8;
   U_temp = (U_temp + 128) >> 8;
   V_temp = (V_temp + 128) >> 8;

   // U and V are offset by 128 so that they fit an unsigned byte.
   yuv_result->Y = Y_temp;
   yuv_result->U = U_temp + 128;
   yuv_result->V = V_temp + 128;

   return yuv_result;
}

/*
   write() is allowed to accept fewer bytes than it was asked to, so
   allwrite() keeps calling it until all len bytes of buf have been
   written to fd.

   A write() interrupted by a signal (EINTR) is simply retried. Any other
   write() error, or a write() that returns 0, prints a message and
   terminates the process with exit status 1.

   Returns the number of bytes written. When the function returns at all,
   that is len (or 0 if len was not positive).
*/

int allwrite(int fd, unsigned char *buf, int len) {
  int total = 0;

  while (total < len) {
    const int n = write(fd, buf + total, len - total);//fsync(fd);

    if (n > 0) {
      total += n;
      continue;
    }

    if (n == 0) {
      fprintf(stderr, "Reached write EOF (?!)\n");
      exit(1);
    }

    // n < 0 from here on: retry if interrupted, give up otherwise
    if (errno == EINTR)
      continue;

    perror("allwrite() failed to write");
    exit(1);
  }

  return total;
}

int main(int argc, char *argv[]) {

  int fdr, fdw, rd, wr, rd_donebytes, wr_donebytes;
  int wait_status; // for wait()
  uint8_t *wr_buf, *rd_buf;
  pid_t pid;
  struct RGB_packet *tologic;
  struct YUV_packet *fromlogic;

  fdr = open("/dev/xillybus_read_128", O_RDONLY);  // will change to /dev/xillybus_read_128
  fdw = open("/dev/xillybus_write_128", O_WRONLY); // will change to /dev/xillybus_write_128

  if ((fdr < 0) || (fdw < 0)) {
    perror("Failed to open Xillybus device file(s)");
    exit(1);
  }

  // READ in an image file

  String imageName( "lena512color.tiff" ); // by default

  if( argc > 1)
  {
   imageName = argv[1];
  }

  Mat image;

  image = imread( imageName, IMREAD_COLOR ); // Read the file

  if( image.empty() )                      // Check for invalid input
  {
   cout <<  "Could not open or find the image" << std::endl ;
   return -1;
  }

  else
  {
   image_width = image.size().width;
   image_height = image.size().height;
  }

  namedWindow( "Original Image", WINDOW_AUTOSIZE );
  imshow( "Original Image", image );
   
  Mat rgbchannel[CHNL_NUM];
  // The actual splitting.
  split(image, rgbchannel);
   
  namedWindow("Red", WINDOW_AUTOSIZE);
  imshow("Red", rgbchannel[RED_CHNL]);
   
  namedWindow("Green", WINDOW_AUTOSIZE);
  imshow("Green", rgbchannel[GREEN_CHNL]);
   
  namedWindow("Blue", WINDOW_AUTOSIZE);
  imshow("Blue", rgbchannel[BLUE_CHNL]);

  waitKey(0);  // see all three split channels before feeding in the channel data to xillybus/RIFFA for hardware computation

  vector<RGB_packet> vTo(image_width * image_height);  // lena.tiff is sized as 3*512*512

  tologic = vTo.data();

  if (!tologic) {
   fprintf(stderr, "Failed to allocate memory\n");
   exit(1);
  }

  for(unsigned int pixel_index = 0; pixel_index < (image_width * image_height); pixel_index++)
  {
   tologic[pixel_index].R = *(rgbchannel[RED_CHNL].data + pixel_index);
   tologic[pixel_index].G = *(rgbchannel[GREEN_CHNL].data + pixel_index);
   tologic[pixel_index].B = *(rgbchannel[BLUE_CHNL].data + pixel_index);
  }

  pid = fork();

  if (pid < 0) {
    perror("Failed to fork()");
    exit(1);
  }

  if (pid) {
   close(fdr);

    wr_donebytes = 0;  // this variable includes the empty 8 bits for the MSB
   unsigned int num_of_pixels_sent = 0; // this is actual pixels number already sent, does not include the empty 8 bits
   //unsigned int if_index = 0;
   unsigned int rgb_stream_index = 0;
   uint8_t rgb_stream[STREAM_WIDTH/NUM_OF_BITS_PER_BYTE + 1];  // could accomodate 5 pixels

   while (num_of_pixels_sent < image_width * image_height)
   {
     if(((image_width * image_height)-num_of_pixels_sent) >= PIXEL_NUM_THAT_FITS_STREAM_WIDTH)
     {
      // arrange the five pixels in the format as in https://i.imgur.com/mdJwk7J.png
      //if_index++; //printf("if_index = %d\n\r", if_index);
      //if(if_index == 3) break;

      for(rgb_stream_index = 0; (rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1)<((STREAM_WIDTH/NUM_OF_BITS_PER_BYTE)-1); rgb_stream_index=rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL)
      {
         rgb_stream[rgb_stream_index] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].R;
         rgb_stream[rgb_stream_index+1] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].G;
         rgb_stream[rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].B;
      }

      rgb_stream[STREAM_WIDTH/NUM_OF_BITS_PER_BYTE] = '\0';  // however, this NULL character is not sent across write()
      rgb_stream[(STREAM_WIDTH/NUM_OF_BITS_PER_BYTE)-1] = 0;  // remember that the eight most significant bits of the 128-bits stream are ignored by hardware logic

      /*for(unsigned int j=0; j<(STREAM_WIDTH/NUM_OF_BITS_PER_BYTE); j++)      
      {
         printf("rgb_stream[%d] = %d\n\r", j, rgb_stream[j]); //break;
      }*/

      wr_buf = rgb_stream; // write() writes wr_buf in first-in-first-out order, so rgb_stream[0] will be written first into fdw as the least significant byte

        wr = allwrite(fdw, wr_buf, STREAM_WIDTH/NUM_OF_BITS_PER_BYTE);  // this write() is 128-bits or 16 bytes which include the empty MSB 8 bits
      //printf("wr = %d\n", wr);
      num_of_pixels_sent = num_of_pixels_sent + PIXEL_NUM_THAT_FITS_STREAM_WIDTH;      
      //printf("num_of_pixels_sent = %d\n", num_of_pixels_sent);
     }

     else  // the remaining pixels do not fill all five pixel slots for a 128-bit stream
     {
      for(rgb_stream_index = 0; (rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1)<(((image_width * image_height)-num_of_pixels_sent)*NUM_OF_COMPONENTS_IN_A_PIXEL); rgb_stream_index=rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL)
      {
         rgb_stream[rgb_stream_index] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].R;
         rgb_stream[rgb_stream_index+1] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].G;
         rgb_stream[rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].B;
      }            

      rgb_stream[((image_width * image_height)-num_of_pixels_sent)*NUM_OF_COMPONENTS_IN_A_PIXEL + 1] = '\0';  // however, this NULL character is not sent across write()
      rgb_stream[((image_width * image_height)-num_of_pixels_sent)*NUM_OF_COMPONENTS_IN_A_PIXEL] = 0;  // remember that the eight most significant bits of the 128-bits stream are ignored by hardware logic

      /*for(unsigned int j=0; j<(((image_width * image_height)-num_of_pixels_sent)*NUM_OF_COMPONENTS_IN_A_PIXEL+1); j++)      
      {
         printf("rgb_stream[%d] = %d\n\r", j, rgb_stream[j]);
      }*/
      
      wr_buf = rgb_stream;  // this is a partially filled 128-bit stream (with less than 5 pixels)

      wr = allwrite(fdw, wr_buf, ((image_width * image_height)-num_of_pixels_sent)*NUM_OF_COMPONENTS_IN_A_PIXEL+1);
      //printf("wr = %d\n", wr);
      break;  // finish sending all (image_width * image_height) pixels
     }

     if ((wr < 0) && (errno == EINTR))
      continue;

     if (wr <= 0) {
      perror("write() failed");
      exit(1);
     }

     wr_donebytes += wr;
   }

   write(fdw, NULL, 0); // flush the write stream

   close(fdw);

   printf("*** Write process enters waiting status .....\n");

   pid = wait(&wait_status);
   printf("*** write process detects read process with pid %d was done ***\n", pid);  // most probably write process will be done first, since FPGA computation takes a few clock cyles

   return 0;
  }

  else {
    close(fdw);

   vector<YUV_packet> vFrom(image_width * image_height);

    fromlogic = vFrom.data();
   //printf("fromlogic[0].Y = %p \n", &fromlogic[0].Y);
    if (!fromlogic) {
      fprintf(stderr, "Failed to allocate memory\n");
      exit(1);
    }

    //rd_buf = (uint8_t *) fromlogic; // yuv_stream
    rd_donebytes = 0;  // this variable includes the empty 8 bits for the MSB

   unsigned int num_of_pixels_received = 0; // this is actual pixels number already received, does not include the empty 8 bits
   unsigned int yuv_stream_index = 0;
   uint8_t yuv_stream[STREAM_WIDTH/NUM_OF_BITS_PER_BYTE + 1];  // could accomodate 5 pixels

    while (num_of_pixels_received < image_width * image_height) {

     if(((image_width * image_height)-num_of_pixels_received) >= PIXEL_NUM_THAT_FITS_STREAM_WIDTH)
     {
        rd_buf = yuv_stream;
      printf("before read() \n");
      rd = read(fdr, rd_buf, STREAM_WIDTH/NUM_OF_BITS_PER_BYTE);
      printf("after read() \n");
      // For every five pixels (128 bits) received from hardware logic computation, print out the YUV values of all five pixels
      /*for(yuv_stream_index = 0; yuv_stream_index<STREAM_WIDTH/NUM_OF_BITS_PER_BYTE; yuv_stream_index=yuv_stream_index+1)
      {
         printf("yuv_stream[%d] = %d\n", yuv_stream_index, yuv_stream[yuv_stream_index]);
      }*/
      yuv_stream[STREAM_WIDTH/NUM_OF_BITS_PER_BYTE] = '\0';  // this NULL character is only to act as "stop bit" for character array

      //if(num_of_pixels_received == 300) break; // just to test if there is actually something being read, or returned from hardware

      // store the calculated output YUV pixels into fromlogic such that we could reconstruct the image in its original dimension for visual display
      for(yuv_stream_index = 0; (yuv_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1)<(STREAM_WIDTH/NUM_OF_BITS_PER_BYTE); yuv_stream_index=yuv_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL)
      {
         fromlogic[(yuv_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_received].Y = yuv_stream[yuv_stream_index];
         fromlogic[(yuv_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_received].U = yuv_stream[yuv_stream_index+1];
         fromlogic[(yuv_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_received].V = yuv_stream[yuv_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1];
      }

      num_of_pixels_received = num_of_pixels_received + PIXEL_NUM_THAT_FITS_STREAM_WIDTH;
      printf("num_of_pixels_received = %d\n\r", num_of_pixels_received);
     }     
     else // the remaining pixels do not fill all five pixel slots for a 128-bit stream
     {
        rd_buf = yuv_stream;
        printf("before read in else. \n");
      rd = read(fdr, rd_buf, image_width * image_height - num_of_pixels_received);  //  is a partially filled 128-bit stream (with less than 5 pixels)
      //printf("break in else. \n");
      break; // finish receiving all (image_width * image_height) pixels
     }

      if ((rd < 0) && (errno == EINTR))
      continue;

      if (rd < 0) {
      perror("read() failed");
      exit(1);
      }

      if (rd == 0) {
      fprintf(stderr, "Reached read EOF!? Should never happen.\n");
      exit(0);
      }

      rd_donebytes += rd;
   }
   printf("before for loop\n");
    for (unsigned int i = 0; i < (image_width * image_height); i++)  // check the perfomance of hardware with respect to software computation
   {
   #ifdef LOOPBACK
      if( (tologic[i].R != fromlogic[i].Y) ||
         (tologic[i].G != fromlogic[i].U) ||
         (tologic[i].B != fromlogic[i].V) )
   #elif RGB2YUV
       uint8_t expected_Y = rgb2yuv(tologic[i])->Y;
       uint8_t expected_U = rgb2yuv(tologic[i])->U;
       uint8_t expected_V = rgb2yuv(tologic[i])->V;

      if( (abs(expected_Y - fromlogic[i].Y) > 1) ||
          (abs(expected_U - fromlogic[i].U) > 1) ||
          (abs(expected_V - fromlogic[i].V) > 1) ) // rgb2yuv conversion hardware tolerance fails by more than 1 compared to software computation
   #endif
      {
         printf("********************************* Attention *************************************\n\r");
         printf("R:%d G:%d B:%d \n\r", tologic[i].R, tologic[i].G, tologic[i].B);
            printf("Y:%d U:%d V:%d \n\r", fromlogic[i].Y, fromlogic[i].U, fromlogic[i].V);
         printf("expected_Y:%d expected_U:%d expected_V:%d \n\r", expected_Y, expected_U, expected_V);

         break; // just for troubleshooting
         //exit(1);
      }
   }

    close(fdr);

   printf("*** Read process enters waiting status .....\n");

   pid = wait(&wait_status);
   printf("*** read process detects write process with pid %d was done ***\n", pid);  // most probably write process will be done first, since FPGA computation takes a few clock cyles

    return 0;
  }

  /*pid = wait(&wait_status);
  printf("*** Parent detects process %d is done ***\n", pid);
  printf("*** Parent exits ***\n");*/
  exit(0);
}
kevin
 
Posts: 43
Joined:

Re: problem with user_w_write_128_wren

Postby kevin »

Commenting out this line https://gist.github.com/promach/9d185d35a6e6db0da10992a19c36f754#file-host-cpp-L17 (which amounts to doing "write() first, then read()" in a totally sequential manner) also yields the same issue of a non-returning read().

I am assuming that the FPGA isn't sending as much data as we expect, so read() blocks waiting for more.
It could be due to the size of my FPGA hardware FIFO, but I am a bit sceptical. Note that my xillydemo.v is at https://gist.github.com/promach/a3af6c59906567c3df4179a501513a1b

Does anyone have any comments about this, or similar prior experience?

Code: Select all
// g++ -g -pedantic -Wall -Werror -Wextra -fsanitize=address host.cpp -o host `pkg-config --cflags --libs opencv`

#include <opencv2/core/core.hpp>
//#include <opencv2/imgcodecs/imgcodecs.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <unistd.h>
#include <fcntl.h>
#include <iostream>
#include <fstream>      // std::ifstream, std::ofstream
#include <string>
#include <sys/wait.h>
#include <errno.h>
#include <cmath>

using namespace cv;
using namespace std;
#define FORK 1
//#define LOOPBACK 1
#define RGB2YUV 1

unsigned int image_width;
unsigned int image_height;

const unsigned int CHNL_NUM = 3;
const unsigned int RED_CHNL = 2;
const unsigned int GREEN_CHNL = 1;
const unsigned int BLUE_CHNL = 0;

const unsigned int STREAM_WIDTH = 128;
const unsigned int NUM_OF_BITS_PER_BYTE = 8;

const unsigned int PIXEL_VALUE_RANGE = 8; // number of bits occupied by [R, G, B] or [Y, U, V] respectively (8-bit unsigned integer for each components) , https://docs.microsoft.com/en-us/windows-hardware/drivers/display/yuv-rgb-data-range-conversions
const unsigned int NUM_OF_COMPONENTS_IN_A_PIXEL = 3; // input: [R, G, B]     output:[Y, U, V]

const unsigned int PIXEL_NUM_THAT_FITS_STREAM_WIDTH = 5;  // 128-bit stream can at most fits 5 pixels ((PIXEL_NUM_THAT_FITS_STREAM_WIDTH*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE) bits = 120 bits), each pixels contains R, G, B which are encoded in 8 bits for each of the three color components

// One input pixel: 8-bit red, green and blue components.
struct RGB_packet{
  uint8_t R,G,B;
};

// One output pixel: 8-bit Y, U and V components.
struct YUV_packet{
  uint8_t Y,U,V;
};


/*
   Software reference for the RGB -> YUV conversion, using the integer
   "full swing for BT.601" coefficients from
   https://en.wikipedia.org/wiki/YUV#Full_swing_for_BT.601

   rgb_input: the pixel to convert.

   Returns a pointer to a single malloc()'ed YUV_packet holding the result.
   The caller owns it and should free() it. The process exits if the
   allocation fails, so the returned pointer is never NULL.
*/
struct YUV_packet* rgb2yuv(struct RGB_packet rgb_input)  // convert rgb to yuv
{
   const int R = rgb_input.R;
   const int G = rgb_input.G;
   const int B = rgb_input.B;

   // Only one packet is returned, so only one packet is allocated.
   struct YUV_packet *yuv_result = (struct YUV_packet *)malloc(sizeof(struct YUV_packet));

   if (!yuv_result) {
      perror("rgb2yuv() failed to allocate memory");
      exit(1);
   }

   // Weighted sums scaled by 256.
   int Y_temp =  77*R + 150*G +  29*B;
   int U_temp = -43*R -  84*G + 127*B;
   int V_temp = 127*R - 106*G -  21*B;

   // Add half of the scale factor for rounding, then divide by 256.
   // U_temp and V_temp may be negative here; this relies on >> being an
   // arithmetic shift for negative ints.
   Y_temp = (Y_temp + 128) >> 8;
   U_temp = (U_temp + 128) >> 8;
   V_temp = (V_temp + 128) >> 8;

   // U and V are offset by 128 so that they fit an unsigned byte.
   yuv_result->Y = Y_temp;
   yuv_result->U = U_temp + 128;
   yuv_result->V = V_temp + 128;

   return yuv_result;
}

/*
   write() is allowed to accept fewer bytes than it was asked to, so
   allwrite() keeps calling it until all len bytes of buf have been
   written to fd.

   A write() interrupted by a signal (EINTR) is simply retried. Any other
   write() error, or a write() that returns 0, prints a message and
   terminates the process with exit status 1.

   Returns the number of bytes written. When the function returns at all,
   that is len (or 0 if len was not positive).
*/

int allwrite(int fd, unsigned char *buf, int len) {
  int total = 0;

  while (total < len) {
    const int n = write(fd, buf + total, len - total);//fsync(fd);

    if (n > 0) {
      total += n;
      continue;
    }

    if (n == 0) {
      fprintf(stderr, "Reached write EOF (?!)\n");
      exit(1);
    }

    // n < 0 from here on: retry if interrupted, give up otherwise
    if (errno == EINTR)
      continue;

    perror("allwrite() failed to write");
    exit(1);
  }

  return total;
}

int main(int argc, char *argv[]) {

  int fdr, fdw, rd, wr, rd_donebytes, wr_donebytes;
  uint8_t *wr_buf, *rd_buf;

  #ifdef FORK
  int wait_status; // for wait()
  pid_t pid;
  #endif

  struct RGB_packet *tologic;
  struct YUV_packet *fromlogic;

  fdr = open("/dev/xillybus_read_128", O_RDONLY);  // will change to /dev/xillybus_read_128
  fdw = open("/dev/xillybus_write_128", O_WRONLY); // will change to /dev/xillybus_write_128

  if ((fdr < 0) || (fdw < 0)) {
    perror("Failed to open Xillybus device file(s)");
    exit(1);
  }

  // READ in an image file

  String imageName( "lena512color.tiff" ); // by default

  if( argc > 1)
  {
   imageName = argv[1];
  }

  Mat image;

  image = imread( imageName, IMREAD_COLOR ); // Read the file

  if( image.empty() )                      // Check for invalid input
  {
   cout <<  "Could not open or find the image" << std::endl ;
   return -1;
  }

  else
  {
   image_width = image.size().width;
   image_height = image.size().height;
  }

  namedWindow( "Original Image", WINDOW_AUTOSIZE );
  imshow( "Original Image", image );
   
  Mat rgbchannel[CHNL_NUM];
  // The actual splitting.
  split(image, rgbchannel);
   
  namedWindow("Red", WINDOW_AUTOSIZE);
  imshow("Red", rgbchannel[RED_CHNL]);
   
  namedWindow("Green", WINDOW_AUTOSIZE);
  imshow("Green", rgbchannel[GREEN_CHNL]);
   
  namedWindow("Blue", WINDOW_AUTOSIZE);
  imshow("Blue", rgbchannel[BLUE_CHNL]);

  waitKey(0);  // see all three split channels before feeding in the channel data to xillybus/RIFFA for hardware computation

  vector<RGB_packet> vTo(image_width * image_height);  // lena.tiff is sized as 3*512*512

  tologic = vTo.data();

  if (!tologic) {
   fprintf(stderr, "Failed to allocate memory\n");
   exit(1);
  }

  for(unsigned int pixel_index = 0; pixel_index < (image_width * image_height); pixel_index++)
  {
   tologic[pixel_index].R = *(rgbchannel[RED_CHNL].data + pixel_index);
   tologic[pixel_index].G = *(rgbchannel[GREEN_CHNL].data + pixel_index);
   tologic[pixel_index].B = *(rgbchannel[BLUE_CHNL].data + pixel_index);
  }
#ifdef FORK
  pid = fork();

  if (pid < 0) {
    perror("Failed to fork()");
    exit(1);
  }

  if (pid) {
   close(fdr);
#endif
    wr_donebytes = 0;  // this variable includes the empty 8 bits for the MSB
   unsigned int num_of_pixels_sent = 0; // this is actual pixels number already sent, does not include the empty 8 bits
   //unsigned int if_index = 0;
   unsigned int rgb_stream_index = 0;
   uint8_t rgb_stream[STREAM_WIDTH/NUM_OF_BITS_PER_BYTE + 1];  // could accomodate 5 pixels

   while (num_of_pixels_sent < image_width * image_height)
   {
     if(((image_width * image_height)-num_of_pixels_sent) >= PIXEL_NUM_THAT_FITS_STREAM_WIDTH)
     {
      // arrange the five pixels in the format as in https://i.imgur.com/mdJwk7J.png
      //if_index++; //printf("if_index = %d\n\r", if_index);
      //if(if_index == 3) break;

      for(rgb_stream_index = 0; (rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1)<((STREAM_WIDTH/NUM_OF_BITS_PER_BYTE)-1); rgb_stream_index=rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL)
      {
         rgb_stream[rgb_stream_index] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].R;
         rgb_stream[rgb_stream_index+1] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].G;
         rgb_stream[rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].B;
      }

      rgb_stream[STREAM_WIDTH/NUM_OF_BITS_PER_BYTE] = '\0';  // however, this NULL character is not sent across write()
      rgb_stream[(STREAM_WIDTH/NUM_OF_BITS_PER_BYTE)-1] = 0;  // remember that the eight most significant bits of the 128-bits stream are ignored by hardware logic

      /*for(unsigned int j=0; j<(STREAM_WIDTH/NUM_OF_BITS_PER_BYTE); j++)      
      {
         printf("rgb_stream[%d] = %d\n\r", j, rgb_stream[j]); //break;
      }*/

      wr_buf = rgb_stream; // write() writes wr_buf in first-in-first-out order, so rgb_stream[0] will be written first into fdw as the least significant byte

        wr = allwrite(fdw, wr_buf, STREAM_WIDTH/NUM_OF_BITS_PER_BYTE);  // this write() is 128-bits or 16 bytes which include the empty MSB 8 bits
      //printf("wr = %d\n", wr);
      num_of_pixels_sent = num_of_pixels_sent + PIXEL_NUM_THAT_FITS_STREAM_WIDTH;      
      //printf("num_of_pixels_sent = %d\n", num_of_pixels_sent);
     }

     else  // the remaining pixels do not fill all five pixel slots for a 128-bit stream
     {
      for(rgb_stream_index = 0; (rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1)<(((image_width * image_height)-num_of_pixels_sent)*NUM_OF_COMPONENTS_IN_A_PIXEL); rgb_stream_index=rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL)
      {
         rgb_stream[rgb_stream_index] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].R;
         rgb_stream[rgb_stream_index+1] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].G;
         rgb_stream[rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].B;
      }            

      rgb_stream[((image_width * image_height)-num_of_pixels_sent)*NUM_OF_COMPONENTS_IN_A_PIXEL + 1] = '\0';  // however, this NULL character is not sent across write()
      rgb_stream[((image_width * image_height)-num_of_pixels_sent)*NUM_OF_COMPONENTS_IN_A_PIXEL] = 0;  // remember that the eight most significant bits of the 128-bits stream are ignored by hardware logic

      /*for(unsigned int j=0; j<(((image_width * image_height)-num_of_pixels_sent)*NUM_OF_COMPONENTS_IN_A_PIXEL+1); j++)      
      {
         printf("rgb_stream[%d] = %d\n\r", j, rgb_stream[j]);
      }*/
      
      wr_buf = rgb_stream;  // this is a partially filled 128-bit stream (with less than 5 pixels)

      wr = allwrite(fdw, wr_buf, ((image_width * image_height)-num_of_pixels_sent)*NUM_OF_COMPONENTS_IN_A_PIXEL+1);
      //printf("wr = %d\n", wr);
      break;  // finish sending all (image_width * image_height) pixels
     }

     if ((wr < 0) && (errno == EINTR))
      continue;

     if (wr <= 0) {
      perror("write() failed");
      exit(1);
     }

     wr_donebytes += wr;
   }

   write(fdw, NULL, 0); // flush the write stream

   close(fdw);
#ifdef FORK
   printf("*** Write process enters waiting status .....\n");

   pid = wait(&wait_status);
   printf("*** write process detects read process with pid %d was done ***\n", pid);  // most probably write process will be done first, since FPGA computation takes a few clock cyles

   return 0;
  }

  else {
    close(fdw);
#endif
   vector<YUV_packet> vFrom(image_width * image_height);

    fromlogic = vFrom.data();
   //printf("fromlogic[0].Y = %p \n", &fromlogic[0].Y);
    if (!fromlogic) {
      fprintf(stderr, "Failed to allocate memory\n");
      exit(1);
    }

    //rd_buf = (uint8_t *) fromlogic; // yuv_stream
    rd_donebytes = 0;  // this variable includes the empty 8 bits for the MSB

   unsigned int num_of_pixels_received = 0; // this is actual pixels number already received, does not include the empty 8 bits
   unsigned int yuv_stream_index = 0;
   uint8_t yuv_stream[STREAM_WIDTH/NUM_OF_BITS_PER_BYTE + 1];  // could accomodate 5 pixels

    while (num_of_pixels_received < image_width * image_height) {

     if(((image_width * image_height)-num_of_pixels_received) >= PIXEL_NUM_THAT_FITS_STREAM_WIDTH)
     {
        rd_buf = yuv_stream;
      printf("before read() \n");
      rd = read(fdr, rd_buf, STREAM_WIDTH/NUM_OF_BITS_PER_BYTE);
      printf("after read() \n");
      // For every five pixels (128 bits) received from hardware logic computation, print out the YUV values of all five pixels
      /*for(yuv_stream_index = 0; yuv_stream_index<STREAM_WIDTH/NUM_OF_BITS_PER_BYTE; yuv_stream_index=yuv_stream_index+1)
      {
         printf("yuv_stream[%d] = %d\n", yuv_stream_index, yuv_stream[yuv_stream_index]);
      }*/
      yuv_stream[STREAM_WIDTH/NUM_OF_BITS_PER_BYTE] = '\0';  // this NULL character is only to act as "stop bit" for character array

      //if(num_of_pixels_received == 300) break; // just to test if there is actually something being read, or returned from hardware

      // store the calculated output YUV pixels into fromlogic such that we could reconstruct the image in its original dimension for visual display
      for(yuv_stream_index = 0; (yuv_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1)<(STREAM_WIDTH/NUM_OF_BITS_PER_BYTE); yuv_stream_index=yuv_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL)
      {
         fromlogic[(yuv_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_received].Y = yuv_stream[yuv_stream_index];
         fromlogic[(yuv_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_received].U = yuv_stream[yuv_stream_index+1];
         fromlogic[(yuv_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_received].V = yuv_stream[yuv_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1];
      }

      num_of_pixels_received = num_of_pixels_received + PIXEL_NUM_THAT_FITS_STREAM_WIDTH;
      printf("num_of_pixels_received = %d\n\r", num_of_pixels_received);
     }     
     else // the remaining pixels do not fill all five pixel slots for a 128-bit stream
     {
        rd_buf = yuv_stream;
        printf("before read in else. \n");
      rd = read(fdr, rd_buf, image_width * image_height - num_of_pixels_received);  //  is a partially filled 128-bit stream (with less than 5 pixels)
      //printf("break in else. \n");
      break; // finish receiving all (image_width * image_height) pixels
     }

      if ((rd < 0) && (errno == EINTR))
      continue;

      if (rd < 0) {
      perror("read() failed");
      exit(1);
      }

      if (rd == 0) {
      fprintf(stderr, "Reached read EOF!? Should never happen.\n");
      exit(0);
      }

      rd_donebytes += rd;
   }
   printf("before for loop\n");
    for (unsigned int i = 0; i < (image_width * image_height); i++)  // check the perfomance of hardware with respect to software computation
   {
   #ifdef LOOPBACK
      if( (tologic[i].R != fromlogic[i].Y) ||
         (tologic[i].G != fromlogic[i].U) ||
         (tologic[i].B != fromlogic[i].V) )
   #elif RGB2YUV
       uint8_t expected_Y = rgb2yuv(tologic[i])->Y;
       uint8_t expected_U = rgb2yuv(tologic[i])->U;
       uint8_t expected_V = rgb2yuv(tologic[i])->V;

      if( (abs(expected_Y - fromlogic[i].Y) > 1) ||
          (abs(expected_U - fromlogic[i].U) > 1) ||
          (abs(expected_V - fromlogic[i].V) > 1) ) // rgb2yuv conversion hardware tolerance fails by more than 1 compared to software computation
   #endif
      {
         printf("********************************* Attention *************************************\n\r");
         printf("R:%d G:%d B:%d \n\r", tologic[i].R, tologic[i].G, tologic[i].B);
            printf("Y:%d U:%d V:%d \n\r", fromlogic[i].Y, fromlogic[i].U, fromlogic[i].V);
         printf("expected_Y:%d expected_U:%d expected_V:%d \n\r", expected_Y, expected_U, expected_V);

         break; // just for troubleshooting
         //exit(1);
      }
   }

    close(fdr);
#ifdef FORK
   printf("*** Read process enters waiting status .....\n");

   pid = wait(&wait_status);
   printf("*** read process detects write process with pid %d was done ***\n", pid);  // most probably write process will be done first, since FPGA computation takes a few clock cyles

    return 0;
  }
#endif
  /*pid = wait(&wait_status);
  printf("*** Parent detects process %d is done ***\n", pid);
  printf("*** Parent exits ***\n");*/
  exit(0);
}


Code: Select all
//`define LOOPBACK 1

// Top-level module of the demo design.
//
// What this module contains, in order:
//   * an instance of the `xillybus` core exposing the mem_128, read_128,
//     write_128, read_256 and write_256 user interfaces,
//   * a small inferred RAM attached to the mem_128 interface,
//   * an RGB -> YUV pipeline on the 128-bit streams:
//       write_128 -> input FIFO -> KERNEL_NUM `kernel` instances -> output FIFO -> read_128,
//   * an ILA instance probing the pipeline's handshake signals.
//
// NOTE(review): the read_256 / write_256 wires are connected to the core but are
// not driven or consumed anywhere else in this module (e.g. user_r_read_256_empty
// and user_w_write_256_full have no driver here).
module xillydemo(PCIE_PERST_B_LS, PCIE_REFCLK_N, PCIE_REFCLK_P, PCIE_RX_N, PCIE_RX_P, GPIO_LED, PCIE_TX_N, PCIE_TX_P);

    localparam STREAM_WIDTH = 128;
 
   input  PCIE_PERST_B_LS;
   input  PCIE_REFCLK_N;
   input  PCIE_REFCLK_P;
   input [7:0] PCIE_RX_N;
   input [7:0] PCIE_RX_P;
   output [3:0] GPIO_LED;
   output [7:0] PCIE_TX_N;
   output [7:0] PCIE_TX_P;
   
   // Clock and quiesce
   wire    bus_clk;
   wire    quiesce;
   
   // Memory array
   // NOTE(review): each element is 8 bits wide and the array has 32 entries,
   // while user_w_mem_128_data is STREAM_WIDTH (128) bits wide and
   // user_mem_128_addr is $clog2(STREAM_WIDTH) = 7 bits wide. As written, a
   // write keeps only the low 8 bits of the data and the address can exceed
   // the array bounds -- confirm this is intended.
   reg [7:0]    demoarray[0:31];

   
   // Wires related to /dev/xillybus_mem_128
   wire       user_r_mem_128_rden;
   wire       user_r_mem_128_empty;
   reg [STREAM_WIDTH-1:0]  user_r_mem_128_data;
   wire       user_r_mem_128_eof;
   wire       user_r_mem_128_open;
   wire       user_w_mem_128_wren;
   wire       user_w_mem_128_full;
   wire [STREAM_WIDTH-1:0] user_w_mem_128_data;
   wire       user_w_mem_128_open;
   wire [$clog2(STREAM_WIDTH)-1:0] user_mem_128_addr;
   wire       user_mem_128_addr_update;

  // Wires related to /dev/xillybus_read_128
  wire  user_r_read_128_rden;
  wire  user_r_read_128_empty;
  wire [STREAM_WIDTH-1:0] user_r_read_128_data;
  wire  user_r_read_128_eof;
  wire  user_r_read_128_open;

  // Wires related to /dev/xillybus_write_128
  wire  user_w_write_128_wren;
  wire  user_w_write_128_full;
  wire [STREAM_WIDTH-1:0] user_w_write_128_data;
  wire  user_w_write_128_open;

   // Wires related to /dev/xillybus_read_256
   wire       user_r_read_256_rden;
   wire       user_r_read_256_empty;
   wire [(STREAM_WIDTH << 1)-1:0] user_r_read_256_data;
   wire        user_r_read_256_eof;
   wire        user_r_read_256_open;

   // Wires related to /dev/xillybus_write_256
   wire        user_w_write_256_wren;
   wire        user_w_write_256_full;
   wire [(STREAM_WIDTH << 1)-1:0] user_w_write_256_data;
   wire        user_w_write_256_open;


   // The Xillybus core: every user_* port is wired to the same-named signal above.
   xillybus xillybus_ins (

           // Ports related to /dev/xillybus_mem_128
           // FPGA to CPU signals:
           .user_r_mem_128_rden(user_r_mem_128_rden),
           .user_r_mem_128_empty(user_r_mem_128_empty),
           .user_r_mem_128_data(user_r_mem_128_data),
           .user_r_mem_128_eof(user_r_mem_128_eof),
           .user_r_mem_128_open(user_r_mem_128_open),

           // CPU to FPGA signals:
           .user_w_mem_128_wren(user_w_mem_128_wren),
           .user_w_mem_128_full(user_w_mem_128_full),
           .user_w_mem_128_data(user_w_mem_128_data),
           .user_w_mem_128_open(user_w_mem_128_open),

           // Address signals:
           .user_mem_128_addr(user_mem_128_addr),
           .user_mem_128_addr_update(user_mem_128_addr_update),


           // Ports related to /dev/xillybus_read_256
           // FPGA to CPU signals:
           .user_r_read_256_rden(user_r_read_256_rden),
           .user_r_read_256_empty(user_r_read_256_empty),
           .user_r_read_256_data(user_r_read_256_data),
           .user_r_read_256_eof(user_r_read_256_eof),
           .user_r_read_256_open(user_r_read_256_open),

           // Ports related to /dev/xillybus_write_256
           // CPU to FPGA signals:
           .user_w_write_256_wren(user_w_write_256_wren),
           .user_w_write_256_full(user_w_write_256_full),
           .user_w_write_256_data(user_w_write_256_data),
           .user_w_write_256_open(user_w_write_256_open),

           // Ports related to /dev/xillybus_read_128
           // FPGA to CPU signals:
           .user_r_read_128_rden(user_r_read_128_rden),
           .user_r_read_128_empty(user_r_read_128_empty),
           .user_r_read_128_data(user_r_read_128_data),
           .user_r_read_128_eof(user_r_read_128_eof),
           .user_r_read_128_open(user_r_read_128_open),

           // Ports related to /dev/xillybus_write_128
           // CPU to FPGA signals:
           .user_w_write_128_wren(user_w_write_128_wren),
           .user_w_write_128_full(user_w_write_128_full),
           .user_w_write_128_data(user_w_write_128_data),
           .user_w_write_128_open(user_w_write_128_open),


           // Signals to top level
           .PCIE_PERST_B_LS(PCIE_PERST_B_LS),
           .PCIE_REFCLK_N(PCIE_REFCLK_N),
           .PCIE_REFCLK_P(PCIE_REFCLK_P),
           .PCIE_RX_N(PCIE_RX_N),
           .PCIE_RX_P(PCIE_RX_P),
           .GPIO_LED(GPIO_LED),
           .PCIE_TX_N(PCIE_TX_N),
           .PCIE_TX_P(PCIE_TX_P),
           .bus_clk(bus_clk),
           .quiesce(quiesce)
           );

   // A simple inferred RAM
   // Synchronous write on wren and synchronous (registered) read on rden,
   // both addressed by user_mem_128_addr.
   always @(posedge bus_clk)
     begin
   if (user_w_mem_128_wren)
     demoarray[user_mem_128_addr] <= user_w_mem_128_data;
   
   if (user_r_mem_128_rden)
     user_r_mem_128_data <= demoarray[user_mem_128_addr];    
     end

   // The RAM is always ready: never empty, never full, never signals EOF.
   assign  user_r_mem_128_empty = 0;
   assign  user_r_mem_128_eof = 0;
   assign  user_w_mem_128_full = 0;

//`ifdef LOOPBACK

  //wire [$clog2(STREAM_WIDTH)-1:0] data_count_of_loopback_fifo;

  // 128-bit loopback
  /* fifo_128 fifo_128x128
     (
      .clk(bus_clk),
      .reset(!user_w_write_128_open && !user_r_read_128_open),
      .flush_en(0),
      .value_i(user_w_write_128_data),
      .enqueue_en(user_w_write_128_wren),
      .dequeue_en(user_r_read_128_rden),
      .value_o(user_r_read_128_data),
      .full(user_w_write_128_full),
      .empty(user_r_read_128_empty),
      .count(data_count_of_loopback_fifo)
      );
   
   

   assign  user_r_read_128_eof = 0;*/
   
//`else
// Signals for ($floor((STREAM_WIDTH/PIXEL_VALUE_RANGE)/NUM_OF_COMPONENTS_IN_A_PIXEL) = 5) kernels
// since an image pixel is unsigned 8-bit integer, its component values of [R, G, B] or [Y, U, V] range from 0 to 255.
// A pixel occupies 3*8=24 bits.  Therefore, in each transaction, we could at most put 5 pixels (120 bits) into /dev/xillybus_write_128,
// computes the relevant kernel equations for 5 pixels, send out 5 pixels again through /dev/xillybus_read_128

   
    localparam NUM_OF_COMPONENTS_IN_A_PIXEL = 3; // input: [R, G, B]  , output:[Y, U, V]
    localparam PIXEL_VALUE_RANGE = 8; // number of bits occupied by [R, G, B] and [Y, U, V] respectively (8-bit unsigned integer for each components) , https://docs.microsoft.com/en-us/windows-hardware/drivers/display/yuv-rgb-data-range-conversions
    localparam KERNEL_NUM = 5;  // 5 copies of kernel, each kernel computes equation for [R, G, B] of one single pixel
   

// Signals for two buffer FIFOs 
    localparam FIFO_DEPTH = 16;
   
    // Both counters are $clog2(FIFO_DEPTH)+1 = 5 bits wide.
    wire   [$clog2(FIFO_DEPTH):0] data_count_of_input_fifo;  // determines whether all five pixel slots have incoming data or not
    wire   [$clog2(FIFO_DEPTH):0] data_count_of_output_fifo;
   
//-------------------------------------------kernel----------------------------------------//
   
    wire   [STREAM_WIDTH-1:0] stream_i_V_V_dout;  // Read data for 5 pixels or KERNEL_NUM*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE = 120 bits (note that we neglected the most significant 8 bits)
    wire   stream_i_V_V_empty;  // Empty condition
    wire   [KERNEL_NUM*NUM_OF_COMPONENTS_IN_A_PIXEL-1:0] stream_i_V_V_read; // Read enable for each color components of all five pixels, high active
   
    wire   [STREAM_WIDTH-1:0] stream_o_V_V_din;  // Write data for 5 pixels or KERNEL_NUM*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE = 120 bits (note that we neglected the most significant 8 bits)
    wire   stream_o_V_V_full;  // Full condition
    wire   [KERNEL_NUM*NUM_OF_COMPONENTS_IN_A_PIXEL-1:0] stream_o_V_V_write;  // Write enable for each color components of all five pixels, high active

    // NOTE(review): this compares the input FIFO's `count` output with KERNEL_NUM.
    // If `count` is the number of 128-bit entries held (presumably -- depends on
    // fifo_fwft_128, which is not shown), it is a word count, not a pixel count.
    wire   is_last_few_pixels = (data_count_of_input_fifo < KERNEL_NUM) && (!user_w_write_128_wren);  // the remaining pixels do not fill all five pixel slots for a 128-bit stream, AND the input FIFO is not accepting any more pixels (in contrary when it is filling in the input FIFO at the initial time) AND the input FIFO is outputting pixels

    reg    [KERNEL_NUM-1:0] ap_start = 0;  // initially the HLS kernels are not started   
   wire   [KERNEL_NUM-1:0] ap_done;
    wire   [KERNEL_NUM-1:0] ap_idle;
    wire   [KERNEL_NUM-1:0] ap_ready;

    // One start bit per kernel, registered every bus_clk cycle.
    // NOTE(review): in the is_last_few_pixels case the 5-bit FIFO count is
    // assigned directly as the start bit pattern (e.g. a count of 3 is
    // 5'b00011, which starts kernels 0 and 1 only) -- confirm whether a
    // count-to-mask conversion was intended.
    always @(posedge bus_clk)
        ap_start <= (is_last_few_pixels) ? (data_count_of_input_fifo) : {KERNEL_NUM{(&stream_i_V_V_read || !stream_i_V_V_empty)}}; // start signals depend on whether all five pixel slots are filled or not     

// -----------------input FIFO ----------------------------------//

   // Buffers words written by the host. The FIFO is held in reset while both
   // streams are closed, and an entry is dequeued only in a cycle where every
   // bit of stream_i_V_V_read is high (reduction AND over all 15 read strobes).
   fifo_fwft_128
   #(
        .WIDTH(STREAM_WIDTH),
        .SIZE(FIFO_DEPTH)
   )
   input_pipe(
         .clk(bus_clk),
         .reset(!user_w_write_128_open && !user_r_read_128_open),
         .flush_en(0),
         .value_i(user_w_write_128_data),
         .enqueue_en(user_w_write_128_wren),
         .dequeue_en(&stream_i_V_V_read),
         .value_o(stream_i_V_V_dout),
         .full(user_w_write_128_full),
         .empty(stream_i_V_V_empty),
         .count(data_count_of_input_fifo)           
   );

// use of generate loop to replicate 5 hardware copies of RGB2YUV computational HLS kernels for 5 different pixels

   // Kernel kn works on the 24-bit slice [kn*24 +: 24] of the input/output words:
   // byte 0 of the slice is stream 0 (R in / Y out), byte 1 is stream 1 (G / U),
   // byte 2 is stream 2 (B / V). All kernels share the same empty/full flags.
   generate
        genvar kn;  // to indicate which kernel
       
        for(kn=0; kn<KERNEL_NUM; kn=kn+1) begin       
           kernel RGB2YUV_kn (
                   .ap_clk(bus_clk),
                   .ap_rst(!user_w_write_128_open && !user_r_read_128_open),
                   .ap_start(ap_start[kn]),  // need to confirm ?
                   .ap_done(ap_done[kn]),
                   .ap_idle(ap_idle[kn]),
                   .ap_ready(ap_ready[kn]),
                   .stream_i0_V_V_dout(stream_i_V_V_dout[kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + PIXEL_VALUE_RANGE - 1 : kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE]),  // input component R with (PIXEL_VALUE_RANGE) bits
                   .stream_i0_V_V_empty_n(!stream_i_V_V_empty),
                   .stream_i0_V_V_read(stream_i_V_V_read[kn*NUM_OF_COMPONENTS_IN_A_PIXEL]),
                   .stream_i1_V_V_dout(stream_i_V_V_dout[kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)*PIXEL_VALUE_RANGE - 1 : kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + PIXEL_VALUE_RANGE]),  // input component G with (PIXEL_VALUE_RANGE) bits
                   .stream_i1_V_V_empty_n(!stream_i_V_V_empty),
                   .stream_i1_V_V_read(stream_i_V_V_read[kn*NUM_OF_COMPONENTS_IN_A_PIXEL + 1]),
                   .stream_i2_V_V_dout(stream_i_V_V_dout[kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE - 1 : kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)*PIXEL_VALUE_RANGE]),  // input component B with (PIXEL_VALUE_RANGE) bits
                   .stream_i2_V_V_empty_n(!stream_i_V_V_empty),
                   .stream_i2_V_V_read(stream_i_V_V_read[kn*NUM_OF_COMPONENTS_IN_A_PIXEL + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)]),
                   .stream_o0_V_V_din(stream_o_V_V_din[kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + PIXEL_VALUE_RANGE - 1 : kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE]),  // output component Y with (PIXEL_VALUE_RANGE) bits
                   .stream_o0_V_V_full_n(!stream_o_V_V_full),
                   .stream_o0_V_V_write(stream_o_V_V_write[kn*NUM_OF_COMPONENTS_IN_A_PIXEL]),
                   .stream_o1_V_V_din(stream_o_V_V_din[kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)*PIXEL_VALUE_RANGE - 1 : kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + PIXEL_VALUE_RANGE]),  // output component U with (PIXEL_VALUE_RANGE) bits
                   .stream_o1_V_V_full_n(!stream_o_V_V_full),
                   .stream_o1_V_V_write(stream_o_V_V_write[kn*NUM_OF_COMPONENTS_IN_A_PIXEL + 1]),
                   .stream_o2_V_V_din(stream_o_V_V_din[kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE - 1 : kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)*PIXEL_VALUE_RANGE]),  // output component V with (PIXEL_VALUE_RANGE) bits
                   .stream_o2_V_V_full_n(!stream_o_V_V_full),
                   .stream_o2_V_V_write(stream_o_V_V_write[kn*NUM_OF_COMPONENTS_IN_A_PIXEL + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)])
            );
        end
    endgenerate

    // Bits [127:120] of the output word are not produced by any kernel; tie them to zero.
    assign stream_o_V_V_din[STREAM_WIDTH-1 : (KERNEL_NUM-1)*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE] = 0;  // (note that we neglected the most significant 8 bits)

//----------------------output FIFO-----------------------------//   
    // Buffers kernel results for the host. An entry is enqueued only in a cycle
    // where every bit of stream_o_V_V_write is high (all 15 write strobes at once),
    // and dequeued when the Xillybus core asserts user_r_read_128_rden.
    fifo_128
    #(
         .WIDTH(STREAM_WIDTH),
         .SIZE(FIFO_DEPTH)
    ) 
    output_pipe (
         .clk(bus_clk),
         .reset(!user_w_write_128_open && !user_r_read_128_open),
         .flush_en(0),
         .value_i(stream_o_V_V_din),
         .enqueue_en(&stream_o_V_V_write),
         .dequeue_en(user_r_read_128_rden),
         .value_o(user_r_read_128_data),
         .full(stream_o_V_V_full),
         .empty(user_r_read_128_empty),
         .count(data_count_of_output_fifo)   
   );
   
   // The read stream never signals end-of-file to the host.
   assign  user_r_read_128_eof = 0;

    // Vivado built-in internal logic analyzer module instantiation
   
    ila_0 ila(
        .clk(bus_clk),
        .probe0(user_w_write_128_data),
        .probe1(stream_i_V_V_dout),
        .probe2(stream_o_V_V_din),
        .probe3(user_r_read_128_data),
        .probe4(stream_i_V_V_read), 
        .probe5(stream_o_V_V_write),
        .probe6(data_count_of_input_fifo),
        .probe7(data_count_of_output_fifo),
        .probe8(user_w_write_128_full),
        .probe9(stream_i_V_V_empty),
        .probe10(user_w_write_128_wren),
        .probe11(user_r_read_128_rden),
        .probe12(stream_o_V_V_full),
        .probe13(user_r_read_128_empty),
        .probe14(user_w_write_128_open),
        .probe15(user_r_read_128_open),
        .probe16(ap_start),
        .probe17(ap_done),
        .probe18(ap_idle),
        .probe19(ap_ready),
        .probe20(is_last_few_pixels)
    );
//`endif
   
endmodule
kevin
 
Posts: 43
Joined:

Re: problem with user_w_write_128_wren

Postby kevin »

The immediate root cause seems to be the user_r_read_128_rden signal, which is no longer asserted HIGH after some time even though the output FIFO still holds plenty of data. Could anyone advise? Is this a bug in the Verilog or in the C++ code?

Image
kevin
 
Posts: 43
Joined:

Re: problem with user_w_write_128_wren

Postby kevin »

I forgot to upload the ILA waveform I posted in the post above.

Here you go. There are two files in this gist https://gist.github.com/promach/92b624787edd1c9178c32839fe38736e . Please directly open rgb2yuv_ila_data_file.gtkw using gtkwave software.
kevin
 
Posts: 43
Joined:

Re: problem with user_w_write_128_wren

Postby kevin »

Sorry I am wrong about user_r_read_128_rden.

It is due to the user_w_write_128_wren signal, which is not asserted HIGH even though many more pixel bytes are still being fed into write() in the C++ code.

I am already using the recommended allwrite() as well as an explicit flush, as you can see in https://gist.github.com/promach/9d185d35a6e6db0da10992a19c36f754#file-host-cpp-L272

Where else is wrong ?
kevin
 
Posts: 43
Joined:

Re: problem with user_w_write_128_wren

Postby support »

Hello,

It seems like you're not quite on top of the issue of how read() and write() behave, and how these calls should be used to ensure a chunk of data has been read or written.

It's not clear to me why you've turned allwrite() from a void function into one returning an int. It will never return unless all the data that was requested has been written successfully. If something goes wrong, it exit()s. I mean, there's a while (sent < len) loop with no "break" in it, so it can only quit on sent >= len. That effectively means sent == len, which raises the question of why the value is returned at all. I don't care about the wasted CPU cycles, but this mistake shows me that you've missed the point in exactly the area where you're having trouble.

As for read(), it seems that your code relies on the number of bytes actually read being equal to the number requested. If so, I suggest writing a counterpart allread() function and using it instead of plain read(). Exactly like allwrite().

Regards,
Eli
support
 
Posts: 802
Joined:

Re: problem with user_w_write_128_wren

Postby kevin »

allread() also does not help. Could anyone advise ?

https://gist.github.com/promach/9d185d35a6e6db0da10992a19c36f754

Note that if I don't disable fork, I receive far fewer pixels.

before read()
after read()
num_of_pixels_received = 40910
before read()
after read()
num_of_pixels_received = 40915
before read()
after read()
num_of_pixels_received = 40920
before read()
after read()
num_of_pixels_received = 40925
before read()
after read()
num_of_pixels_received = 40930
before read()
after read()
num_of_pixels_received = 40935
before read()
after read()
num_of_pixels_received = 40940
before read()


Code: Select all
// g++ -g -pedantic -Wall -Werror -Wextra -fsanitize=address host.cpp -o host `pkg-config --cflags --libs opencv`

#include <opencv2/core/core.hpp>
//#include <opencv2/imgcodecs/imgcodecs.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <unistd.h>
#include <fcntl.h>
#include <iostream>
#include <fstream>      // std::ifstream, std::ofstream
#include <string>
#include <sys/wait.h>
#include <errno.h>
#include <cmath>

using namespace cv;
using namespace std;
//#define FORK 1
//#define LOOPBACK 1
#define RGB2YUV 1

unsigned int image_width;
unsigned int image_height;

const unsigned int CHNL_NUM = 3;
const unsigned int RED_CHNL = 2;
const unsigned int GREEN_CHNL = 1;
const unsigned int BLUE_CHNL = 0;

const unsigned int STREAM_WIDTH = 128;
const unsigned int NUM_OF_BITS_PER_BYTE = 8;

const unsigned int PIXEL_VALUE_RANGE = 8; // number of bits occupied by [R, G, B] or [Y, U, V] respectively (8-bit unsigned integer for each components) , https://docs.microsoft.com/en-us/windows-hardware/drivers/display/yuv-rgb-data-range-conversions
const unsigned int NUM_OF_COMPONENTS_IN_A_PIXEL = 3; // input: [R, G, B]     output:[Y, U, V]

const unsigned int PIXEL_NUM_THAT_FITS_STREAM_WIDTH = 5;  // 128-bit stream can at most fits 5 pixels ((PIXEL_NUM_THAT_FITS_STREAM_WIDTH*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE) bits = 120 bits), each pixels contains R, G, B which are encoded in 8 bits for each of the three color components

// One input pixel as sent to the logic: three 8-bit colour components,
// laid out in memory in the order R, G, B.
struct RGB_packet {
  uint8_t R;  // red component
  uint8_t G;  // green component
  uint8_t B;  // blue component
};

// One output pixel as received from the logic: three 8-bit components,
// laid out in memory in the order Y, U, V.
struct YUV_packet {
  uint8_t Y;  // luma component
  uint8_t U;  // first chroma component
  uint8_t V;  // second chroma component
};


struct YUV_packet* rgb2yuv(struct RGB_packet rgb_input)  // convert rgb to yuv

   unsigned char R = rgb_input.R;
   unsigned char G = rgb_input.G;
   unsigned char B = rgb_input.B;

   int Y_temp, U_temp, V_temp;

   struct YUV_packet *yuv_result = (YUV_packet *)malloc(image_width * image_height * sizeof(unsigned char) * NUM_OF_COMPONENTS_IN_A_PIXEL);

   // https://en.wikipedia.org/wiki/YUV#Full_swing_for_BT.601
   Y_temp =  77*R + 150*G +  29*B;
   U_temp = -43*R -  84*G + 127*B;
   V_temp = 127*R - 106*G -  21*B;

   Y_temp = (Y_temp + 128) >> 8;
   U_temp = (U_temp + 128) >> 8;
   V_temp = (V_temp + 128) >> 8;

   yuv_result->Y = Y_temp;
   yuv_result->U = U_temp + 128;
   yuv_result->V = V_temp + 128;

   return yuv_result;
}

/*
   Plain write() may not write all bytes requested in the buffer, so
   allwrite() loops until all data was indeed written, or exits in
   case of failure, except for EINTR. The way the EINTR condition is
   handled is the standard way of making sure the process can be suspended
   with CTRL-Z and then continue running properly.

   The function has no return value, because it always succeeds (or exits
   instead of returning).

   The function doesn't expect to reach EOF either.
*/

// Writes exactly len bytes from buf to fd, retrying after short writes and
// after EINTR. Any other write() error, or a write() that returns 0,
// terminates the process with exit(1), so the function only ever returns
// once everything has been written.
void allwrite(int fd, unsigned char *buf, int len) {
  for (int done = 0; done < len; ) {
    const int n = write(fd, buf + done, len - done);

    if (n > 0) {
      done += n;  // partial progress: keep going from where we stopped
      continue;
    }

    if (n == 0) {
      fprintf(stderr, "Reached write EOF (?!)\n");
      exit(1);
    }

    // n < 0 from here on
    if (errno == EINTR)
      continue;  // interrupted by a signal: simply retry

    perror("allwrite() failed to write");
    exit(1);
  }
}

/*
   Plain read() may return fewer bytes than requested, so allread() loops
   until exactly len bytes have been stored in buf, or exits in case of
   failure, except for EINTR (after which the read is simply retried).

   The function has no return value, because it always succeeds (or exits
   instead of returning).

   The function doesn't expect to reach EOF either: a read() returning 0
   before len bytes have arrived is treated as a fatal error.
*/

void allread(int fd, unsigned char *buf, int len) {
  int recvd = 0;
  int rc;

  while (recvd < len) {
    rc = read(fd, buf + recvd, len - recvd);

    if ((rc < 0) && (errno == EINTR))
      continue;

    if (rc < 0) {
      perror("allread() failed to read");
      exit(1);
    }

    if (rc == 0) {
      fprintf(stderr, "Reached read EOF (?!)\n");
      exit(1);
    }

    recvd += rc;
  }
}

int main(int argc, char *argv[]) {

  int fdr, fdw;
  uint8_t *wr_buf, *rd_buf;

  #ifdef FORK
  int wait_status; // for wait()
  pid_t pid;
  #endif

  struct RGB_packet *tologic;
  struct YUV_packet *fromlogic;

  fdr = open("/dev/xillybus_read_128", O_RDONLY);  // will change to /dev/xillybus_read_128
  fdw = open("/dev/xillybus_write_128", O_WRONLY); // will change to /dev/xillybus_write_128

  if ((fdr < 0) || (fdw < 0)) {
    perror("Failed to open Xillybus device file(s)");
    exit(1);
  }

  // READ in an image file

  String imageName( "lena512color.tiff" ); // by default

  if( argc > 1)
  {
   imageName = argv[1];
  }

  Mat image;

  image = imread( imageName, IMREAD_COLOR ); // Read the file

  if( image.empty() )                      // Check for invalid input
  {
   cout <<  "Could not open or find the image" << std::endl ;
   return -1;
  }

  else
  {
   image_width = image.size().width;
   image_height = image.size().height;
  }

  namedWindow( "Original Image", WINDOW_AUTOSIZE );
  imshow( "Original Image", image );
   
  Mat rgbchannel[CHNL_NUM];
  // The actual splitting.
  split(image, rgbchannel);
   
  namedWindow("Red", WINDOW_AUTOSIZE);
  imshow("Red", rgbchannel[RED_CHNL]);
   
  namedWindow("Green", WINDOW_AUTOSIZE);
  imshow("Green", rgbchannel[GREEN_CHNL]);
   
  namedWindow("Blue", WINDOW_AUTOSIZE);
  imshow("Blue", rgbchannel[BLUE_CHNL]);

  waitKey(0);  // see all three split channels before feeding in the channel data to xillybus/RIFFA for hardware computation

  vector<RGB_packet> vTo(image_width * image_height);  // lena.tiff is sized as 3*512*512

  tologic = vTo.data();

  if (!tologic) {
   fprintf(stderr, "Failed to allocate memory\n");
   exit(1);
  }

  for(unsigned int pixel_index = 0; pixel_index < (image_width * image_height); pixel_index++)
  {
   tologic[pixel_index].R = *(rgbchannel[RED_CHNL].data + pixel_index);
   tologic[pixel_index].G = *(rgbchannel[GREEN_CHNL].data + pixel_index);
   tologic[pixel_index].B = *(rgbchannel[BLUE_CHNL].data + pixel_index);
  }
#ifdef FORK
  pid = fork();

  if (pid < 0) {
    perror("Failed to fork()");
    exit(1);
  }

  if (pid) {
   close(fdr);
#endif

   unsigned int num_of_pixels_sent = 0; // this is actual pixels number already sent, does not include the empty 8 bits
   //unsigned int if_index = 0;
   unsigned int rgb_stream_index = 0;
   uint8_t rgb_stream[STREAM_WIDTH/NUM_OF_BITS_PER_BYTE + 1];  // could accomodate 5 pixels

   while (num_of_pixels_sent < image_width * image_height)
   {
     if(((image_width * image_height)-num_of_pixels_sent) >= PIXEL_NUM_THAT_FITS_STREAM_WIDTH)
     {
      // arrange the five pixels in the format as in https://i.imgur.com/mdJwk7J.png
      //if_index++; //printf("if_index = %d\n\r", if_index);
      //if(if_index == 3) break;

      for(rgb_stream_index = 0; (rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1)<((STREAM_WIDTH/NUM_OF_BITS_PER_BYTE)-1); rgb_stream_index=rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL)
      {
         rgb_stream[rgb_stream_index] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].R;
         rgb_stream[rgb_stream_index+1] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].G;
         rgb_stream[rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].B;
      }

      rgb_stream[STREAM_WIDTH/NUM_OF_BITS_PER_BYTE] = '\0';  // however, this NULL character is not sent across write()
      rgb_stream[(STREAM_WIDTH/NUM_OF_BITS_PER_BYTE)-1] = 0;  // remember that the eight most significant bits of the 128-bits stream are ignored by hardware logic

      /*for(unsigned int j=0; j<(STREAM_WIDTH/NUM_OF_BITS_PER_BYTE); j++)      
      {
         printf("rgb_stream[%d] = %d\n\r", j, rgb_stream[j]); //break;
      }*/

      wr_buf = rgb_stream; // write() writes wr_buf in first-in-first-out order, so rgb_stream[0] will be written first into fdw as the least significant byte

        allwrite(fdw, wr_buf, STREAM_WIDTH/NUM_OF_BITS_PER_BYTE);  // this write() is 128-bits or 16 bytes which include the empty MSB 8 bits
      //printf("wr = %d\n", wr);
      num_of_pixels_sent = num_of_pixels_sent + PIXEL_NUM_THAT_FITS_STREAM_WIDTH;      
      //printf("num_of_pixels_sent = %d\n", num_of_pixels_sent);
     }

     else  // the remaining pixels do not fill all five pixel slots for a 128-bit stream
     {
      for(rgb_stream_index = 0; (rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1)<(((image_width * image_height)-num_of_pixels_sent)*NUM_OF_COMPONENTS_IN_A_PIXEL); rgb_stream_index=rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL)
      {
         rgb_stream[rgb_stream_index] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].R;
         rgb_stream[rgb_stream_index+1] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].G;
         rgb_stream[rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].B;
      }            

      rgb_stream[((image_width * image_height)-num_of_pixels_sent)*NUM_OF_COMPONENTS_IN_A_PIXEL + 1] = '\0';  // however, this NULL character is not sent across write()
      rgb_stream[((image_width * image_height)-num_of_pixels_sent)*NUM_OF_COMPONENTS_IN_A_PIXEL] = 0;  // remember that the eight most significant bits of the 128-bits stream are ignored by hardware logic

      /*for(unsigned int j=0; j<(((image_width * image_height)-num_of_pixels_sent)*NUM_OF_COMPONENTS_IN_A_PIXEL+1); j++)      
      {
         printf("rgb_stream[%d] = %d\n\r", j, rgb_stream[j]);
      }*/
      
      wr_buf = rgb_stream;  // this is a partially filled 128-bit stream (with less than 5 pixels)

      allwrite(fdw, wr_buf, ((image_width * image_height)-num_of_pixels_sent)*NUM_OF_COMPONENTS_IN_A_PIXEL+1);
      //printf("wr = %d\n", wr);
      break;  // finish sending all (image_width * image_height) pixels
     }
   }

   write(fdw, NULL, 0); // flush the write stream

   close(fdw);
#ifdef FORK
   printf("*** Write process enters waiting status .....\n");

   pid = wait(&wait_status);
   printf("*** write process detects read process with pid %d was done ***\n", pid);  // most probably write process will be done first, since FPGA computation takes a few clock cyles

   return 0;
  }

  else {
    close(fdw);
#endif
   vector<YUV_packet> vFrom(image_width * image_height);

    fromlogic = vFrom.data();
   //printf("fromlogic[0].Y = %p \n", &fromlogic[0].Y);
    if (!fromlogic) {
      fprintf(stderr, "Failed to allocate memory\n");
      exit(1);
    }

   unsigned int num_of_pixels_received = 0; // this is actual pixels number already received, does not include the empty 8 bits
   unsigned int yuv_stream_index = 0;
   uint8_t yuv_stream[STREAM_WIDTH/NUM_OF_BITS_PER_BYTE + 1];  // could accomodate 5 pixels

    while (num_of_pixels_received < image_width * image_height) {

     if(((image_width * image_height)-num_of_pixels_received) >= PIXEL_NUM_THAT_FITS_STREAM_WIDTH)
     {
        rd_buf = yuv_stream;
      printf("before read() \n");
      allread(fdr, rd_buf, STREAM_WIDTH/NUM_OF_BITS_PER_BYTE);
      printf("after read() \n");
      // For every five pixels (128 bits) received from hardware logic computation, print out the YUV values of all five pixels
      /*for(yuv_stream_index = 0; yuv_stream_index<STREAM_WIDTH/NUM_OF_BITS_PER_BYTE; yuv_stream_index=yuv_stream_index+1)
      {
         printf("yuv_stream[%d] = %d\n", yuv_stream_index, yuv_stream[yuv_stream_index]);
      }*/
      yuv_stream[STREAM_WIDTH/NUM_OF_BITS_PER_BYTE] = '\0';  // this NULL character is only to act as "stop bit" for character array

      //if(num_of_pixels_received == 300) break; // just to test if there is actually something being read, or returned from hardware

      // store the calculated output YUV pixels into fromlogic such that we could reconstruct the image in its original dimension for visual display
      for(yuv_stream_index = 0; (yuv_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1)<(STREAM_WIDTH/NUM_OF_BITS_PER_BYTE); yuv_stream_index=yuv_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL)
      {
         fromlogic[(yuv_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_received].Y = yuv_stream[yuv_stream_index];
         fromlogic[(yuv_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_received].U = yuv_stream[yuv_stream_index+1];
         fromlogic[(yuv_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_received].V = yuv_stream[yuv_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1];
      }

      num_of_pixels_received = num_of_pixels_received + PIXEL_NUM_THAT_FITS_STREAM_WIDTH;
      printf("num_of_pixels_received = %d\n\r", num_of_pixels_received);
     }     
     else // the remaining pixels do not fill all five pixel slots for a 128-bit stream
     {
        rd_buf = yuv_stream;
        printf("before read in else. \n");
      allread(fdr, rd_buf, image_width * image_height - num_of_pixels_received);  //  is a partially filled 128-bit stream (with less than 5 pixels)
      //printf("break in else. \n");
      break; // finish receiving all (image_width * image_height) pixels
     }
   }
   printf("before for loop\n");
    for (unsigned int i = 0; i < (image_width * image_height); i++)  // check the perfomance of hardware with respect to software computation
   {
   #ifdef LOOPBACK
      if( (tologic[i].R != fromlogic[i].Y) ||
         (tologic[i].G != fromlogic[i].U) ||
         (tologic[i].B != fromlogic[i].V) )
   #elif RGB2YUV
       uint8_t expected_Y = rgb2yuv(tologic[i])->Y;
       uint8_t expected_U = rgb2yuv(tologic[i])->U;
       uint8_t expected_V = rgb2yuv(tologic[i])->V;

      if( (abs(expected_Y - fromlogic[i].Y) > 1) ||
          (abs(expected_U - fromlogic[i].U) > 1) ||
          (abs(expected_V - fromlogic[i].V) > 1) ) // rgb2yuv conversion hardware tolerance fails by more than 1 compared to software computation
   #endif
      {
         printf("********************************* Attention *************************************\n\r");
         printf("R:%d G:%d B:%d \n\r", tologic[i].R, tologic[i].G, tologic[i].B);
            printf("Y:%d U:%d V:%d \n\r", fromlogic[i].Y, fromlogic[i].U, fromlogic[i].V);
         printf("expected_Y:%d expected_U:%d expected_V:%d \n\r", expected_Y, expected_U, expected_V);

         break; // just for troubleshooting
         //exit(1);
      }
   }

    close(fdr);
#ifdef FORK
   printf("*** Read process enters waiting status .....\n");

   pid = wait(&wait_status);
   printf("*** read process detects write process with pid %d was done ***\n", pid);  // most probably write process will be done first, since FPGA computation takes a few clock cyles

    return 0;
  }
#endif
  /*pid = wait(&wait_status);
  printf("*** Parent detects process %d is done ***\n", pid);
  printf("*** Parent exits ***\n");*/
  exit(0);
}
kevin
 
Posts: 43
Joined:

Re: problem with user_w_write_128_wren

Postby Guest »

It seems to me that the allwrite() function does not write all 512*512=262144 pixels: only 40960 pixels arrived at the input FIFO of the FPGA user logic, and only 40940 pixels departed from its output FIFO.

May I know why ?

https://gist.github.com/promach/a3af6c59906567c3df4179a501513a1b#file-xillydemo-v-L224-L233

Code: Select all
//`define LOOPBACK 1

// Top-level module: instantiates the Xillybus IP core and connects
//   * /dev/xillybus_mem_128            -> a small inferred RAM (demoarray)
//   * /dev/xillybus_write_128 (host->FPGA) -> input FIFO -> KERNEL_NUM "kernel" instances
//                                          -> output FIFO -> /dev/xillybus_read_128 (FPGA->host)
//   * a Vivado ILA (ila_0) probing the stream, FIFO and kernel handshake signals.
// The 256-bit stream signals are declared and wired to the core but are not used by any
// user logic inside this module.
module xillydemo(PCIE_PERST_B_LS, PCIE_REFCLK_N, PCIE_REFCLK_P, PCIE_RX_N, PCIE_RX_P, GPIO_LED, PCIE_TX_N, PCIE_TX_P);

    // Width in bits of the "128" streams; the 256-bit streams use (STREAM_WIDTH << 1)
    localparam STREAM_WIDTH = 128;
 
   input  PCIE_PERST_B_LS;
   input  PCIE_REFCLK_N;
   input  PCIE_REFCLK_P;
   input [7:0] PCIE_RX_N;
   input [7:0] PCIE_RX_P;
   output [3:0] GPIO_LED;
   output [7:0] PCIE_TX_N;
   output [7:0] PCIE_TX_P;
   
   // Clock and quiesce
   wire    bus_clk;
   wire    quiesce;
   
   // Memory array
   // NOTE(review): each entry is 8 bits wide and there are 32 entries, yet the array is
   // written from / read into STREAM_WIDTH-bit (128-bit) buses and indexed with a
   // $clog2(STREAM_WIDTH) = 7-bit address (see the inferred RAM below) -- confirm the
   // intended width and depth.
   reg [7:0]    demoarray[0:31];

   
   // Wires related to /dev/xillybus_mem_128
   wire       user_r_mem_128_rden;
   wire       user_r_mem_128_empty;
   reg [STREAM_WIDTH-1:0]  user_r_mem_128_data;
   wire       user_r_mem_128_eof;
   wire       user_r_mem_128_open;
   wire       user_w_mem_128_wren;
   wire       user_w_mem_128_full;
   wire [STREAM_WIDTH-1:0] user_w_mem_128_data;
   wire       user_w_mem_128_open;
   wire [$clog2(STREAM_WIDTH)-1:0] user_mem_128_addr;
   wire       user_mem_128_addr_update;

  // Wires related to /dev/xillybus_read_128
  wire  user_r_read_128_rden;
  wire  user_r_read_128_empty;
  wire [STREAM_WIDTH-1:0] user_r_read_128_data;
  wire  user_r_read_128_eof;
  wire  user_r_read_128_open;

  // Wires related to /dev/xillybus_write_128
  wire  user_w_write_128_wren;
  wire  user_w_write_128_full;
  wire [STREAM_WIDTH-1:0] user_w_write_128_data;
  wire  user_w_write_128_open;

   // Wires related to /dev/xillybus_read_256
   // NOTE(review): user_r_read_256_empty / _data / _eof have no driver in this module.
   wire       user_r_read_256_rden;
   wire       user_r_read_256_empty;
   wire [(STREAM_WIDTH << 1)-1:0] user_r_read_256_data;
   wire        user_r_read_256_eof;
   wire        user_r_read_256_open;

   // Wires related to /dev/xillybus_write_256
   // NOTE(review): user_w_write_256_full has no driver in this module.
   wire        user_w_write_256_wren;
   wire        user_w_write_256_full;
   wire [(STREAM_WIDTH << 1)-1:0] user_w_write_256_data;
   wire        user_w_write_256_open;


   // Xillybus IP core: exposes the host-side device files as FIFO-style user interfaces
   xillybus xillybus_ins (

           // Ports related to /dev/xillybus_mem_128
           // FPGA to CPU signals:
           .user_r_mem_128_rden(user_r_mem_128_rden),
           .user_r_mem_128_empty(user_r_mem_128_empty),
           .user_r_mem_128_data(user_r_mem_128_data),
           .user_r_mem_128_eof(user_r_mem_128_eof),
           .user_r_mem_128_open(user_r_mem_128_open),

           // CPU to FPGA signals:
           .user_w_mem_128_wren(user_w_mem_128_wren),
           .user_w_mem_128_full(user_w_mem_128_full),
           .user_w_mem_128_data(user_w_mem_128_data),
           .user_w_mem_128_open(user_w_mem_128_open),

           // Address signals:
           .user_mem_128_addr(user_mem_128_addr),
           .user_mem_128_addr_update(user_mem_128_addr_update),


           // Ports related to /dev/xillybus_read_256
           // FPGA to CPU signals:
           .user_r_read_256_rden(user_r_read_256_rden),
           .user_r_read_256_empty(user_r_read_256_empty),
           .user_r_read_256_data(user_r_read_256_data),
           .user_r_read_256_eof(user_r_read_256_eof),
           .user_r_read_256_open(user_r_read_256_open),

           // Ports related to /dev/xillybus_write_256
           // CPU to FPGA signals:
           .user_w_write_256_wren(user_w_write_256_wren),
           .user_w_write_256_full(user_w_write_256_full),
           .user_w_write_256_data(user_w_write_256_data),
           .user_w_write_256_open(user_w_write_256_open),

           // Ports related to /dev/xillybus_read_128
           // FPGA to CPU signals:
           .user_r_read_128_rden(user_r_read_128_rden),
           .user_r_read_128_empty(user_r_read_128_empty),
           .user_r_read_128_data(user_r_read_128_data),
           .user_r_read_128_eof(user_r_read_128_eof),
           .user_r_read_128_open(user_r_read_128_open),

           // Ports related to /dev/xillybus_write_128
           // CPU to FPGA signals:
           .user_w_write_128_wren(user_w_write_128_wren),
           .user_w_write_128_full(user_w_write_128_full),
           .user_w_write_128_data(user_w_write_128_data),
           .user_w_write_128_open(user_w_write_128_open),


           // Signals to top level
           .PCIE_PERST_B_LS(PCIE_PERST_B_LS),
           .PCIE_REFCLK_N(PCIE_REFCLK_N),
           .PCIE_REFCLK_P(PCIE_REFCLK_P),
           .PCIE_RX_N(PCIE_RX_N),
           .PCIE_RX_P(PCIE_RX_P),
           .GPIO_LED(GPIO_LED),
           .PCIE_TX_N(PCIE_TX_N),
           .PCIE_TX_P(PCIE_TX_P),
           .bus_clk(bus_clk),
           .quiesce(quiesce)
           );

   // A simple inferred RAM
   // Written on user_w_mem_128_wren and read (registered) on user_r_mem_128_rden, both at
   // the address supplied by the core.
   // NOTE(review): the 128-bit write data is assigned to an 8-bit entry (upper bits are
   // dropped) and the 8-bit entry is assigned to a 128-bit register on read; the 7-bit
   // address can also exceed the declared 0..31 range -- confirm this is intended.
   always @(posedge bus_clk)
     begin
   if (user_w_mem_128_wren)
     demoarray[user_mem_128_addr] <= user_w_mem_128_data;
   
   if (user_r_mem_128_rden)
     user_r_mem_128_data <= demoarray[user_mem_128_addr];    
     end

   // The memory interface is always ready: never empty, never full, never EOF
   assign  user_r_mem_128_empty = 0;
   assign  user_r_mem_128_eof = 0;
   assign  user_w_mem_128_full = 0;

//`ifdef LOOPBACK

  //wire [$clog2(STREAM_WIDTH)-1:0] data_count_of_loopback_fifo;

  // 128-bit loopback
  /* fifo_128 fifo_128x128
     (
      .clk(bus_clk),
      .reset(!user_w_write_128_open && !user_r_read_128_open),
      .flush_en(0),
      .value_i(user_w_write_128_data),
      .enqueue_en(user_w_write_128_wren),
      .dequeue_en(user_r_read_128_rden),
      .value_o(user_r_read_128_data),
      .full(user_w_write_128_full),
      .empty(user_r_read_128_empty),
      .count(data_count_of_loopback_fifo)
      );
   
   

   assign  user_r_read_128_eof = 0;*/
   
//`else
// Signals for ($floor((STREAM_WIDTH/PIXEL_VALUE_RANGE)/NUM_OF_COMPONENTS_IN_A_PIXEL) = 5) kernels
// since an image pixel is unsigned 8-bit integer, its component values of [R, G, B] or [Y, U, V] range from 0 to 255.
// A pixel occupies 3*8=24 bits.  Therefore, in each transaction, we could at most put 5 pixels (120 bits) into /dev/xillybus_write_128,
// computes the relevant kernel equations for 5 pixels, send out 5 pixels again through /dev/xillybus_read_128

   
    localparam NUM_OF_COMPONENTS_IN_A_PIXEL = 3; // input: [R, G, B]  , output:[Y, U, V]
    localparam PIXEL_VALUE_RANGE = 8; // number of bits occupied by [R, G, B] and [Y, U, V] respectively (8-bit unsigned integer for each components) , https://docs.microsoft.com/en-us/windows-hardware/drivers/display/yuv-rgb-data-range-conversions
    localparam KERNEL_NUM = 5;  // 5 copies of kernel, each kernel computes equation for [R, G, B] of one single pixel
    localparam TOTAL_NUM_OF_PIXELS = 512*512; // lena.tiff is sized as 3*512*512 , width=512 and height=512

// Signals for two buffer FIFOs 
    localparam FIFO_DEPTH = 16;
   
    // Both counts are wired to the `.count` outputs of the FIFO instances below.
    // NOTE(review): presumably `.count` is the number of 128-bit words held in the FIFO
    // (not the number of pixels) -- confirm against the fifo_fwft_128 / fifo_128 source.
    wire   [$clog2(FIFO_DEPTH):0] data_count_of_input_fifo;  // determines whether all five pixel slots have incoming data or not
    wire   [$clog2(FIFO_DEPTH):0] data_count_of_output_fifo;
   
//-------------------------------------------kernel----------------------------------------//
   
    wire   [STREAM_WIDTH-1:0] stream_i_V_V_dout;  // Read data for 5 pixels or KERNEL_NUM*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE = 120 bits (note that we neglected the most significant 8 bits)
    wire   stream_i_V_V_empty;  // Empty condition
    wire   [KERNEL_NUM*NUM_OF_COMPONENTS_IN_A_PIXEL-1:0] stream_i_V_V_read; // Read enable for each color components of all five pixels, high active
   
    wire   [STREAM_WIDTH-1:0] stream_o_V_V_din;  // Write data for 5 pixels or KERNEL_NUM*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE = 120 bits (note that we neglected the most significant 8 bits)
    wire   stream_o_V_V_full;  // Full condition
    wire   [KERNEL_NUM*NUM_OF_COMPONENTS_IN_A_PIXEL-1:0] stream_o_V_V_write;  // Write enable for each color components of all five pixels, high active

    // High whenever the input FIFO count is below KERNEL_NUM and the host is not writing in this cycle.
    wire   is_last_few_pixels = (data_count_of_input_fifo < KERNEL_NUM) && (!user_w_write_128_wren);  // the remaining pixels do not fill all five pixel slots for a 128-bit stream, AND the input FIFO is not accepting any more pixels (in contrary when it is filling in the input FIFO at the initial time) AND the input FIFO is outputting pixels

    reg    [KERNEL_NUM-1:0] ap_start = 0;  // initially the HLS kernels are not started   
   wire   [KERNEL_NUM-1:0] ap_done;
    wire   [KERNEL_NUM-1:0] ap_idle;
    wire   [KERNEL_NUM-1:0] ap_ready;

    // One start bit per kernel, registered on bus_clk.
    // NOTE(review): in the is_last_few_pixels case the raw binary FIFO count is loaded into
    // the per-kernel start bits (e.g. a count of 3 = 5'b00011 starts kernels 0 and 1, and a
    // count of 0 starts none) -- confirm that a bit mask rather than a count was not intended.
    always @(posedge bus_clk)
        ap_start <= (is_last_few_pixels) ? (data_count_of_input_fifo) : {KERNEL_NUM{(&stream_i_V_V_read || !stream_i_V_V_empty)}}; // start signals depend on whether all five pixel slots are filled or not     

// -----------------input FIFO ----------------------------------//

   // Buffers 128-bit words from /dev/xillybus_write_128 for the kernels.
   // Held in reset only while BOTH device files are closed.
   // A word is dequeued only in cycles where all KERNEL_NUM*NUM_OF_COMPONENTS_IN_A_PIXEL
   // read strobes are high at the same time (AND-reduction).
   // NOTE(review): this assumes every kernel asserts its three read strobes in the same
   // cycle -- confirm against the HLS-generated kernel.
   fifo_fwft_128
   #(
        .WIDTH(STREAM_WIDTH),
        .SIZE(FIFO_DEPTH)
   )
   input_pipe(
         .clk(bus_clk),
         .reset(!user_w_write_128_open && !user_r_read_128_open),
         .flush_en(0),
         .value_i(user_w_write_128_data),
         .enqueue_en(user_w_write_128_wren),
         .dequeue_en(&stream_i_V_V_read),
         .value_o(stream_i_V_V_dout),
         .full(user_w_write_128_full),
         .empty(stream_i_V_V_empty),
         .count(data_count_of_input_fifo)           
   );

   // to check if xillybus has transmitted all pixels data to the input_pipe fifo
   // Debug counter (only observed through the ILA): cleared while both device files are
   // closed, incremented by KERNEL_NUM on every write strobe, and stops incrementing once
   // it exceeds TOTAL_NUM_OF_PIXELS-KERNEL_NUM.
   reg [$clog2(TOTAL_NUM_OF_PIXELS)-1:0] number_of_pixels_received_by_input_fifo = 0;  // initially nothing is received
   
   always@ (posedge bus_clk) begin
        if(!user_w_write_128_open && !user_r_read_128_open)
            number_of_pixels_received_by_input_fifo <= 0;
   
        else if(user_w_write_128_wren && (number_of_pixels_received_by_input_fifo <= (TOTAL_NUM_OF_PIXELS-KERNEL_NUM)))
            number_of_pixels_received_by_input_fifo <= number_of_pixels_received_by_input_fifo + KERNEL_NUM;  // for every xillybus transaction, input fifo should receive 'KERNEL_NUM' pieces of pixels
   end

// use of generate loop to replicate 5 hardware copies of RGB2YUV computational HLS kernels for 5 different pixels

   // Kernel kn handles the 24-bit slice [kn*24+23 : kn*24] of the 128-bit word:
   // bits [kn*24+7 : kn*24] on port 0, [kn*24+15 : kn*24+8] on port 1 and
   // [kn*24+23 : kn*24+16] on port 2, for both the input and the output word.
   // All kernels share the same empty/full flags and the same reset.
   generate
        genvar kn;  // to indicate which kernel
       
        for(kn=0; kn<KERNEL_NUM; kn=kn+1) begin       
           kernel RGB2YUV_kn (
                   .ap_clk(bus_clk),
                   .ap_rst(!user_w_write_128_open && !user_r_read_128_open),
                   .ap_start(ap_start[kn]),  // need to confirm ?
                   .ap_done(ap_done[kn]),
                   .ap_idle(ap_idle[kn]),
                   .ap_ready(ap_ready[kn]),
                   .stream_i0_V_V_dout(stream_i_V_V_dout[kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + PIXEL_VALUE_RANGE - 1 : kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE]),  // input component R with (PIXEL_VALUE_RANGE) bits
                   .stream_i0_V_V_empty_n(!stream_i_V_V_empty),
                   .stream_i0_V_V_read(stream_i_V_V_read[kn*NUM_OF_COMPONENTS_IN_A_PIXEL]),
                   .stream_i1_V_V_dout(stream_i_V_V_dout[kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)*PIXEL_VALUE_RANGE - 1 : kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + PIXEL_VALUE_RANGE]),  // input component G with (PIXEL_VALUE_RANGE) bits
                   .stream_i1_V_V_empty_n(!stream_i_V_V_empty),
                   .stream_i1_V_V_read(stream_i_V_V_read[kn*NUM_OF_COMPONENTS_IN_A_PIXEL + 1]),
                   .stream_i2_V_V_dout(stream_i_V_V_dout[kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE - 1 : kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)*PIXEL_VALUE_RANGE]),  // input component B with (PIXEL_VALUE_RANGE) bits
                   .stream_i2_V_V_empty_n(!stream_i_V_V_empty),
                   .stream_i2_V_V_read(stream_i_V_V_read[kn*NUM_OF_COMPONENTS_IN_A_PIXEL + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)]),
                   .stream_o0_V_V_din(stream_o_V_V_din[kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + PIXEL_VALUE_RANGE - 1 : kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE]),  // output component Y with (PIXEL_VALUE_RANGE) bits
                   .stream_o0_V_V_full_n(!stream_o_V_V_full),
                   .stream_o0_V_V_write(stream_o_V_V_write[kn*NUM_OF_COMPONENTS_IN_A_PIXEL]),
                   .stream_o1_V_V_din(stream_o_V_V_din[kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)*PIXEL_VALUE_RANGE - 1 : kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + PIXEL_VALUE_RANGE]),  // output component U with (PIXEL_VALUE_RANGE) bits
                   .stream_o1_V_V_full_n(!stream_o_V_V_full),
                   .stream_o1_V_V_write(stream_o_V_V_write[kn*NUM_OF_COMPONENTS_IN_A_PIXEL + 1]),
                   .stream_o2_V_V_din(stream_o_V_V_din[kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE - 1 : kn*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)*PIXEL_VALUE_RANGE]),  // output component V with (PIXEL_VALUE_RANGE) bits
                   .stream_o2_V_V_full_n(!stream_o_V_V_full),
                   .stream_o2_V_V_write(stream_o_V_V_write[kn*NUM_OF_COMPONENTS_IN_A_PIXEL + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)])
            );
        end
    endgenerate

    // Bits [127:120] are not driven by any kernel, so tie them to zero
    assign stream_o_V_V_din[STREAM_WIDTH-1 : (KERNEL_NUM-1)*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE + NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE] = 0;  // (note that we neglected the most significant 8 bits)

//----------------------output FIFO-----------------------------//   
    // Buffers the kernels' 128-bit result words for /dev/xillybus_read_128.
    // A word is enqueued only in cycles where all KERNEL_NUM*NUM_OF_COMPONENTS_IN_A_PIXEL
    // write strobes are high at the same time (AND-reduction).
    // NOTE(review): as for the input FIFO, this assumes all kernels strobe in the same
    // cycle -- confirm against the HLS-generated kernel.
    fifo_128
    #(
         .WIDTH(STREAM_WIDTH),
         .SIZE(FIFO_DEPTH)
    ) 
    output_pipe (
         .clk(bus_clk),
         .reset(!user_w_write_128_open && !user_r_read_128_open),
         .flush_en(0),
         .value_i(stream_o_V_V_din),
         .enqueue_en(&stream_o_V_V_write),
         .dequeue_en(user_r_read_128_rden),
         .value_o(user_r_read_128_data),
         .full(stream_o_V_V_full),
         .empty(user_r_read_128_empty),
         .count(data_count_of_output_fifo)   
   );

   // to check if xillybus has transmitted all pixels data from the output_pipe fifo
   // Debug counter (only observed through the ILA): same scheme as the input-side counter,
   // but incremented on the core's read strobe.
   reg [$clog2(TOTAL_NUM_OF_PIXELS)-1:0] number_of_pixels_sent_by_output_fifo = 0;  // initially nothing is sent
   
   always@ (posedge bus_clk) begin
        if(!user_w_write_128_open && !user_r_read_128_open)
            number_of_pixels_sent_by_output_fifo <= 0;
   
        else if(user_r_read_128_rden && (number_of_pixels_sent_by_output_fifo <= (TOTAL_NUM_OF_PIXELS-KERNEL_NUM)))
            number_of_pixels_sent_by_output_fifo <= number_of_pixels_sent_by_output_fifo + KERNEL_NUM;  // for every xillybus transaction, output fifo should send 'KERNEL_NUM' pieces of pixels
   end
   
   // EOF is never signalled to the host on /dev/xillybus_read_128
   assign  user_r_read_128_eof = 0;

    // Vivado built-in internal logic analyzer module instantiation
   
    ila_0 ila(
        .clk(bus_clk),
        .probe0(user_w_write_128_data),
        .probe1(stream_i_V_V_dout),
        .probe2(stream_o_V_V_din),
        .probe3(user_r_read_128_data),
        .probe4(stream_i_V_V_read), 
        .probe5(stream_o_V_V_write),
        .probe6(data_count_of_input_fifo),
        .probe7(data_count_of_output_fifo),
        .probe8(user_w_write_128_full),
        .probe9(stream_i_V_V_empty),
        .probe10(user_w_write_128_wren),
        .probe11(user_r_read_128_rden),
        .probe12(stream_o_V_V_full),
        .probe13(user_r_read_128_empty),
        .probe14(user_w_write_128_open),
        .probe15(user_r_read_128_open),
        .probe16(ap_start),
        .probe17(ap_done),
        .probe18(ap_idle),
        .probe19(ap_ready),
        .probe20(is_last_few_pixels),
        .probe21(number_of_pixels_received_by_input_fifo),
        .probe22(number_of_pixels_sent_by_output_fifo)
    );
//`endif
   
endmodule


I have uploaded the ILA waveform trace at https://gist.github.com/promach/92b624787edd1c9178c32839fe38736e

Image
Guest
 

Re: problem with user_w_write_128_wren

Postby Guest »

With strace on, my C++ code could only receive 1725 pixels, as shown in line 59524 of https://paste.ubuntu.com/p/J5kbTGr6Tm/

There is definitely a timing-related bug here. Could anyone advise?
Guest
 

Re: problem with user_w_write_128_wren

Postby Guest »

Using a single allwrite() and a single allread() does not work either, although I can now receive way more than 40,000 pixels. Sometimes the FPGA receives 90 percent (240,000) of the total number of pixels, sometimes only 40 percent (100,000).

But the FPGA NEVER receives 100 percent of the pixels. Note: I am sending all zeroes, which means I am sending a black picture.

Let me use the default loopback demo code and see whether this is really a host C++ programming issue or a Verilog issue.

https://gist.githubusercontent.com/promach/9d185d35a6e6db0da10992a19c36f754/raw/d69c7de1178224afa0b370f2db6105b5a610f80d/test.cpp

Code: Select all
// g++ -g -pedantic -Wall -Werror -Wextra -fsanitize=address test.cpp -o test

#include <unistd.h>
#include <fcntl.h>
#include <iostream>
#include <fstream>
#include <errno.h>

using namespace std;

const unsigned int image_width = 510;
const unsigned int image_height = 510;

const unsigned int NUM_OF_COMPONENTS_IN_A_PIXEL = 3; // input: [R, G, B]     output:[Y, U, V]

/**
 * Writes exactly `len` bytes from `buf` to `fd`, retrying after short writes
 * and after calls interrupted by a signal (EINTR).
 *
 * Terminates the process with exit(1) if write() reports any other error or
 * returns 0. Does nothing (write() is never called) when `len` <= 0.
 *
 * @param fd   open file descriptor to write to
 * @param buf  start of the data to send
 * @param len  number of bytes to send
 */
void allwrite(int fd, unsigned char *buf, int len) {
  for (int done = 0; done < len; ) {
    // Offer the whole unsent tail; the kernel may accept only part of it.
    const int n = write(fd, buf + done, len - done);

    if (n > 0) {
      done += n;
      continue;
    }

    if (n == 0) {
      fprintf(stderr, "Reached write EOF (?!)\n");
      exit(1);
    }

    // n < 0: an interrupted call is simply retried, anything else is fatal.
    if (errno == EINTR)
      continue;

    perror("allwrite() failed to write");
    exit(1);
  }
}

/**
 * Reads exactly `len` bytes from `fd` into `buf`, retrying after short reads
 * and after calls interrupted by a signal (EINTR).
 *
 * Terminates the process with exit(1) if read() reports any other error, or
 * if it returns 0 (end of file) before `len` bytes have arrived. Does nothing
 * (read() is never called) when `len` <= 0.
 *
 * @param fd   open file descriptor to read from
 * @param buf  destination buffer, at least `len` bytes long
 * @param len  number of bytes to receive
 */
void allread(int fd, unsigned char *buf, int len) {
  int recvd = 0;

  while (recvd < len) {
    // Ask for everything still missing; read() may deliver only part of it.
    const ssize_t rc = read(fd, buf + recvd, len - recvd);

    if ((rc < 0) && (errno == EINTR))
      continue;

    if (rc < 0) {
      // Report the failure as a read failure (the message used to name allwrite()).
      perror("allread() failed to read");
      exit(1);
    }

    if (rc == 0) {
      fprintf(stderr, "Reached read EOF (?!)\n");
      exit(1);
    }

    recvd += rc;
  }
}

int main() {

   int fdr, fdw;
   uint8_t *wr_buf, *rd_buf;

   fdr = open("/dev/xillybus_read_128", O_RDONLY);
   fdw = open("/dev/xillybus_write_128", O_WRONLY);

   if ((fdr < 0) || (fdw < 0)) {
       perror("Failed to open Xillybus device file(s)");
       exit(1);
   }      

   uint8_t rgb_stream[image_width*image_width*NUM_OF_COMPONENTS_IN_A_PIXEL + 1];  // could accomodate all (image_width*image_width) RGB pixels
   rgb_stream[image_width*image_width*NUM_OF_COMPONENTS_IN_A_PIXEL] = '\0';  // however, this NULL character is not sent across write()

   for(unsigned int rgb_index=0; rgb_index<(image_width*image_width*NUM_OF_COMPONENTS_IN_A_PIXEL); rgb_index++)
   {
      rgb_stream[rgb_index] = 0;  // send all zeroes to fpga
   }

   wr_buf = rgb_stream;
   allwrite(fdw, wr_buf, image_width*image_width*NUM_OF_COMPONENTS_IN_A_PIXEL);

   allwrite(fdw, NULL, 0); // flush the write stream
   printf("after allwrite() \n");
   close(fdw);

   uint8_t yuv_stream[image_width*image_width*NUM_OF_COMPONENTS_IN_A_PIXEL + 1];  // could accomodate (image_width*image_width) YUV pixels
   yuv_stream[image_width*image_width*NUM_OF_COMPONENTS_IN_A_PIXEL] = '\0';  // this NULL character is only to act as "stop bit" for character array

   rd_buf = yuv_stream;
   printf("before allread() \n");
   allread(fdr, rd_buf, image_width*image_width*NUM_OF_COMPONENTS_IN_A_PIXEL);
   printf("after allread() \n");
   close(fdr);   
}
Guest
 

PreviousNext

Return to Xillybus