problem with user_w_write_128_wren

Questions and discussions about the Xillybus IP core and drivers

problem with user_w_write_128_wren

Postby kevin » Mon May 21, 2018 7:46 am

I am using revision XL for the UltraScale VCU108 board. With regard to https://gist.github.com/promach/9d185d35a6e6db0da10992a19c36f754 and https://gist.github.com/promach/a3af6c59906567c3df4179a501513a1b , do you have any idea why the user_w_write_128_wren signal for the input FIFO is only asserted once, as shown below?

Code: Select all
//`define LOOPBACK 1

// xillydemo: top level that connects the Xillybus PCIe core to
//   * a small inferred RAM behind /dev/xillybus_mem_128, and
//   * an input FIFO -> KERNEL_NUM "kernel" instances -> output FIFO pipeline that sits
//     between /dev/xillybus_write_128 (host to FPGA) and /dev/xillybus_read_128
//     (FPGA to host).
// The 256-bit stream wires are declared and connected to the core, but no logic in
// this module drives or consumes them.
// NOTE(review): user_r_read_256_empty/_data/_eof and user_w_write_256_full are never
// driven here; confirm whether they should be tied off.
// NOTE(review): xillybus, fifo_128, kernel and ila_0 are defined outside this file;
// what their ports do is inferred from the port names only.
module xillydemo(PCIE_PERST_B_LS, PCIE_REFCLK_N, PCIE_REFCLK_P, PCIE_RX_N, PCIE_RX_P, GPIO_LED, PCIE_TX_N, PCIE_TX_P);

   localparam STREAM_WIDTH = 128;

   input        PCIE_PERST_B_LS;
   input        PCIE_REFCLK_N;
   input        PCIE_REFCLK_P;
   input  [7:0] PCIE_RX_N;
   input  [7:0] PCIE_RX_P;
   output [3:0] GPIO_LED;
   output [7:0] PCIE_TX_N;
   output [7:0] PCIE_TX_P;

   // Clock and quiesce
   wire    bus_clk;
   wire    quiesce;

   // Memory array.
   // Each entry holds one full STREAM_WIDTH-bit word and there is one entry per value
   // of user_mem_128_addr. The previous declaration (reg [7:0] demoarray[0:31]) kept
   // only the low 8 bits of every 128-bit word written and was shallower than the
   // address range.
   localparam MEM_ADDR_WIDTH = $clog2(STREAM_WIDTH);   // width of user_mem_128_addr
   reg [STREAM_WIDTH-1:0] demoarray[0:(1 << MEM_ADDR_WIDTH)-1];

   // Wires related to /dev/xillybus_mem_128
   wire                      user_r_mem_128_rden;
   wire                      user_r_mem_128_empty;
   reg  [STREAM_WIDTH-1:0]   user_r_mem_128_data;
   wire                      user_r_mem_128_eof;
   wire                      user_r_mem_128_open;
   wire                      user_w_mem_128_wren;
   wire                      user_w_mem_128_full;
   wire [STREAM_WIDTH-1:0]   user_w_mem_128_data;
   wire                      user_w_mem_128_open;
   wire [MEM_ADDR_WIDTH-1:0] user_mem_128_addr;
   wire                      user_mem_128_addr_update;

   // Wires related to /dev/xillybus_read_128
   wire                      user_r_read_128_rden;
   wire                      user_r_read_128_empty;
   wire [STREAM_WIDTH-1:0]   user_r_read_128_data;
   wire                      user_r_read_128_eof;
   wire                      user_r_read_128_open;

   // Wires related to /dev/xillybus_write_128
   wire                      user_w_write_128_wren;
   wire                      user_w_write_128_full;
   wire [STREAM_WIDTH-1:0]   user_w_write_128_data;
   wire                      user_w_write_128_open;

   // Wires related to /dev/xillybus_read_256
   wire                           user_r_read_256_rden;
   wire                           user_r_read_256_empty;
   wire [(STREAM_WIDTH << 1)-1:0] user_r_read_256_data;
   wire                           user_r_read_256_eof;
   wire                           user_r_read_256_open;

   // Wires related to /dev/xillybus_write_256
   wire                           user_w_write_256_wren;
   wire                           user_w_write_256_full;
   wire [(STREAM_WIDTH << 1)-1:0] user_w_write_256_data;
   wire                           user_w_write_256_open;


   xillybus xillybus_ins (

           // Ports related to /dev/xillybus_mem_128
           // FPGA to CPU signals:
           .user_r_mem_128_rden(user_r_mem_128_rden),
           .user_r_mem_128_empty(user_r_mem_128_empty),
           .user_r_mem_128_data(user_r_mem_128_data),
           .user_r_mem_128_eof(user_r_mem_128_eof),
           .user_r_mem_128_open(user_r_mem_128_open),

           // CPU to FPGA signals:
           .user_w_mem_128_wren(user_w_mem_128_wren),
           .user_w_mem_128_full(user_w_mem_128_full),
           .user_w_mem_128_data(user_w_mem_128_data),
           .user_w_mem_128_open(user_w_mem_128_open),

           // Address signals:
           .user_mem_128_addr(user_mem_128_addr),
           .user_mem_128_addr_update(user_mem_128_addr_update),


           // Ports related to /dev/xillybus_read_256
           // FPGA to CPU signals:
           .user_r_read_256_rden(user_r_read_256_rden),
           .user_r_read_256_empty(user_r_read_256_empty),
           .user_r_read_256_data(user_r_read_256_data),
           .user_r_read_256_eof(user_r_read_256_eof),
           .user_r_read_256_open(user_r_read_256_open),

           // Ports related to /dev/xillybus_write_256
           // CPU to FPGA signals:
           .user_w_write_256_wren(user_w_write_256_wren),
           .user_w_write_256_full(user_w_write_256_full),
           .user_w_write_256_data(user_w_write_256_data),
           .user_w_write_256_open(user_w_write_256_open),

           // Ports related to /dev/xillybus_read_128
           // FPGA to CPU signals:
           .user_r_read_128_rden(user_r_read_128_rden),
           .user_r_read_128_empty(user_r_read_128_empty),
           .user_r_read_128_data(user_r_read_128_data),
           .user_r_read_128_eof(user_r_read_128_eof),
           .user_r_read_128_open(user_r_read_128_open),

           // Ports related to /dev/xillybus_write_128
           // CPU to FPGA signals:
           .user_w_write_128_wren(user_w_write_128_wren),
           .user_w_write_128_full(user_w_write_128_full),
           .user_w_write_128_data(user_w_write_128_data),
           .user_w_write_128_open(user_w_write_128_open),


           // Signals to top level
           .PCIE_PERST_B_LS(PCIE_PERST_B_LS),
           .PCIE_REFCLK_N(PCIE_REFCLK_N),
           .PCIE_REFCLK_P(PCIE_REFCLK_P),
           .PCIE_RX_N(PCIE_RX_N),
           .PCIE_RX_P(PCIE_RX_P),
           .GPIO_LED(GPIO_LED),
           .PCIE_TX_N(PCIE_TX_N),
           .PCIE_TX_P(PCIE_TX_P),
           .bus_clk(bus_clk),
           .quiesce(quiesce)
           );

   // A simple inferred RAM: synchronous write and registered read, both addressed
   // by user_mem_128_addr.
   always @(posedge bus_clk)
     begin
        if (user_w_mem_128_wren)
          demoarray[user_mem_128_addr] <= user_w_mem_128_data;

        if (user_r_mem_128_rden)
          user_r_mem_128_data <= demoarray[user_mem_128_addr];
     end

   // The RAM never reports empty, end-of-file or full to the core.
   assign  user_r_mem_128_empty = 0;
   assign  user_r_mem_128_eof = 0;
   assign  user_w_mem_128_full = 0;

//`ifdef LOOPBACK

   // Only referenced by the commented-out loopback FIFO below.
   wire [$clog2(STREAM_WIDTH)-1:0] data_count_of_loopback_fifo;

   // 128-bit loopback
   /* fifo_128 fifo_128x128
      (
       .clk(bus_clk),
       .reset(!user_w_write_128_open && !user_r_read_128_open),
       .flush_en(0),
       .value_i(user_w_write_128_data),
       .enqueue_en(user_w_write_128_wren),
       .dequeue_en(user_r_read_128_rden),
       .value_o(user_r_read_128_data),
       .full(user_w_write_128_full),
       .empty(user_r_read_128_empty),
       .count(data_count_of_loopback_fifo)
       );



    assign  user_r_read_128_eof = 0;*/

//`else
// Signals for ($floor((STREAM_WIDTH/PIXEL_VALUE_RANGE)/NUM_OF_COMPONENTS_IN_A_PIXEL) = 5) kernels
// since an image pixel is unsigned 8-bit integer, its component values of [R, G, B] or [Y, U, V] range from 0 to 255.
// A pixel occupies 3*8=24 bits.  Therefore, in each transaction, we could at most put 5 pixels (120 bits) into /dev/xillybus_write_128,
// compute the relevant kernel equations for 5 pixels, and send out 5 pixels again through /dev/xillybus_read_128


   localparam NUM_OF_COMPONENTS_IN_A_PIXEL = 3; // input: [R, G, B]  , output:[Y, U, V]
   localparam PIXEL_VALUE_RANGE = 8; // number of bits occupied by [R, G, B] and [Y, U, V] respectively (8-bit unsigned integer for each components) , https://docs.microsoft.com/en-us/windows-hardware/drivers/display/yuv-rgb-data-range-conversions
   localparam KERNEL_NUM = 5;  // 5 copies of kernel, each kernel computes equation for [R, G, B] of one single pixel
   localparam PIXEL_BITS = NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE;  // bits occupied by one pixel (24)


// Signals for two buffer FIFOs
   wire   [$clog2(STREAM_WIDTH)-1:0] data_count_of_input_fifo;  // determines whether all five pixel slots have incoming data or not
   wire   [$clog2(STREAM_WIDTH)-1:0] data_count_of_output_fifo;

   // High when the input FIFO count is below KERNEL_NUM and the host is not writing
   // in this cycle.
   wire   is_last_few_pixels = (data_count_of_input_fifo < KERNEL_NUM) && (!user_w_write_128_wren);  // the remaining pixels do not fill all five pixel slots for a 128-bit stream, and the input FIFO is not accepting any more pixels (in contrary when it is filling in the input FIFO at the initial time)


//-------------------------------------------kernel----------------------------------------//

   wire   [STREAM_WIDTH-1:0] stream_i_V_V_dout;  // Read data for 5 pixels or KERNEL_NUM*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE = 120 bits (note that we neglected the most significant 8 bits)
   wire   stream_i_V_V_empty;  // Empty condition
   wire   [KERNEL_NUM*NUM_OF_COMPONENTS_IN_A_PIXEL-1:0] stream_i_V_V_read; // Read enable for each color components of all five pixels, high active

   wire   [STREAM_WIDTH-1:0] stream_o_V_V_din;  // Write data for 5 pixels or KERNEL_NUM*NUM_OF_COMPONENTS_IN_A_PIXEL*PIXEL_VALUE_RANGE = 120 bits (note that we neglected the most significant 8 bits)
   wire   stream_o_V_V_full;  // Full condition
   wire   [KERNEL_NUM*NUM_OF_COMPONENTS_IN_A_PIXEL-1:0] stream_o_V_V_write;  // Write enable for each color components of all five pixels, high active

   // NOTE(review): when is_last_few_pixels is high, the low KERNEL_NUM bits of the
   // FIFO count are used directly as the per-kernel start mask. The count is a
   // binary number, not a one-bit-per-kernel mask -- confirm this is intended.
   wire   [KERNEL_NUM-1:0] ap_start = (is_last_few_pixels) ? (data_count_of_input_fifo) : {KERNEL_NUM{1'b1}}; // start signals depend on whether all five pixel slots are filled or not
   wire   [KERNEL_NUM-1:0] ap_done;
   wire   [KERNEL_NUM-1:0] ap_idle;
   wire   [KERNEL_NUM-1:0] ap_ready;


// -----------------input FIFO ----------------------------------//
   localparam FIFO_DEPTH = 16;

   // Buffers words written by the host. A word is dequeued only in a cycle where
   // every bit of stream_i_V_V_read is high (reduction AND).
   fifo_128
   #(
        .WIDTH(STREAM_WIDTH),
        .SIZE(FIFO_DEPTH)
   )
   input_pipe(
         .clk(bus_clk),
         .reset(!user_w_write_128_open && !user_r_read_128_open),
         .flush_en(0),
         .value_i(user_w_write_128_data),
         .enqueue_en(user_w_write_128_wren),
         .dequeue_en(&stream_i_V_V_read),
         .value_o(stream_i_V_V_dout),
         .full(user_w_write_128_full),
         .empty(stream_i_V_V_empty),
         .count(data_count_of_input_fifo)
   );

// use of generate loop to replicate 5 hardware copies of RGB2YUV computational HLS kernels for 5 different pixels
// Kernel kn uses bits [kn*PIXEL_BITS +: PIXEL_BITS] of both the input and the output
// word: component 0 (R / Y) in the lowest PIXEL_VALUE_RANGE bits, then component 1
// (G / U), then component 2 (B / V). Bits [127:120] are not connected to any kernel.

   generate
        genvar kn;  // to indicate which kernel

        for(kn=0; kn<KERNEL_NUM; kn=kn+1) begin
           kernel RGB2YUV_kn (
                   .ap_clk(bus_clk),
                   .ap_rst(!user_w_write_128_open && !user_r_read_128_open),
                   .ap_start(ap_start[kn]),  // need to confirm ?
                   .ap_done(ap_done[kn]),
                   .ap_idle(ap_idle[kn]),
                   .ap_ready(ap_ready[kn]),
                   .stream_i0_V_V_dout(stream_i_V_V_dout[kn*PIXEL_BITS +: PIXEL_VALUE_RANGE]),  // input component R with (PIXEL_VALUE_RANGE) bits
                   .stream_i0_V_V_empty_n(!stream_i_V_V_empty),
                   .stream_i0_V_V_read(stream_i_V_V_read[kn*NUM_OF_COMPONENTS_IN_A_PIXEL]),
                   .stream_i1_V_V_dout(stream_i_V_V_dout[kn*PIXEL_BITS + PIXEL_VALUE_RANGE +: PIXEL_VALUE_RANGE]),  // input component G with (PIXEL_VALUE_RANGE) bits
                   .stream_i1_V_V_empty_n(!stream_i_V_V_empty),
                   .stream_i1_V_V_read(stream_i_V_V_read[kn*NUM_OF_COMPONENTS_IN_A_PIXEL + 1]),
                   .stream_i2_V_V_dout(stream_i_V_V_dout[kn*PIXEL_BITS + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)*PIXEL_VALUE_RANGE +: PIXEL_VALUE_RANGE]),  // input component B with (PIXEL_VALUE_RANGE) bits
                   .stream_i2_V_V_empty_n(!stream_i_V_V_empty),
                   .stream_i2_V_V_read(stream_i_V_V_read[kn*NUM_OF_COMPONENTS_IN_A_PIXEL + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)]),
                   .stream_o0_V_V_din(stream_o_V_V_din[kn*PIXEL_BITS +: PIXEL_VALUE_RANGE]),  // output component Y with (PIXEL_VALUE_RANGE) bits
                   .stream_o0_V_V_full_n(!stream_o_V_V_full),
                   .stream_o0_V_V_write(stream_o_V_V_write[kn*NUM_OF_COMPONENTS_IN_A_PIXEL]),
                   .stream_o1_V_V_din(stream_o_V_V_din[kn*PIXEL_BITS + PIXEL_VALUE_RANGE +: PIXEL_VALUE_RANGE]),  // output component U with (PIXEL_VALUE_RANGE) bits
                   .stream_o1_V_V_full_n(!stream_o_V_V_full),
                   .stream_o1_V_V_write(stream_o_V_V_write[kn*NUM_OF_COMPONENTS_IN_A_PIXEL + 1]),
                   .stream_o2_V_V_din(stream_o_V_V_din[kn*PIXEL_BITS + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)*PIXEL_VALUE_RANGE +: PIXEL_VALUE_RANGE]),  // output component V with (PIXEL_VALUE_RANGE) bits
                   .stream_o2_V_V_full_n(!stream_o_V_V_full),
                   .stream_o2_V_V_write(stream_o_V_V_write[kn*NUM_OF_COMPONENTS_IN_A_PIXEL + (NUM_OF_COMPONENTS_IN_A_PIXEL-1)])
            );
        end
   endgenerate

//----------------------output FIFO-----------------------------//
   // Buffers kernel results for the host. A word is enqueued only in a cycle where
   // every bit of stream_o_V_V_write is high (reduction AND).
   fifo_128
   #(
        .WIDTH(STREAM_WIDTH),
        .SIZE(FIFO_DEPTH)
   )
   output_pipe (
         .clk(bus_clk),
         .reset(!user_w_write_128_open && !user_r_read_128_open),
         .flush_en(0),
         .value_i(stream_o_V_V_din),
         .enqueue_en(&stream_o_V_V_write),
         .dequeue_en(user_r_read_128_rden),
         .value_o(user_r_read_128_data),
         .full(stream_o_V_V_full),
         .empty(user_r_read_128_empty),
         .count(data_count_of_output_fifo)
   );

   assign  user_r_read_128_eof = 0;

   // Vivado built-in internal logic analyzer module instantiation

   ila_0 ila(
        .clk(bus_clk),
        .probe0(user_w_write_128_data),
        .probe1(stream_i_V_V_dout),
        .probe2(stream_o_V_V_din),
        .probe3(user_r_read_128_data),
        .probe4(stream_i_V_V_read),
        .probe5(stream_o_V_V_write),
        .probe6(data_count_of_input_fifo),
        .probe7(data_count_of_output_fifo),
        .probe8(user_w_write_128_full),
        .probe9(stream_i_V_V_empty),
        .probe10(user_w_write_128_wren),
        .probe11(user_r_read_128_rden),
        .probe12(stream_o_V_V_full),
        .probe13(user_r_read_128_empty),
        .probe14(user_w_write_128_open),
        .probe15(user_r_read_128_open),
        .probe16(ap_start),
        .probe17(ap_done),
        .probe18(ap_idle),
        .probe19(ap_ready),
        .probe20(is_last_few_pixels)
   );
//`endif

endmodule


Code: Select all
// g++ -g -fsanitize=address host.cpp -o host `pkg-config --cflags --libs opencv`

#include <opencv2/core/core.hpp>
//#include <opencv2/imgcodecs/imgcodecs.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <unistd.h>
#include <fcntl.h>
#include <iostream>
#include <fstream>      // std::ifstream, std::ofstream
#include <string>
#include <sys/wait.h>
#include <errno.h>
#include <cmath>

using namespace cv;
using namespace std;

//#define LOOPBACK 1
#define RGB2YUV 1

// Image dimensions in pixels; both stay zero until main() stores the loaded image's size.
unsigned int image_width = 0;
unsigned int image_height = 0;

// Number of colour planes, and the position of each plane in the array filled by split().
const unsigned int CHNL_NUM = 3u;
const unsigned int BLUE_CHNL = 0u;
const unsigned int GREEN_CHNL = 1u;
const unsigned int RED_CHNL = 2u;

// Width of one Xillybus stream word in bits, and the number of bits in a byte.
const unsigned int STREAM_WIDTH = 128u;
const unsigned int NUM_OF_BITS_PER_BYTE = 8u;

// Each pixel has three components ([R, G, B] on input, [Y, U, V] on output), and every
// component is an 8-bit unsigned integer,
// https://docs.microsoft.com/en-us/windows-hardware/drivers/display/yuv-rgb-data-range-conversions
const unsigned int NUM_OF_COMPONENTS_IN_A_PIXEL = 3u;
const unsigned int PIXEL_VALUE_RANGE = 8u;

// A 128-bit word holds at most 5 whole pixels:
// PIXEL_NUM_THAT_FITS_STREAM_WIDTH * NUM_OF_COMPONENTS_IN_A_PIXEL * PIXEL_VALUE_RANGE = 120 bits.
const unsigned int PIXEL_NUM_THAT_FITS_STREAM_WIDTH = 5u;

// One input pixel: three 8-bit colour components.
struct RGB_packet{
  uint8_t R,G,B;
};

// One output pixel: three 8-bit components.
struct YUV_packet{
  uint8_t Y,U,V;
};


// Software reference conversion of a single RGB pixel to YUV.
//
// Returns a pointer to a freshly malloc()'ed YUV_packet; the caller owns it and
// should free() it. Exits the process if the allocation fails.
//
// U and V can be negative before they are stored (for example when B < Y). They are
// truncated toward zero to an int and then reduced modulo 256 into the uint8_t
// field, because converting a negative double directly to an unsigned type is
// undefined behaviour.
struct YUV_packet* rgb2yuv(struct RGB_packet rgb_input)  // convert rgb to yuv
{
   unsigned char R = rgb_input.R;
   unsigned char G = rgb_input.G;
   unsigned char B = rgb_input.B;

   // Only one packet is returned, so allocate exactly one (not a whole image's worth).
   struct YUV_packet *yuv_result = (YUV_packet *)malloc(sizeof(struct YUV_packet));

   if (!yuv_result) {
      perror("rgb2yuv: failed to allocate memory");
      exit(1);
   }

   // Y is a weighted average of three values in [0, 255], so it fits in uint8_t.
   yuv_result->Y = static_cast<uint8_t>(0.299*R + 0.587*G + 0.114*B);
   yuv_result->U = static_cast<uint8_t>(static_cast<int>(0.492*(B-yuv_result->Y)));
   yuv_result->V = static_cast<uint8_t>(static_cast<int>(0.877*(R-yuv_result->Y)));

   // https://www.pcmag.com/encyclopedia/term/55166/yuv-rgb-conversion-formulas

   return yuv_result;
}

int main(int argc, char *argv[]) {

  int fdr, fdw, rd, wr, rd_donebytes, wr_donebytes;
  uint8_t *wr_buf, *rd_buf;
  pid_t pid;
  struct RGB_packet *tologic;
  struct YUV_packet *fromlogic;

  fdr = open("/dev/xillybus_read_128", O_RDONLY);  // will change to /dev/xillybus_read_128
  fdw = open("/dev/xillybus_write_128", O_WRONLY); // will change to /dev/xillybus_write_128

  if ((fdr < 0) || (fdw < 0)) {
    perror("Failed to open Xillybus device file(s)");
    exit(1);
  }

  // READ in an image file

  String imageName( "lena512color.tiff" ); // by default

  if( argc > 1)
  {
   imageName = argv[1];
  }

  Mat image;

  image = imread( imageName, IMREAD_COLOR ); // Read the file

  if( image.empty() )                      // Check for invalid input
  {
   cout <<  "Could not open or find the image" << std::endl ;
   return -1;
  }

  else
  {
   image_width = image.size().width;
   image_height = image.size().height;
  }

  namedWindow( "Original Image", WINDOW_AUTOSIZE );
  imshow( "Original Image", image );
   
  Mat rgbchannel[CHNL_NUM];
  // The actual splitting.
  split(image, rgbchannel);
   
  namedWindow("Red", WINDOW_AUTOSIZE);
  imshow("Red", rgbchannel[RED_CHNL]);
   
  namedWindow("Green", WINDOW_AUTOSIZE);
  imshow("Green", rgbchannel[GREEN_CHNL]);
   
  namedWindow("Blue", WINDOW_AUTOSIZE);
  imshow("Blue", rgbchannel[BLUE_CHNL]);

  waitKey(0);  // see all three split channels before feeding in the channel data to xillybus/RIFFA for hardware computation

  vector<RGB_packet> vTo(image_width * image_height);  // lena.tiff is sized as 3*512*512

  tologic = vTo.data();

  if (!tologic) {
   fprintf(stderr, "Failed to allocate memory\n");
   exit(1);
  }

  for(unsigned int pixel_index = 0; pixel_index < (image_width * image_height); pixel_index++)
  {
   tologic[pixel_index].R = *(rgbchannel[RED_CHNL].data + pixel_index);
   tologic[pixel_index].G = *(rgbchannel[GREEN_CHNL].data + pixel_index);
   tologic[pixel_index].B = *(rgbchannel[BLUE_CHNL].data + pixel_index);
  }

  pid = fork();

  if (pid < 0) {
    perror("Failed to fork()");
    exit(1);
  }

  if (pid) {
   close(fdr);

    wr_donebytes = 0;  // this variable includes the empty 8 bits for the MSB
   unsigned int num_of_pixels_sent = 0; // this is actual pixels number already sent, does not include the emtpy 8 bits
   unsigned int if_index = 0;
   unsigned int rgb_stream_index = 0;
   uint8_t rgb_stream[STREAM_WIDTH/NUM_OF_BITS_PER_BYTE + 1];  // could accomodate 5 pixels

   while (num_of_pixels_sent < image_width * image_height)
   {
     if(((image_width * image_height)-num_of_pixels_sent) >= PIXEL_NUM_THAT_FITS_STREAM_WIDTH)
     {
      // arrange the five pixels in the format as in https://i.imgur.com/mdJwk7J.png
      //if_index++; printf("if_index = %d\n\r", if_index);
      

      for(rgb_stream_index = 1; (rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1)<(STREAM_WIDTH/NUM_OF_BITS_PER_BYTE); rgb_stream_index=rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL)
      {
         rgb_stream[rgb_stream_index] = tologic[((rgb_stream_index-1)/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].B;
         rgb_stream[rgb_stream_index+1] = tologic[((rgb_stream_index-1)/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].G;
         rgb_stream[rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1] = tologic[((rgb_stream_index-1)/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].R;
      }

      rgb_stream[STREAM_WIDTH/NUM_OF_BITS_PER_BYTE] = '\0';  // however, this NULL character is not sent across write()
      rgb_stream[0] = 0;  // remember that the eight most significant bits of the 128-bits stream are ignored by hardware logic

      for(unsigned int j=0; j<(STREAM_WIDTH/NUM_OF_BITS_PER_BYTE); j++)      
      {
         printf("rgb_stream[%d] = %d\n\r", j, rgb_stream[j]);
      }

      wr_buf = rgb_stream;

        wr = write(fdw, wr_buf, STREAM_WIDTH/NUM_OF_BITS_PER_BYTE);  // this write() is 128-bits or 16 bytes which include the empty MSB 8 bits

      num_of_pixels_sent = num_of_pixels_sent + PIXEL_NUM_THAT_FITS_STREAM_WIDTH;      
     }

     else  // the remaining pixels do not fill all five pixel slots for a 128-bit stream
     {
      for(rgb_stream_index = 1; (rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1)<(((image_width * image_height)-num_of_pixels_sent)*NUM_OF_COMPONENTS_IN_A_PIXEL + 1); rgb_stream_index=rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL)
      {
         rgb_stream[rgb_stream_index] = tologic[((rgb_stream_index-1)/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].B;
         rgb_stream[rgb_stream_index+1] = tologic[((rgb_stream_index-1)/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].G;
         rgb_stream[rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1] = tologic[((rgb_stream_index-1)/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].R;
      }            

      rgb_stream[((image_width * image_height)-num_of_pixels_sent)*NUM_OF_COMPONENTS_IN_A_PIXEL + 1] = '\0';  // however, this NULL character is not sent across write()
      rgb_stream[0] = 0;  // remember that the eight most significant bits of the 128-bits stream are ignored by hardware logic

      /*for(unsigned int j=0; j<(((image_width * image_height)-num_of_pixels_sent)*NUM_OF_COMPONENTS_IN_A_PIXEL+1); j++)      
      {
         printf("rgb_stream[%d] = %d\n\r", j, rgb_stream[j]);
      }*/
      
      wr_buf = rgb_stream;  // this is a partially filled 128-bit stream (with less than 5 pixels)

      wr = write(fdw, wr_buf, ((image_width * image_height)-num_of_pixels_sent)*NUM_OF_COMPONENTS_IN_A_PIXEL);

      break;  // finish sending all (image_width * image_height) pixels
     }

     if ((wr < 0) && (errno == EINTR))
      continue;

     if (wr <= 0) {
      perror("write() failed");
      exit(1);
     }

     wr_donebytes += wr;
   }

   sleep(1); // Let debug output drain (if used)

   close(fdw);

   return 0;
  }

  else {
    close(fdw);

   vector<YUV_packet> vFrom(image_width * image_height);

    fromlogic = vFrom.data();
   //printf("fromlogic[0].Y = %p \n", &fromlogic[0].Y);
    if (!fromlogic) {
      fprintf(stderr, "Failed to allocate memory\n");
      exit(1);
    }

    rd_buf = (uint8_t *) fromlogic;
    rd_donebytes = 0;  // this variable includes the empty 8 bits for the MSB

   unsigned int num_of_pixels_received = 0; // this is actual pixels number already received, does not include the empty 8 bits
   unsigned int yuv_stream_index = 0;
   uint8_t *yuv_stream;

    while (num_of_pixels_received < image_width * image_height) {

     if(((image_width * image_height)-num_of_pixels_received) >= PIXEL_NUM_THAT_FITS_STREAM_WIDTH)
     {
        yuv_stream = rd_buf + num_of_pixels_received;
        //printf("before read() \n");
      rd = read(fdr, yuv_stream, STREAM_WIDTH/NUM_OF_BITS_PER_BYTE);
      
      num_of_pixels_received = num_of_pixels_received + PIXEL_NUM_THAT_FITS_STREAM_WIDTH;
      //printf("num_of_pixels_received = %d\n\r", num_of_pixels_received);
      // For every five pixels (128 bits) received from hardware logic computation, print out the YUV values of all five pixels
      /*for(yuv_stream_index = 1; (yuv_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1)<(((image_width * image_height)-num_of_pixels_received)*NUM_OF_COMPONENTS_IN_A_PIXEL + 1); yuv_stream_index=yuv_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL)
      {
         printf("yuv_stream[%d] = %d\n", yuv_stream_index, fromlogic[((yuv_stream_index-1)/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_received].V );
         printf("yuv_stream[%d] = %d\n", yuv_stream_index+1, fromlogic[((yuv_stream_index-1)/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_received].U );
         printf("yuv_stream[%d] = %d\n", yuv_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1, fromlogic[((yuv_stream_index-1)/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_received].Y );

         //break; // just to test if there is actually something being read, or returned from hardware.
      }*/

      //break; // just to test if there is actually something being read, or returned from hardware.
     }     
     else // the remaining pixels do not fill all five pixel slots for a 128-bit stream
     {
        yuv_stream = rd_buf + num_of_pixels_received;
        //printf("before read in else. \n");
      rd = read(fdr, yuv_stream, image_width * image_height - num_of_pixels_received);  //  is a partially filled 128-bit stream (with less than 5 pixels)
      //printf("break in else. \n");
      break; // finish receiving all (image_width * image_height) pixels
     }

      if ((rd < 0) && (errno == EINTR))
      continue;

      if (rd < 0) {
      perror("read() failed");
      exit(1);
      }

      if (rd == 0) {
      fprintf(stderr, "Reached read EOF!? Should never happen.\n");
      exit(0);
      }

      rd_donebytes += rd;
   }
   //printf("before for loop\n");
    for (unsigned int i = 0; i < (image_width * image_height); i++)  // check the perfomance of hardware with respect to software computation
   {
   #ifdef LOOPBACK
      if( (tologic[i].R != fromlogic[i].Y) ||
         (tologic[i].G != fromlogic[i].U) ||
         (tologic[i].B != fromlogic[i].V) )
   #elif RGB2YUV
       uint8_t expected_Y = rgb2yuv(tologic[i])->Y;
       uint8_t expected_U = rgb2yuv(tologic[i])->U;
       uint8_t expected_V = rgb2yuv(tologic[i])->V;

      if( (abs(expected_Y - fromlogic[i].Y) > 1) ||
          (abs(expected_U - fromlogic[i].U) > 1) ||
          (abs(expected_V - fromlogic[i].V) > 1) ) // rgb2yuv conversion hardware tolerance fails by more than 1 compared to software computation
   #endif
      {
         printf("********************************* Attention *************************************\n\r");
         printf("R:%d G:%d B:%d \n\r", tologic[i].R, tologic[i].G, tologic[i].B);
            printf("Y:%d U:%d V:%d \n\r", fromlogic[i].Y, fromlogic[i].U, fromlogic[i].V);
         printf("expected_Y:%d expected_U:%d expected_V:%d \n\r", expected_Y, expected_U, expected_V);

         break; // just for troubleshooting
         //exit(1);
      }
   }

    sleep(1); // Let debug output drain (if used)

    close(fdr);

    return 0;
  }
}


Image
kevin
 
Posts: 40
Joined: Tue Dec 12, 2017 10:41 am

Re: problem with user_w_write_128_wren

Postby support » Mon May 21, 2018 8:12 am

Hello,

As I've mentioned on another thread (viewtopic.php?f=4&t=585) you must always check how many bytes were actually written, and consider it completely normal that you get less than expected. In the Xillybus sample programs, there is a function called allwrite() which shows how to do what you actually want: Write all data before returning. Maybe try using it instead of write(). I can see that you count the number of bytes written, but you do nothing with the result.

And once you have that done, I suggest putting some printf()'s after your write, so you can see how much data should have arrived to the FPGA. That's a start.

Regards,
Eli
support
 
Posts: 613
Joined: Tue Apr 24, 2012 3:46 pm

Re: problem with user_w_write_128_wren

Postby kevin » Mon May 21, 2018 8:25 am

I have the following code segment for your reference.

Code: Select all
// Send one stream word and print what write() returned (the number of bytes it
// accepted, or -1 on error).
wr = write(fdw, wr_buf, STREAM_WIDTH/NUM_OF_BITS_PER_BYTE);  // this write() is 128-bits or 16 bytes which include the empty MSB 8 bits
printf("wr = %d\n", wr);
// Zero-length write, used here as an explicit flush; its return value is not checked.
write(fdw, NULL, 0); // flush the write stream


I have the correct wr printout which is 16 (STREAM_WIDTH/NUM_OF_BITS_PER_BYTE = 128/8). This means I have written all 16 bytes to /dev/xillybus_write_128

By the way, I have also added an explicit flush after the write call, in case you noticed it.
kevin
 
Posts: 40
Joined: Tue Dec 12, 2017 10:41 am

Re: problem with user_w_write_128_wren

Postby support » Mon May 21, 2018 8:33 am

Hello,

The fact that you got it right on one occasion doesn't mean it will go on that way. Seriously. The check and possibly re-write should be in the code.

Besides, did it repeat itself...? Did you write several such words?

Besides II: If you wrote 16 bytes and flushed, you should expect exactly one wr_en on the FPGA side, as viewed on the ILA. Even if you loop, it will take a long time, in FPGA terms, until the next word arrives.

Regards,
Eli
support
 
Posts: 613
Joined: Tue Apr 24, 2012 3:46 pm

Re: problem with user_w_write_128_wren

Postby kevin » Mon May 21, 2018 12:35 pm

If I uncomment the printf() at line 293 of https://gist.github.com/promach/9d185d35a6e6db0da10992a19c36f754#file-host-cpp-L293 , data_count_of_input_fifo equals 5 instead of 11.

May I know why ?

Code: Select all
// g++ -g -fsanitize=address host.cpp -o host `pkg-config --cflags --libs opencv`

#include <opencv2/core/core.hpp>
//#include <opencv2/imgcodecs/imgcodecs.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <unistd.h>
#include <fcntl.h>
#include <iostream>
#include <fstream>      // std::ifstream, std::ofstream
#include <string>
#include <sys/wait.h>
#include <errno.h>
#include <cmath>

using namespace cv;
using namespace std;

//#define LOOPBACK 1
#define RGB2YUV 1

// Image dimensions in pixels; both stay zero until main() stores the loaded image's size.
unsigned int image_width = 0;
unsigned int image_height = 0;

// Number of colour planes, and the position of each plane in the array filled by split().
const unsigned int CHNL_NUM = 3u;
const unsigned int BLUE_CHNL = 0u;
const unsigned int GREEN_CHNL = 1u;
const unsigned int RED_CHNL = 2u;

// Width of one Xillybus stream word in bits, and the number of bits in a byte.
const unsigned int STREAM_WIDTH = 128u;
const unsigned int NUM_OF_BITS_PER_BYTE = 8u;

// Each pixel has three components ([R, G, B] on input, [Y, U, V] on output), and every
// component is an 8-bit unsigned integer,
// https://docs.microsoft.com/en-us/windows-hardware/drivers/display/yuv-rgb-data-range-conversions
const unsigned int NUM_OF_COMPONENTS_IN_A_PIXEL = 3u;
const unsigned int PIXEL_VALUE_RANGE = 8u;

// A 128-bit word holds at most 5 whole pixels:
// PIXEL_NUM_THAT_FITS_STREAM_WIDTH * NUM_OF_COMPONENTS_IN_A_PIXEL * PIXEL_VALUE_RANGE = 120 bits.
const unsigned int PIXEL_NUM_THAT_FITS_STREAM_WIDTH = 5u;

// One input pixel: three 8-bit colour components.
struct RGB_packet{
  uint8_t R,G,B;
};

// One output pixel: three 8-bit components.
struct YUV_packet{
  uint8_t Y,U,V;
};


// Software reference conversion of a single RGB pixel to YUV.
//
// Returns a pointer to a freshly malloc()'ed YUV_packet; the caller owns it and
// should free() it. Exits the process if the allocation fails.
//
// U and V can be negative before they are stored (for example when B < Y). They are
// truncated toward zero to an int and then reduced modulo 256 into the uint8_t
// field, because converting a negative double directly to an unsigned type is
// undefined behaviour.
struct YUV_packet* rgb2yuv(struct RGB_packet rgb_input)  // convert rgb to yuv
{
   unsigned char R = rgb_input.R;
   unsigned char G = rgb_input.G;
   unsigned char B = rgb_input.B;

   // Only one packet is returned, so allocate exactly one (not a whole image's worth).
   struct YUV_packet *yuv_result = (YUV_packet *)malloc(sizeof(struct YUV_packet));

   if (!yuv_result) {
      perror("rgb2yuv: failed to allocate memory");
      exit(1);
   }

   // Y is a weighted average of three values in [0, 255], so it fits in uint8_t.
   yuv_result->Y = static_cast<uint8_t>(0.299*R + 0.587*G + 0.114*B);
   yuv_result->U = static_cast<uint8_t>(static_cast<int>(0.492*(B-yuv_result->Y)));
   yuv_result->V = static_cast<uint8_t>(static_cast<int>(0.877*(R-yuv_result->Y)));

   // https://www.pcmag.com/encyclopedia/term/55166/yuv-rgb-conversion-formulas

   return yuv_result;
}

/*
   Plain write() may not write all bytes requested in the buffer, so
   allwrite() loops until all data was indeed written, or exits in
   case of failure, except for EINTR. The way the EINTR condition is
   handled is the standard way of making sure the process can be suspended
   with CTRL-Z and then continue running properly.

   Parameters:
     fd  - file descriptor to write to
     buf - start of the data to send
     len - number of bytes to send

   Returns the number of bytes written. Because the function exits the
   process instead of returning on failure, this is always len (or 0 when
   len is zero or negative, in which case nothing is written).

   The function doesn't expect to reach EOF either: a write() that
   returns 0 is treated as fatal.
*/

int allwrite(int fd, unsigned char *buf, int len) {
  int sent = 0;
  int rc;

  while (sent < len) {
    rc = write(fd, buf + sent, len - sent);

    // Interrupted by a signal before anything was written: just retry.
    if ((rc < 0) && (errno == EINTR))
      continue;

    if (rc < 0) {
      perror("allwrite() failed to write");
      exit(1);
    }

    if (rc == 0) {
      fprintf(stderr, "Reached write EOF (?!)\n");
      exit(1);
    }

    sent += rc;
  }

  // The function is declared int, so it must return a value; flowing off the end
  // of a non-void function is undefined behaviour in C++.
  return sent;
}

int main(int argc, char *argv[]) {

  int fdr, fdw, rd, wr, rd_donebytes, wr_donebytes;
  uint8_t *wr_buf, *rd_buf;
  pid_t pid;
  struct RGB_packet *tologic;
  struct YUV_packet *fromlogic;

  fdr = open("/dev/xillybus_read_128", O_RDONLY);  // will change to /dev/xillybus_read_128
  fdw = open("/dev/xillybus_write_128", O_WRONLY); // will change to /dev/xillybus_write_128

  if ((fdr < 0) || (fdw < 0)) {
    perror("Failed to open Xillybus device file(s)");
    exit(1);
  }

  // READ in an image file

  String imageName( "lena512color.tiff" ); // by default

  if( argc > 1)
  {
   imageName = argv[1];
  }

  Mat image;

  image = imread( imageName, IMREAD_COLOR ); // Read the file

  if( image.empty() )                      // Check for invalid input
  {
   cout <<  "Could not open or find the image" << std::endl ;
   return -1;
  }

  else
  {
   image_width = image.size().width;
   image_height = image.size().height;
  }

  namedWindow( "Original Image", WINDOW_AUTOSIZE );
  imshow( "Original Image", image );
   
  Mat rgbchannel[CHNL_NUM];
  // The actual splitting.
  split(image, rgbchannel);
   
  namedWindow("Red", WINDOW_AUTOSIZE);
  imshow("Red", rgbchannel[RED_CHNL]);
   
  namedWindow("Green", WINDOW_AUTOSIZE);
  imshow("Green", rgbchannel[GREEN_CHNL]);
   
  namedWindow("Blue", WINDOW_AUTOSIZE);
  imshow("Blue", rgbchannel[BLUE_CHNL]);

  waitKey(0);  // see all three split channels before feeding in the channel data to xillybus/RIFFA for hardware computation

  vector<RGB_packet> vTo(image_width * image_height);  // lena.tiff is sized as 3*512*512

  tologic = vTo.data();

  if (!tologic) {
   fprintf(stderr, "Failed to allocate memory\n");
   exit(1);
  }

  for(unsigned int pixel_index = 0; pixel_index < (image_width * image_height); pixel_index++)
  {
   tologic[pixel_index].R = *(rgbchannel[RED_CHNL].data + pixel_index);
   tologic[pixel_index].G = *(rgbchannel[GREEN_CHNL].data + pixel_index);
   tologic[pixel_index].B = *(rgbchannel[BLUE_CHNL].data + pixel_index);
  }

  pid = fork();

  if (pid < 0) {
    perror("Failed to fork()");
    exit(1);
  }

  if (pid) {
   close(fdr);

    wr_donebytes = 0;  // this variable includes the empty 8 bits for the MSB
   unsigned int num_of_pixels_sent = 0; // this is actual pixels number already sent, does not include the emtpy 8 bits
   unsigned int if_index = 0;
   unsigned int rgb_stream_index = 0;
   uint8_t rgb_stream[STREAM_WIDTH/NUM_OF_BITS_PER_BYTE + 1];  // could accomodate 5 pixels

   while (num_of_pixels_sent < image_width * image_height)
   {
     if(((image_width * image_height)-num_of_pixels_sent) >= PIXEL_NUM_THAT_FITS_STREAM_WIDTH)
     {
      // arrange the five pixels in the format as in https://i.imgur.com/mdJwk7J.png
      //if_index++; printf("if_index = %d\n\r", if_index);
      

      for(rgb_stream_index = 0; (rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1)<((STREAM_WIDTH/NUM_OF_BITS_PER_BYTE)-1); rgb_stream_index=rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL)
      {
         rgb_stream[rgb_stream_index] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].R;
         rgb_stream[rgb_stream_index+1] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].G;
         rgb_stream[rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].B;
      }

      rgb_stream[STREAM_WIDTH/NUM_OF_BITS_PER_BYTE] = '\0';  // however, this NULL character is not sent across write()
      rgb_stream[(STREAM_WIDTH/NUM_OF_BITS_PER_BYTE)-1] = 0;  // remember that the eight most significant bits of the 128-bits stream are ignored by hardware logic

      /*for(unsigned int j=0; j<(STREAM_WIDTH/NUM_OF_BITS_PER_BYTE); j++)      
      {
         printf("rgb_stream[%d] = %d\n\r", j, rgb_stream[j]); //break;
      }*/

      wr_buf = rgb_stream; // write() writes wr_buf in first-in-first-out order, so rgb_stream[0] will be written first into fdw as the least significant byte

        wr = allwrite(fdw, wr_buf, STREAM_WIDTH/NUM_OF_BITS_PER_BYTE);  // this write() is 128-bits or 16 bytes which include the empty MSB 8 bits
      printf("wr = %d\n", wr);
      num_of_pixels_sent = num_of_pixels_sent + PIXEL_NUM_THAT_FITS_STREAM_WIDTH;      
      printf("num_of_pixels_sent = %d\n", num_of_pixels_sent);
     }

     else  // the remaining pixels do not fill all five pixel slots for a 128-bit stream
     {
      for(rgb_stream_index = 0; (rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1)<(((image_width * image_height)-num_of_pixels_sent)*NUM_OF_COMPONENTS_IN_A_PIXEL); rgb_stream_index=rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL)
      {
         rgb_stream[rgb_stream_index] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].R;
         rgb_stream[rgb_stream_index+1] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].G;
         rgb_stream[rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].B;
      }            

      rgb_stream[((image_width * image_height)-num_of_pixels_sent)*NUM_OF_COMPONENTS_IN_A_PIXEL + 1] = '\0';  // however, this NULL character is not sent across write()
      rgb_stream[((image_width * image_height)-num_of_pixels_sent)*NUM_OF_COMPONENTS_IN_A_PIXEL] = 0;  // remember that the eight most significant bits of the 128-bits stream are ignored by hardware logic

      for(unsigned int j=0; j<(((image_width * image_height)-num_of_pixels_sent)*NUM_OF_COMPONENTS_IN_A_PIXEL+1); j++)      
      {
         printf("rgb_stream[%d] = %d\n\r", j, rgb_stream[j]);
      }
      
      wr_buf = rgb_stream;  // this is a partially filled 128-bit stream (with less than 5 pixels)

      wr = allwrite(fdw, wr_buf, ((image_width * image_height)-num_of_pixels_sent)*NUM_OF_COMPONENTS_IN_A_PIXEL+1);
      //write(fdw, NULL, 0); // flush the write stream
      printf("wr = %d\n", wr);
      break;  // finish sending all (image_width * image_height) pixels
     }

     if ((wr < 0) && (errno == EINTR))
      continue;

     if (wr <= 0) {
      perror("write() failed");
      exit(1);
     }

     wr_donebytes += wr;
   }

   sleep(1); // Let debug output drain (if used)

   close(fdw);

   return 0;
  }

  else {
    close(fdw);

   vector<YUV_packet> vFrom(image_width * image_height);

    fromlogic = vFrom.data();
   //printf("fromlogic[0].Y = %p \n", &fromlogic[0].Y);
    if (!fromlogic) {
      fprintf(stderr, "Failed to allocate memory\n");
      exit(1);
    }

    rd_buf = (uint8_t *) fromlogic;
    rd_donebytes = 0;  // this variable includes the empty 8 bits for the MSB

   unsigned int num_of_pixels_received = 0; // this is actual pixels number already received, does not include the empty 8 bits
   unsigned int yuv_stream_index = 0;
   uint8_t *yuv_stream;

    while (num_of_pixels_received < image_width * image_height) {

     if(((image_width * image_height)-num_of_pixels_received) >= PIXEL_NUM_THAT_FITS_STREAM_WIDTH)
     {
        yuv_stream = rd_buf + num_of_pixels_received;
        //printf("before read() \n");
      rd = read(fdr, yuv_stream, STREAM_WIDTH/NUM_OF_BITS_PER_BYTE);
      
      num_of_pixels_received = num_of_pixels_received + PIXEL_NUM_THAT_FITS_STREAM_WIDTH;
      //printf("num_of_pixels_received = %d\n\r", num_of_pixels_received);
      // For every five pixels (128 bits) received from hardware logic computation, print out the YUV values of all five pixels
      /*for(yuv_stream_index = 1; (yuv_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1)<(((image_width * image_height)-num_of_pixels_received)*NUM_OF_COMPONENTS_IN_A_PIXEL + 1); yuv_stream_index=yuv_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL)
      {
         printf("yuv_stream[%d] = %d\n", yuv_stream_index, fromlogic[((yuv_stream_index-1)/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_received].V );
         printf("yuv_stream[%d] = %d\n", yuv_stream_index+1, fromlogic[((yuv_stream_index-1)/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_received].U );
         printf("yuv_stream[%d] = %d\n", yuv_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1, fromlogic[((yuv_stream_index-1)/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_received].Y );

         //break; // just to test if there is actually something being read, or returned from hardware.
      }*/

      break; // just to test if there is actually something being read, or returned from hardware.
     }     
     else // the remaining pixels do not fill all five pixel slots for a 128-bit stream
     {
        yuv_stream = rd_buf + num_of_pixels_received;
        //printf("before read in else. \n");
      rd = read(fdr, yuv_stream, image_width * image_height - num_of_pixels_received);  //  is a partially filled 128-bit stream (with less than 5 pixels)
      //printf("break in else. \n");
      break; // finish receiving all (image_width * image_height) pixels
     }

      if ((rd < 0) && (errno == EINTR))
      continue;

      if (rd < 0) {
      perror("read() failed");
      exit(1);
      }

      if (rd == 0) {
      fprintf(stderr, "Reached read EOF!? Should never happen.\n");
      exit(0);
      }

      rd_donebytes += rd;
   }
   //printf("before for loop\n");
    for (unsigned int i = 0; i < (image_width * image_height); i++)  // check the perfomance of hardware with respect to software computation
   {
   #ifdef LOOPBACK
      if( (tologic[i].R != fromlogic[i].Y) ||
         (tologic[i].G != fromlogic[i].U) ||
         (tologic[i].B != fromlogic[i].V) )
   #elif RGB2YUV
       uint8_t expected_Y = rgb2yuv(tologic[i])->Y;
       uint8_t expected_U = rgb2yuv(tologic[i])->U;
       uint8_t expected_V = rgb2yuv(tologic[i])->V;

      if( (abs(expected_Y - fromlogic[i].Y) > 1) ||
          (abs(expected_U - fromlogic[i].U) > 1) ||
          (abs(expected_V - fromlogic[i].V) > 1) ) // rgb2yuv conversion hardware tolerance fails by more than 1 compared to software computation
   #endif
      {
         printf("********************************* Attention *************************************\n\r");
         printf("R:%d G:%d B:%d \n\r", tologic[i].R, tologic[i].G, tologic[i].B);
            printf("Y:%d U:%d V:%d \n\r", fromlogic[i].Y, fromlogic[i].U, fromlogic[i].V);
         printf("expected_Y:%d expected_U:%d expected_V:%d \n\r", expected_Y, expected_U, expected_V);

         break; // just for troubleshooting
         //exit(1);
      }
   }

    sleep(1); // Let debug output drain (if used)

    close(fdr);

    return 0;
  }
}


Please look at the ILA waveform trace at https://i.imgur.com/3wjyWSN.png . Note: This forum seems to limit the visible image dimension, cropping the part which the forum post could not accommodate

Image
kevin
 
Posts: 40
Joined: Tue Dec 12, 2017 10:41 am

Re: problem with user_w_write_128_wren

Postby kevin » Wed May 23, 2018 2:54 am

I am lucky this time. Disabling fork() makes everything work normally again; at least the input FIFO's data count now increments when it is supposed to.

May I know WHY ?
kevin
 
Posts: 40
Joined: Tue Dec 12, 2017 10:41 am

Re: problem with user_w_write_128_wren

Postby kevin » Thu May 24, 2018 8:58 am

Hi Mr Eli, I have fixed all the previous issues.

Now I am having a problem debugging the following Xillybus host C++ code, https://gist.github.com/promach/9d185d35a6e6db0da10992a19c36f754 , since it terminates prematurely without printing any error.

Can I step through fork() or pthread_join() in gdb? I mean with two separate processes or threads. I am not sure whether gdb will be of any help here. Could anyone advise?

Code: Select all
// g++ -g -pedantic -Wall -Werror -Wextra -fsanitize=address host.cpp -o host `pkg-config --cflags --libs opencv`

#include <opencv2/core/core.hpp>
//#include <opencv2/imgcodecs/imgcodecs.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <unistd.h>
#include <fcntl.h>
#include <iostream>
#include <fstream>      // std::ifstream, std::ofstream
#include <string>
#include <sys/wait.h>
#include <errno.h>
#include <cmath>

using namespace cv;
using namespace std;

// Test mode selection: the verification loop in main() compares the received
// data either with the data sent (LOOPBACK) or with the software rgb2yuv() result.
//#define LOOPBACK 1
#define RGB2YUV 1

// Dimensions of the loaded image in pixels; set in main() once the image has been read
unsigned int image_width;
unsigned int image_height;

// Number of colour channels and the index of each channel in the array filled by split().
// Red is index 2 and blue is index 0, i.e. the B, G, R channel order of an OpenCV colour image.
const unsigned int CHNL_NUM = 3;
const unsigned int RED_CHNL = 2;
const unsigned int GREEN_CHNL = 1;
const unsigned int BLUE_CHNL = 0;

// Width of one Xillybus stream word in bits; STREAM_WIDTH/NUM_OF_BITS_PER_BYTE = 16 bytes per word
const unsigned int STREAM_WIDTH = 128;
const unsigned int NUM_OF_BITS_PER_BYTE = 8;

// Number of bits occupied by each colour component ([R, G, B] on input, [Y, U, V] on output):
// every component is an 8-bit unsigned integer.
// See https://docs.microsoft.com/en-us/windows-hardware/drivers/display/yuv-rgb-data-range-conversions
const unsigned int PIXEL_VALUE_RANGE = 8;
// Components per pixel -- input: [R, G, B]     output: [Y, U, V]
const unsigned int NUM_OF_COMPONENTS_IN_A_PIXEL = 3;

// A 128-bit stream word holds at most 5 pixels:
// PIXEL_NUM_THAT_FITS_STREAM_WIDTH * NUM_OF_COMPONENTS_IN_A_PIXEL * PIXEL_VALUE_RANGE = 5 * 3 * 8 = 120 bits,
// which leaves the remaining 8 bits of the word unused.
const unsigned int PIXEL_NUM_THAT_FITS_STREAM_WIDTH = 5;

// One input pixel as sent to the logic: three 8-bit colour components, in R, G, B member order.
struct RGB_packet {
  uint8_t R;  // red
  uint8_t G;  // green
  uint8_t B;  // blue
};

// One output pixel as received from the logic: three 8-bit components, in Y, U, V member order.
struct YUV_packet {
  uint8_t Y;  // luma
  uint8_t U;  // first chroma component
  uint8_t V;  // second chroma component
};


struct YUV_packet* rgb2yuv(struct RGB_packet rgb_input)  // convert rgb to yuv

   unsigned char R = rgb_input.R;
   unsigned char G = rgb_input.G;
   unsigned char B = rgb_input.B;

   int Y_temp, U_temp, V_temp;

   struct YUV_packet *yuv_result = (YUV_packet *)malloc(image_width * image_height * sizeof(unsigned char) * NUM_OF_COMPONENTS_IN_A_PIXEL);

   // https://en.wikipedia.org/wiki/YUV#Full_swing_for_BT.601
   Y_temp =  77*R + 150*G +  29*B;
   U_temp = -43*R -  84*G + 127*B;
   V_temp = 127*R - 106*G -  21*B;

   Y_temp = (Y_temp + 128) >> 8;
   U_temp = (U_temp + 128) >> 8;
   V_temp = (V_temp + 128) >> 8;

   yuv_result->Y = Y_temp;
   yuv_result->U = U_temp + 128;
   yuv_result->V = V_temp + 128;

   return yuv_result;
}

/*
   write() is allowed to accept only part of the buffer, so allwrite()
   keeps calling it until every one of the len bytes in buf has been
   handed to fd.

   An EINTR failure is simply retried, which is what lets the process be
   suspended with CTRL-Z and resumed without losing data. Any other
   failure, or a write() that returns 0, terminates the process.

   Returns the number of bytes written; since all failures exit, this is
   len whenever the function returns (0 if len <= 0).
*/

int allwrite(int fd, unsigned char *buf, int len) {
  int done = 0;

  for (;;) {
    if (done >= len)
      break;

    const int n = write(fd, buf + done, len - done);

    if (n > 0) {
      done += n;
      continue;
    }

    if (n == 0) {
      fprintf(stderr, "Reached write EOF (?!)\n");
      exit(1);
    }

    // n < 0 from here on
    if (errno == EINTR)
      continue;

    perror("allwrite() failed to write");
    exit(1);
  }

  return done;
}

int main(int argc, char *argv[]) {

  int fdr, fdw, rd, wr, rd_donebytes, wr_donebytes;
  uint8_t *wr_buf, *rd_buf;
  pid_t pid;
  struct RGB_packet *tologic;
  struct YUV_packet *fromlogic;

  fdr = open("/dev/xillybus_read_128", O_RDONLY);  // will change to /dev/xillybus_read_128
  fdw = open("/dev/xillybus_write_128", O_WRONLY); // will change to /dev/xillybus_write_128

  if ((fdr < 0) || (fdw < 0)) {
    perror("Failed to open Xillybus device file(s)");
    exit(1);
  }

  // READ in an image file

  String imageName( "lena512color.tiff" ); // by default

  if( argc > 1)
  {
   imageName = argv[1];
  }

  Mat image;

  image = imread( imageName, IMREAD_COLOR ); // Read the file

  if( image.empty() )                      // Check for invalid input
  {
   cout <<  "Could not open or find the image" << std::endl ;
   return -1;
  }

  else
  {
   image_width = image.size().width;
   image_height = image.size().height;
  }

  namedWindow( "Original Image", WINDOW_AUTOSIZE );
  imshow( "Original Image", image );
   
  Mat rgbchannel[CHNL_NUM];
  // The actual splitting.
  split(image, rgbchannel);
   
  namedWindow("Red", WINDOW_AUTOSIZE);
  imshow("Red", rgbchannel[RED_CHNL]);
   
  namedWindow("Green", WINDOW_AUTOSIZE);
  imshow("Green", rgbchannel[GREEN_CHNL]);
   
  namedWindow("Blue", WINDOW_AUTOSIZE);
  imshow("Blue", rgbchannel[BLUE_CHNL]);

  waitKey(0);  // see all three split channels before feeding in the channel data to xillybus/RIFFA for hardware computation

  vector<RGB_packet> vTo(image_width * image_height);  // lena.tiff is sized as 3*512*512

  tologic = vTo.data();

  if (!tologic) {
   fprintf(stderr, "Failed to allocate memory\n");
   exit(1);
  }

  for(unsigned int pixel_index = 0; pixel_index < (image_width * image_height); pixel_index++)
  {
   tologic[pixel_index].R = *(rgbchannel[RED_CHNL].data + pixel_index);
   tologic[pixel_index].G = *(rgbchannel[GREEN_CHNL].data + pixel_index);
   tologic[pixel_index].B = *(rgbchannel[BLUE_CHNL].data + pixel_index);
  }

  pid = fork();

  if (pid < 0) {
    perror("Failed to fork()");
    exit(1);
  }

  if (pid) {
   close(fdr);

    wr_donebytes = 0;  // this variable includes the empty 8 bits for the MSB
   unsigned int num_of_pixels_sent = 0; // this is actual pixels number already sent, does not include the empty 8 bits
   //unsigned int if_index = 0;
   unsigned int rgb_stream_index = 0;
   uint8_t rgb_stream[STREAM_WIDTH/NUM_OF_BITS_PER_BYTE + 1];  // could accomodate 5 pixels

   while (num_of_pixels_sent < image_width * image_height)
   {
     if(((image_width * image_height)-num_of_pixels_sent) >= PIXEL_NUM_THAT_FITS_STREAM_WIDTH)
     {
      // arrange the five pixels in the format as in https://i.imgur.com/mdJwk7J.png
      //if_index++; //printf("if_index = %d\n\r", if_index);
      //if(if_index == 3) break;

      for(rgb_stream_index = 0; (rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1)<((STREAM_WIDTH/NUM_OF_BITS_PER_BYTE)-1); rgb_stream_index=rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL)
      {
         rgb_stream[rgb_stream_index] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].R;
         rgb_stream[rgb_stream_index+1] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].G;
         rgb_stream[rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].B;
      }

      rgb_stream[STREAM_WIDTH/NUM_OF_BITS_PER_BYTE] = '\0';  // however, this NULL character is not sent across write()
      rgb_stream[(STREAM_WIDTH/NUM_OF_BITS_PER_BYTE)-1] = 0;  // remember that the eight most significant bits of the 128-bits stream are ignored by hardware logic

      /*for(unsigned int j=0; j<(STREAM_WIDTH/NUM_OF_BITS_PER_BYTE); j++)      
      {
         printf("rgb_stream[%d] = %d\n\r", j, rgb_stream[j]); //break;
      }*/

      wr_buf = rgb_stream; // write() writes wr_buf in first-in-first-out order, so rgb_stream[0] will be written first into fdw as the least significant byte

        wr = allwrite(fdw, wr_buf, STREAM_WIDTH/NUM_OF_BITS_PER_BYTE);  // this write() is 128-bits or 16 bytes which include the empty MSB 8 bits
      //printf("wr = %d\n", wr);
      num_of_pixels_sent = num_of_pixels_sent + PIXEL_NUM_THAT_FITS_STREAM_WIDTH;      
      //printf("num_of_pixels_sent = %d\n", num_of_pixels_sent);
     }

     else  // the remaining pixels do not fill all five pixel slots for a 128-bit stream
     {
      for(rgb_stream_index = 0; (rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1)<(((image_width * image_height)-num_of_pixels_sent)*NUM_OF_COMPONENTS_IN_A_PIXEL); rgb_stream_index=rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL)
      {
         rgb_stream[rgb_stream_index] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].R;
         rgb_stream[rgb_stream_index+1] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].G;
         rgb_stream[rgb_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1] = tologic[(rgb_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_sent].B;
      }            

      rgb_stream[((image_width * image_height)-num_of_pixels_sent)*NUM_OF_COMPONENTS_IN_A_PIXEL + 1] = '\0';  // however, this NULL character is not sent across write()
      rgb_stream[((image_width * image_height)-num_of_pixels_sent)*NUM_OF_COMPONENTS_IN_A_PIXEL] = 0;  // remember that the eight most significant bits of the 128-bits stream are ignored by hardware logic

      /*for(unsigned int j=0; j<(((image_width * image_height)-num_of_pixels_sent)*NUM_OF_COMPONENTS_IN_A_PIXEL+1); j++)      
      {
         printf("rgb_stream[%d] = %d\n\r", j, rgb_stream[j]);
      }*/
      
      wr_buf = rgb_stream;  // this is a partially filled 128-bit stream (with less than 5 pixels)

      wr = allwrite(fdw, wr_buf, ((image_width * image_height)-num_of_pixels_sent)*NUM_OF_COMPONENTS_IN_A_PIXEL+1);
      //write(fdw, NULL, 0); // flush the write stream
      //printf("wr = %d\n", wr);
      break;  // finish sending all (image_width * image_height) pixels
     }

     if ((wr < 0) && (errno == EINTR))
      continue;

     if (wr <= 0) {
      perror("write() failed");
      exit(1);
     }

     wr_donebytes += wr;
   }

   //sleep(1); // Let debug output drain (if used)

   close(fdw);

   //return 0;
  }

  else {
    close(fdw);

   vector<YUV_packet> vFrom(image_width * image_height);

    fromlogic = vFrom.data();
   //printf("fromlogic[0].Y = %p \n", &fromlogic[0].Y);
    if (!fromlogic) {
      fprintf(stderr, "Failed to allocate memory\n");
      exit(1);
    }

    //rd_buf = (uint8_t *) fromlogic; // yuv_stream
    rd_donebytes = 0;  // this variable includes the empty 8 bits for the MSB

   unsigned int num_of_pixels_received = 0; // this is actual pixels number already received, does not include the empty 8 bits
   unsigned int yuv_stream_index = 0;
   uint8_t yuv_stream[STREAM_WIDTH/NUM_OF_BITS_PER_BYTE + 1];  // could accomodate 5 pixels

    while (num_of_pixels_received < image_width * image_height) {

     if(((image_width * image_height)-num_of_pixels_received) >= PIXEL_NUM_THAT_FITS_STREAM_WIDTH)
     {
        //printf("before read() \n");
      rd_buf = yuv_stream;
      rd = read(fdr, rd_buf, STREAM_WIDTH/NUM_OF_BITS_PER_BYTE);

      // For every five pixels (128 bits) received from hardware logic computation, print out the YUV values of all five pixels
      for(yuv_stream_index = 0; yuv_stream_index<STREAM_WIDTH/NUM_OF_BITS_PER_BYTE; yuv_stream_index=yuv_stream_index+1)
      {
         printf("yuv_stream[%d] = %d\n", yuv_stream_index, yuv_stream[yuv_stream_index]);
      }
      yuv_stream[STREAM_WIDTH/NUM_OF_BITS_PER_BYTE] = '\0';  // this NULL character is only to act as "stop bit" for character array

      //if(num_of_pixels_received == 300) break; // just to test if there is actually something being read, or returned from hardware

      // store the calculated output YUV pixels into fromlogic such that we could reconstruct the image in its original dimension for visual display
      for(yuv_stream_index = 0; (yuv_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1)<(STREAM_WIDTH/NUM_OF_BITS_PER_BYTE); yuv_stream_index=yuv_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL)
      {
         fromlogic[(yuv_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_received].Y = yuv_stream[yuv_stream_index];
         fromlogic[(yuv_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_received].U = yuv_stream[yuv_stream_index+1];
         fromlogic[(yuv_stream_index/NUM_OF_COMPONENTS_IN_A_PIXEL)+num_of_pixels_received].V = yuv_stream[yuv_stream_index+NUM_OF_COMPONENTS_IN_A_PIXEL-1];
      }

      num_of_pixels_received = num_of_pixels_received + PIXEL_NUM_THAT_FITS_STREAM_WIDTH;
      printf("num_of_pixels_received = %d\n\r", num_of_pixels_received);
     }     
     else // the remaining pixels do not fill all five pixel slots for a 128-bit stream
     {
        rd_buf = yuv_stream;
        printf("before read in else. \n");
      rd = read(fdr, rd_buf, image_width * image_height - num_of_pixels_received);  //  is a partially filled 128-bit stream (with less than 5 pixels)
      //printf("break in else. \n");
      break; // finish receiving all (image_width * image_height) pixels
     }

      if ((rd < 0) && (errno == EINTR))
      continue;

      if (rd < 0) {
      perror("read() failed");
      exit(1);
      }

      if (rd == 0) {
      fprintf(stderr, "Reached read EOF!? Should never happen.\n");
      exit(0);
      }

      rd_donebytes += rd;
   }
   printf("before for loop\n");
    for (unsigned int i = 0; i < (image_width * image_height); i++)  // check the perfomance of hardware with respect to software computation
   {
   #ifdef LOOPBACK
      if( (tologic[i].R != fromlogic[i].Y) ||
         (tologic[i].G != fromlogic[i].U) ||
         (tologic[i].B != fromlogic[i].V) )
   #elif RGB2YUV
       uint8_t expected_Y = rgb2yuv(tologic[i])->Y;
       uint8_t expected_U = rgb2yuv(tologic[i])->U;
       uint8_t expected_V = rgb2yuv(tologic[i])->V;

      if( (abs(expected_Y - fromlogic[i].Y) > 1) ||
          (abs(expected_U - fromlogic[i].U) > 1) ||
          (abs(expected_V - fromlogic[i].V) > 1) ) // rgb2yuv conversion hardware tolerance fails by more than 1 compared to software computation
   #endif
      {
         printf("********************************* Attention *************************************\n\r");
         printf("R:%d G:%d B:%d \n\r", tologic[i].R, tologic[i].G, tologic[i].B);
            printf("Y:%d U:%d V:%d \n\r", fromlogic[i].Y, fromlogic[i].U, fromlogic[i].V);
         printf("expected_Y:%d expected_U:%d expected_V:%d \n\r", expected_Y, expected_U, expected_V);

         break; // just for troubleshooting
         //exit(1);
      }
   }

    sleep(1); // Let debug output drain (if used)

    close(fdr);

    return 0;
  }
}
kevin
 
Posts: 40
Joined: Tue Dec 12, 2017 10:41 am

Re: problem with user_w_write_128_wren

Postby kevin » Thu May 24, 2018 9:44 am

What does it mean if read() does not return any value and the program terminates on its own?

num_of_pixels_received = 140
before read()
after read()
num_of_pixels_received = 145
before read()
after read()
num_of_pixels_received = 150
before read()
after read()
num_of_pixels_received = 155
before read()
after read()
num_of_pixels_received = 160
before read()
after read()
num_of_pixels_received = 165
before read()
[Thread 0x7fffe8990700 (LWP 19198) exited]
[Thread 0x7ffff7f1b340 (LWP 19194) exited]
[Inferior 1 (process 19194) exited normally]
(gdb)
kevin
 
Posts: 40
Joined: Tue Dec 12, 2017 10:41 am

Re: problem with user_w_write_128_wren

Postby kevin » Fri May 25, 2018 3:31 am

This looks like a synchronization problem: the writing process terminates while the reading process is still running in the background.
In other words, this is a problem of achieving synchronization in full-duplex C++ code.

Someone suggested the following to me :
If the pattern is "write, then wait for FPGA, then read results" you don't really need 2 or more CPUs (and don't really need 2 or more threads or processes)


So, there are three possible solutions to this problem.

1) Use only one thread/process, but send all (image_height * image_width * NUM_OF_COMPONENTS_IN_A_PIXEL) bytes with one single write() call, since Xillybus gives us the ability to do seamless DMA transfers.

2) With fork(), have the writing half of the program wait for the child before it exits.

3) With pthreads, also use a single write() call.

Am I missing anything important ?

I have not finished understanding http://xillybus.com/downloads/doc/xillybus_host_programming_guide_linux.pdf#page=43

It appears to me from page 40 that the Xillybus intermediate FIFO could easily overflow with such a huge number of bytes from a single write() call.

Could anyone comment ?
kevin
 
Posts: 40
Joined: Tue Dec 12, 2017 10:41 am

Re: problem with user_w_write_128_wren

Postby support » Fri May 25, 2018 7:15 am

Hello,

Please refer to section 6.6 of the Xillybus host application programming guide for Linux, which you've referred to above. I don't think there is any useful information in the Appendix ("Internals" part) in this context. It's all about getting the application code properly written, not looking for peculiarities.

In particular, a write() call will not make any FIFO overflow in any case, assuming that the FIFO is connected correctly to the Xillybus IP core. Or more specifically, the FIFO's full port is connected right. As shown in the demo code. This is quite trivial.

As said in section 6.6, there should be one process (or thread) for reading and one for writing. It's quite expected that the write process will finish before the read process, in particular in the presence of large DMA buffers. It is recommended to flush the write buffer with a zero-length write for a synchronization effect before quitting, but even so, the FPGA still needs to do something with the data, and then the read process receives it. So it always finishes later.

Besides, if you don't have any debug output from the FPGA, there's no point for the sleep() calls.

Regards,
Eli
support
 
Posts: 613
Joined: Tue Apr 24, 2012 3:46 pm

Next

Return to Xillybus

cron