Hello and thank you for signaling this issue!
Would you be able to provide us with your test file, as well as with the build and run commands (or the Makefile) that you used?
Hi,
Sure.
The file structure is shown below. The test.py is in the host folder, and aie.mlir is in the kernel folder.
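Roughly, the project is laid out like this (just a sketch of the structure; the Makefile and CMakeLists.txt sit at the project root):

.
├── Makefile
├── CMakeLists.txt
├── host/
│   ├── host.cpp
│   └── test.py
└── kernel/
    ├── aie.mlir
    └── (kernel .cc / .h sources)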
test.py:
import torch
import torch.nn as nn
import sys
import math
from aie.utils.ml import DataShaper
import time
import os
import numpy as np
from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute
import aie.utils.test as test_utils
import einops
torch.use_deterministic_algorithms(True)
torch.manual_seed(0)
def main(opts):
    design = "repeat_buffer"
    xclbin_path = opts.xclbin
    insts_path = opts.instr

    log_folder = "log/"
    if not os.path.exists(log_folder):
        os.makedirs(log_folder)

    num_iter = 1
    npu_time_total = 0
    npu_time_min = 9999999
    npu_time_max = 0

    dtype_in = np.dtype("int8")
    dtype_out = np.dtype("int8")

    shape_in_act = (8,)
    shape_out = (8, 8)

    # ------------------------------------------------------
    # Initialize activation, weights, scaling factor for int8 model
    # ------------------------------------------------------
    int_inp = torch.randint(1, 10, (8,)).type(torch.FloatTensor)

    # ------------------------------------------------------
    # Get device, load the xclbin & kernel and register them
    # ------------------------------------------------------
    app = setup_aie(
        xclbin_path,
        insts_path,
        shape_in_act,
        dtype_in,
        shape_in_act,
        dtype_in,
        shape_out,
        dtype_out,
    )

    # ------------------------------------------------------
    # Reorder input data-layout
    # ------------------------------------------------------
    h = 8
    golden_input = einops.repeat(int_inp, "h -> d h", d=8).clone().contiguous()
    before_input = int_inp.squeeze().data.numpy().astype(dtype_in)
    before_input.tofile(
        log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d"
    )
    print(before_input)

    # ------------------------------------------------------
    # Main run loop
    # ------------------------------------------------------
    for i in range(num_iter):
        start = time.time_ns()
        aie_output = execute(app, before_input, before_input)
        stop = time.time_ns()
        npu_time = stop - start
        npu_time_total = npu_time_total + npu_time

    # ------------------------------------------------------
    # Reorder output data-layout
    # ------------------------------------------------------
    temp_out = aie_output.reshape(8, 8)
    ofm_mem_fmt = temp_out
    ofm_mem_fmt.tofile(
        log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d"
    )
    ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0)
    print(aie_output)

    # ------------------------------------------------------
    # Compare the AIE output and the golden reference
    # ------------------------------------------------------
    print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000)))

    if np.allclose(
        ofm_mem_fmt_out,
        golden_input,
        rtol=0,
        atol=0,
    ):
        print("\nPASS!\n")
        exit(0)
    else:
        print("\nFailed.\n")
        exit(-1)


if __name__ == "__main__":
    p = test_utils.create_default_argparser()
    opts = p.parse_args(sys.argv[1:])
    main(opts)
Makefile
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# Copyright (C) 2024, Advanced Micro Devices, Inc.
SRCDIR := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
VITIS_ROOT ?= $(shell realpath $(dir $(shell which vitis))/../)
VITIS_AIETOOLS_DIR ?= ${VITIS_ROOT}/aietools
VITIS_AIE_INCLUDE_DIR ?= ${VITIS_ROOT}/aietools/data/versal_prod/lib
VITIS_AIE2_INCLUDE_DIR ?= ${VITIS_ROOT}/aietools/data/aie_ml/lib
CHESSCC1_FLAGS = -f -p me -P ${VITIS_AIE_INCLUDE_DIR} -I ${VITIS_AIETOOLS_DIR}/include
CHESSCC2_FLAGS = -f -p me -P ${VITIS_AIE2_INCLUDE_DIR} -I ${VITIS_AIETOOLS_DIR}/include -D__AIENGINE__=2 -D__AIEARCH__=20
CHESS_FLAGS = -P ${VITIS_AIE_INCLUDE_DIR}
CHESSCCWRAP1_FLAGS = aie -I ${VITIS_AIETOOLS_DIR}/include
CHESSCCWRAP2_FLAGS = aie2 -I ${VITIS_AIETOOLS_DIR}/include
trace_size = 8192
HOST_O_DIR := build/host
HOST_C_TARGET := host.exe
KERNEL_O_DIR := build/bitstream
KERNEL_SRCS := $(wildcard $(SRCDIR)/kernel/*.cc)
KERNEL_OBJS := $(patsubst $(SRCDIR)/kernel/%.cc, ${KERNEL_O_DIR}/%.o, $(KERNEL_SRCS))
KERNEL_HEADERS := $(wildcard $(SRCDIR)/kernel/*.h)
MLIR_O_DIR := kernel
MLIR_TARGET := ${MLIR_O_DIR}/aie.mlir
BITSTREAM_O_DIR := build/bitstream
XCLBIN_TARGET := ${BITSTREAM_O_DIR}/final.xclbin
INSTS_TARGET := ${BITSTREAM_O_DIR}/insts.txt
.PHONY: all kernel link bitstream host clean

all: ${XCLBIN_TARGET} ${INSTS_TARGET} ${HOST_C_TARGET}

clean:
	-@rm -rf build
	-@rm -rf log

kernel: ${KERNEL_OBJS}

link: ${MLIR_TARGET}

bitstream: ${XCLBIN_TARGET}

host: ${HOST_C_TARGET}

# Build host
${HOST_C_TARGET}: ${SRCDIR}/host/host.cpp
	rm -rf ${HOST_O_DIR}
	mkdir -p ${HOST_O_DIR}
	cd ${HOST_O_DIR} && cmake -E env CXXFLAGS="-std=c++23 -ggdb" cmake ../.. -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 -DTARGET_NAME=${HOST_C_TARGET} -Dsubdir=${subdir}
	cd ${HOST_O_DIR} && cmake --build . --config Release
	cp ${HOST_O_DIR}/${HOST_C_TARGET} ./

# Build kernels
${KERNEL_O_DIR}/%.o: ${SRCDIR}/kernel/%.cc ${KERNEL_HEADERS}
	mkdir -p ${@D}
	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -DINT8_ACT -c $< -o ${@F}

# Build bitstream
${XCLBIN_TARGET}: ${MLIR_TARGET}
	mkdir -p ${@D}
	cd ${BITSTREAM_O_DIR} && aiecc.py --aie-generate-cdo --no-compile-host --basic-alloc-scheme --xclbin-name=${@F} \
		--aie-generate-npu --npu-insts-name=${INSTS_TARGET:${BITSTREAM_O_DIR}/%=%} $(<:${MLIR_O_DIR}/%=../../kernel/%)

.PHONY: run
run: ${HOST_C_TARGET} ${XCLBIN_TARGET} ${INSTS_TARGET} #sign
	export XRT_HACK_UNSECURE_LOADING_XCLBIN=1 && \
	./$< -x ${XCLBIN_TARGET} -i ${INSTS_TARGET} -k MLIR_AIE

trace: ${HOST_C_TARGET} ${XCLBIN_TARGET} ${INSTS_TARGET} # sign
	export XRT_HACK_UNSECURE_LOADING_XCLBIN=1 && \
	./$< -x ${XCLBIN_TARGET} -i ${INSTS_TARGET} -k MLIR_AIE -t ${trace_size}
	./parse_trace.py --filename trace.txt --mlir ${MLIR_TARGET} --colshift 1 > trace_mm.json

run_py: ${XCLBIN_TARGET} ${INSTS_TARGET} ${SRCDIR}/host/test.py
	python3 ${SRCDIR}/host/test.py -x ${<} -i ${INSTS_TARGET} -k MLIR_AIE
CMakeLists.txt
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# Copyright (C) 2024, Advanced Micro Devices, Inc.
# parameters
# -DBOOST_ROOT: Path to Boost install
# -DOpenCV_DIR: Path to OpenCV install
# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
# -DTARGET_NAME: Target name to be built
# cmake needs this line
cmake_minimum_required(VERSION 3.1)
find_program(WSL NAMES powershell.exe)
if (NOT WSL)
set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
set(OpenCV_DIR /usr/include/opencv4 CACHE STRING "Path to OpenCV install")
set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
else()
set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
set(OpenCV_DIR C:/Technical/thirdParty/opencv/build CACHE STRING "Path to OpenCV install")
set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
endif ()
set(EDGEDETECT_WIDTH 1920 CACHE STRING "image width")
set(EDGEDETECT_HEIGHT 1080 CACHE STRING "image height")
set(TARGET_NAME test CACHE STRING "Target to be built")
SET (ProjectName ${TARGET_NAME})
SET (currentTarget ${TARGET_NAME})
if ( WSL )
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
endif ()
project(${ProjectName})
# Find packages
find_package(Boost REQUIRED)
find_package(OpenCV REQUIRED)
message("opencv library paht: ${OpenCV_LIB_PATH}")
message("opencv libs: ${OpenCV_LIBS}")
add_executable(${currentTarget}
/home/nock8/mlir-aie/programming_examples/utils/OpenCVUtils.cpp
/home/nock8/mlir-aie/runtime_lib/test_lib/test_utils.cpp
host/host.cpp
)
target_compile_definitions(${currentTarget} PUBLIC
EDGEDETECT_WIDTH=${EDGEDETECT_WIDTH}
EDGEDETECT_HEIGHT=${EDGEDETECT_HEIGHT}
DISABLE_ABI_CHECK=1
)
target_include_directories (${currentTarget} PUBLIC
/home/nock8/mlir-aie/programming_examples/utils
/home/nock8/mlir-aie/runtime_lib/test_lib
${XRT_INC_DIR}
${OpenCV_INCLUDE_DIRS}
${Boost_INCLUDE_DIRS}
)
target_link_directories(${currentTarget} PUBLIC
${XRT_LIB_DIR}
${OpenCV_LIB_PATH}
${Boost_LIBRARY_DIRS}
)
if (NOT WSL)
target_link_libraries(${currentTarget} PUBLIC
xrt_coreutil
${OpenCV_LIBS}
boost_program_options
boost_filesystem
)
else()
target_link_libraries(${currentTarget} PUBLIC
xrt_coreutil
${OpenCV_LIBS}
)
endif()
make bitstream generates the xclbin file, and make run_py runs the test; the exact commands are spelled out below. Thank you very much!
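To restate those two Makefile targets as the commands I actually run (nothing beyond what is in the Makefile above):

make bitstream   # produces build/bitstream/final.xclbin and insts.txt via aiecc.py
make run_py      # runs python3 host/test.py -x <final.xclbin> -i <insts.txt> -k MLIR_AIE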
Hello again! Thank you for your patience.
I tried running your design locally on the most recent version of the main branch with the files that you provided, and the test passes for me with the expected output. The only thing I changed was in the MLIR source file: on lines 38 and 39, I changed the strides to [0, 0, 0, 1]:
aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 16][0, 0, 0, 1]) {id = 0 : i64, metadata = @out0} : memref<16xi32>
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][8, 1, 1, 2][0, 0, 0, 1]) {id = 1 : i64, metadata = @in0} : memref<2xi32>
This is also the expected format when writing the design using python bindings that will then be lowered to MLIR. Does this change work for you as well?
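For intuition, here is a rough host-side sketch (my own emulation, not part of the tooling) of how I read the in0 access pattern above, assuming the four sizes/strides entries are listed from the outermost to the innermost dimension and are counted in i32 words: the outer size of 8 with stride 0 replays the same 2-word input eight times, which is exactly the 16-word output the test expects.

import numpy as np

# Hypothetical emulation of the [sizes][strides] addressing used by
# aiex.npu.dma_memcpy_nd above (assumed outermost -> innermost, in i32 words).
def emulate_dma(src_words, sizes, strides):
    out = []
    for i3 in range(sizes[0]):
        for i2 in range(sizes[1]):
            for i1 in range(sizes[2]):
                for i0 in range(sizes[3]):
                    off = (i3 * strides[0] + i2 * strides[1]
                           + i1 * strides[2] + i0 * strides[3])
                    out.append(src_words[off])
    return np.array(out, dtype=src_words.dtype)

inp = np.arange(1, 9, dtype=np.int8)     # the 8 int8 activations
words = inp.view(np.int32)               # the memref<2xi32> view of that buffer
streamed = emulate_dma(words, [8, 1, 1, 2], [0, 0, 0, 1])
assert streamed.size == 16                                  # one memref<16xi32> worth of data
assert (streamed.view(np.int8).reshape(8, 8) == inp).all()  # 8 identical rows, as the test expects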
Please let us know if this issue persists!
Hi,
Thank you very much for your help.
I tried it, and it gives me an 'aiex.npu.dma_memcpy_nd' op attribute 'static_strides' failed to satisfy constraint: i64 dense array attribute with exactly 3 elements error.
So, I think I need to update my version.
Thank you very much!
It does indeed look like a version issue; I had the inverse problem:
'aiex.npu.dma_memcpy_nd' op attribute 'static_strides' failed to satisfy constraint: i64 dense array attribute with exactly 4 elements
In other words, your build still expects three stride elements while the current main branch expects four, so updating should resolve it.
lol, thank you very much.
I will update the version and try it again.
Hello again,
It works, thank you very much!
By the way, I found that in the old version it took around 1 second to run the resnet design under programming_examples/ml/resnet. Now it only takes around 4 ms to run the same project.
May I ask how this was achieved, and how should I interpret the time measurement?
Hello again,
I believe this was the result of a number of fixes and improvements on the XRT side, and not because of anything that we changed in this repo.
If everything is now working on your end, is this issue alright to close?
Hi,
Yes, thank you very much!
Hi team,
My purpose is to repeat a memref<8xi8> eight times to form a memref<64xi8> on chip with an objectfifo. I wrote the MLIR code shown in the following.
In this code, %c8 is the repeat count. When I compile it, it gives me a "Worker Segmentation fault" error. I tried reducing the repeat count to 2 by replacing %c8 with %c2, and it works. The result is shown below.
May I ask if this is a bug in the compiler?